In [18]:
import pandas as pd
import numpy as np
import math
import os

In [19]:
def construct_indu_index_mapping(df):
    """
    Construct a dictionary with
    key: industry code
    value: indexes of all reports in the dataframe
    """
    industries_to_index = {}
    industries = df["ggroup"].dropna().astype(int).unique()
    industries = industries.tolist()
    quarters = (df["year"].astype("str") + " q" + df["quarter"].astype("str")).unique()
    for i in range(df.shape[0]):
        row = df.iloc[i, :]
        if math.isnan(row["ggroup"]):
            continue
        industries_to_index[int(row["ggroup"])] = industries_to_index.get(int(row["ggroup"]), set())
        industries_to_index[int(row["ggroup"])].add(i)
    return industries_to_index

In [20]:
def construct_quar_index_mapping(df):
    """
    Construct a dictionary with
    key: quarter
    value: indexes of all reports in the dataframe
    """
    quarters = (df["year"].astype("str") + " q" + df["quarter"].astype("str")).unique()
    quarter_to_index = {}
    for i in range(df.shape[0]):
        row = df.iloc[i, :]
        quarter = row["year"].astype("str") + " q" + row["quarter"].astype("str")
        quarter_to_index[quarter] = quarter_to_index.get(quarter, set())
        quarter_to_index[quarter].add(i)
    return quarter_to_index

In [21]:
def construct_analyst_index_mapping(df, all_files_dcns):
    """
    Construct a dictionary with
    key: analyst
    value: indexes of all reports in the dataframe with the given DCNs(unique identification code for the reports)
    """
    analyst_to_index = {}
    for i, (_, dcn) in enumerate(all_files_dcns):
        analyst = max(df[df["DCN"] == dcn]["Analyst"])
        if not analyst is np.nan:
            analyst_to_index[analyst] = analyst_to_index.get(analyst, []) + [i]
    return analyst_to_index

In [34]:
def get_all_companies(df, indexes):
    """
    Return the set of companies in the dataframe with the given indexes
    """
    raw_companies = df.iloc[list(indexes), 4].unique()
    all_companies = set()
    for item in raw_companies:
        l = item.split(",")
        for company in l:
            all_companies.add(company.strip(" ").strip("^L19"))
    return all_companies

In [23]:
def get_company_files(target_dcns, company):
    """
    Return a list of tuples that contains file paths and DCNs of all reports with the target DCNs
    """
    directory = r".\PDFParsing\parsed_files"
    files = []
    temp = os.path.join(directory, company)
    list_files = os.listdir(temp)
    for item in list_files:
        l = item.split("-")
        dcn = l[-1].rstrip(".txt")
        while dcn and not dcn[-1].isdigit():
            dcn = dcn[:-1]
        while dcn and not dcn[0].isdigit():
            dcn = dcn[1:]
        if dcn:
            dcn = int(dcn)
        else:
            continue
        if dcn in target_dcns:
            files.append((os.path.join(temp, item), dcn))
    return files

<h2> This is an example run on industry code 2030 and Q4 of 2018. We will first find all reports with industry code 2030 and Q4 of 2018 and group them by analyst</h2>

In [24]:
df = pd.read_csv("metadata_reports_noduplicates_with_industry.csv")

In [25]:
industries_to_index = construct_indu_index_mapping(df)

In [26]:
quarter_to_index = construct_quar_index_mapping(df)

In [28]:
# We now have a list of indexes of reports with industry code 2030 and Q4 of 2018
indexes = industries_to_index[2030].intersection(quarter_to_index["2018 q4"])

In [54]:
# DCN is the unique identification code for the reports
dcns = set(df.iloc[list(indexes), :]["DCN"])

In [55]:
df.iloc[list(indexes), :].groupby('TICKER')["DCN"].count().reset_index()['TICKER'].tolist()

['AAL',
 'ALK',
 'CHRW',
 'CSX',
 'DAL',
 'EXPD',
 'FDX',
 'JBHT',
 'KSU',
 'LUV',
 'NSC',
 'ODFL',
 'UAL',
 'UNP',
 'UPS']

Unnamed: 0,Available,Date,Doc Grp,Info,Prim. Ticker,Second. Ticker,RI,Title,Pages,Price,...,TICKER,PrimaryTicker,SecondaryTicker,Match_Ticker,GenderAnalyst,year,quarter,PERMNO,gsector,ggroup
0,2018-11-02,2018-10-26,Inv.,pdf,AAL.OQ,,"Estimate Increase, TargetPrice Increase","3Q18 Recap: Maintaining Our Rating, Raising Ou...",9.0,92.0,...,AAL,['AAL.OQ'],['nan'],1,male,2018,4,21020.0,20.0,2030.0
1,2018-11-25,2018-10-26,Inv.,pdf,AAL.OQ,,Estimate Increase,Morningstar | Raising Our FVE After American S...,13.0,69.0,...,AAL,['AAL.OQ'],['nan'],1,male,2018,4,21020.0,20.0,2030.0
2,2018-11-25,2018-10-26,Inv.,pdf,AAL.OQ,,"Estimate Revision, TargetPrice Increase",Morningstar | Rising Fuel Costs Raise American...,25.0,195.5,...,AAL,['AAL.OQ'],['nan'],1,male,2018,4,21020.0,20.0,2030.0
3,2018-11-02,2018-10-26,Inv.,pdf,AAL.OQ,,Estimate Increase,3Q18 Recap; Strategic Initiatives Aimed At Dri...,5.0,80.0,...,AAL,['AAL.OQ'],['nan'],1,male,2018,4,21020.0,20.0,2030.0
4,2018-11-02,2018-10-26,Inv.,pdf,AAL.OQ,,"Estimate Revision, Recommendation Decrease, Ta...",American Airlines Group Inc.,9.0,69.0,...,AAL,['AAL.OQ'],['nan'],1,male,2018,4,21020.0,20.0,2030.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98729,2018-01-29,2018-01-22,Inv.,pdf,"ZIONP.OQ, ZION.OQ",,,Zions Bancorporation - 4Q Core EPS $0.80 - So...,12.0,92.0,...,ZION,"['ZIONP.OQ', 'ZION.OQ']",['nan'],1,male,2018,1,84129.0,40.0,4010.0
98730,2018-01-29,2018-01-22,Inv.,pdf,"ZIONP.OQ, ZION.OQ",,,Solid Fees Offset Slightly Smaller Balance She...,7.0,100.0,...,ZION,"['ZIONP.OQ', 'ZION.OQ']",['nan'],1,,2018,1,84129.0,40.0,4010.0
98731,2018-02-05,2018-01-22,Inv.,pdf,"ZIONP.OQ, ZION.OQ",,,4Q17 - Post Conference Call Model Update,7.0,69.0,...,ZION,"['ZIONP.OQ', 'ZION.OQ']",['nan'],1,male,2018,1,84129.0,40.0,4010.0
98732,2018-01-17,2018-01-10,Inv.,pdf,"ZIONP.OQ, ZION.OQ",,,Zions Bancorporation,9.0,69.0,...,ZION,"['ZIONP.OQ', 'ZION.OQ']",['nan'],1,male,2018,1,84129.0,40.0,4010.0


In [35]:
all_companies = get_all_companies(df, indexes)

In [234]:
all_companies

{'AAL.OQ',
 'AAWW.OQ',
 'ALK.N',
 'ARCB.OQ',
 'AZUL.N',
 'CHRW.OQ',
 'CNR.TO',
 'CP.TO',
 'CSX.OQ',
 'CYRX.OQ',
 'DAL.N',
 'DSKE.OQ',
 'ECHO.OQ',
 'EXPD.OQ',
 'FDX.N',
 'GWR.N',
 'HAG.DE',
 'HTLD.OQ',
 'HUBG.OQ',
 'JBHT.OQ',
 'JBLU.OQ',
 'KNX.N',
 'KSU.N',
 'MESA.OQ',
 'MSFT.OQ',
 'NEO.OQ',
 'NEO.TO',
 'NSC.N',
 'ODFL.OQ',
 'ORBC.OQ',
 'RLGT.A',
 'SAIA.OQ',
 'UAL.OQ',
 'UNP.N',
 'UPS.N',
 'UV.N',
 'VSAT.OQ',
 'WERN.OQ',
 'XPO.N',
 'YRCW.OQ'}

In [36]:
all_files_dcns = []

In [37]:
# Due to excessive time needed to transfer files from Compute Canada to local disk, I only use subset of companies for this example
for companies in ["AAL.OQ", 'ALK.N', 'FDX.N', "DAL.N", "UAL.OQ"]:
    all_files_dcns += get_company_files(dcns, companies)

In [38]:
all_files_dcns

[('.\\PDFParsing\\parsed_files\\AAL.OQ\\2018-10-02-NEO.TO-Neo Perfor-Raymond James Ltd. (-NEO Rev Up Your Investment Engine (Full Report)-83041100.txt',
  83041100),
 ('.\\PDFParsing\\parsed_files\\AAL.OQ\\2018-10-09-AAL.OQ-American A-Cowen and Company-3Q18 RASM To High End; Improvement Needs To Contin...-83097864.txt',
  83097864),
 ('.\\PDFParsing\\parsed_files\\AAL.OQ\\2018-10-09-AAL.OQ-American A-EVERCORE ISI-AAL  Adjusting Estimates-83099925.txt',
  83099925),
 ('.\\PDFParsing\\parsed_files\\AAL.OQ\\2018-10-09-AAL.OQ-American A-Morningstar, Inc.-Morningstar  AAL Updated Star Rating from 09 Oct ...-83100129.txt',
  83100129),
 ('.\\PDFParsing\\parsed_files\\AAL.OQ\\2018-10-09-AAL.OQ-American A-Morningstar, Inc.-Morningstar  American Airlines Updates Guidance; ...-83098692.txt',
  83098692),
 ('.\\PDFParsing\\parsed_files\\AAL.OQ\\2018-10-09-AAL.OQ-American A-Wolfe Research-Airlines - AAL Investor Update - We Thought it Wa...-83097580.txt',
  83097580),
 ('.\\PDFParsing\\parsed_file

In [242]:
analyst_to_index = construct_analyst_index_mapping(df, all_files_dcns)

<h2> Now we have got all information needed, we will run LDA with corpus of all the files in all_files_dcns list and we will get a topic distribution for reports each analyst covered </h2>

In [206]:
import nltk 
# nltk.download('punkt')
from nltk.tokenize import word_tokenize 
from gensim import corpora, models, similarities
import os

In [207]:
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline


In [243]:
words=[]

did=0
for fname, _ in all_files_dcns:
    f = open(fname, 'r')
    result = f.read()
    tokens = word_tokenize(result)
    tokens = list(filter(("--").__ne__, tokens))
    tokens = list(filter(("fy").__ne__, tokens))
    words.append(tokens)
    did+=1

In [244]:
num_topics = 8

In [245]:
dictionary_LDA = corpora.Dictionary(words)
#dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in words]

num_topics = 8
%time lda_model = models.LdaMulticore(corpus=corpus,\
                                        id2word=dictionary_LDA,\
                                        num_topics=num_topics, \
                                        random_state=100,\
                                        chunksize=10,\
                                        passes=10,\
                                        alpha=0.3,\
                                        eta=0.6)

Wall time: 9.6 s


In [246]:
analyst_to_index

{'Credit Suisse Global Product Marketing': [0, 1, 2, 3, 165],
 'Mr. Frederic Bastien, CFA': [4, 34],
 'Ms. Helane R. Becker': [5,
  14,
  32,
  35,
  44,
  62,
  64,
  67,
  69,
  85,
  92,
  93,
  98,
  101,
  106,
  113,
  127,
  131,
  140,
  154,
  157,
  160],
 'Duane T. Pfennigwerth, CFA': [6, 17, 36, 47, 65, 73, 155],
 'Danny Goode': [7,
  8,
  19,
  20,
  33,
  37,
  38,
  49,
  50,
  63,
  129,
  135,
  141,
  163,
  166,
  170],
 'Hunter K. Keay': [9, 39, 79, 89, 94, 133, 153, 164],
 'Mr. Michael W. Derchin': [10, 31, 40, 61, 66, 90, 128, 138, 150, 158],
 'Michael J. Linenberg': [11, 16, 41, 46, 72, 87, 134, 139, 161],
 'Andrew G. Didora, CFA': [12, 42, 70, 84, 126, 151, 169],
 'Mr. James A. Corridore': [13,
  15,
  43,
  45,
  68,
  71,
  75,
  104,
  109,
  112,
  130,
  137,
  144,
  159],
 'Adam Hackel': [18, 48, 162],
 'Mr. Jack L. Atkins': [21, 51, 102, 118, 132, 167],
 'Daniel J. McKenzie, CFA': [22, 52, 74, 91, 136, 152],
 'Mr. Christopher D. Quilty': [23, 53],
 'Mr. 

In [255]:
matrix = []
for analyst, indexes in analyst_to_index.items():
    row = [0] * num_topics
    all_words = []
    for i in indexes:
        all_words.extend(words[i])
    topics = lda_model.get_document_topics(dictionary_LDA.doc2bow(all_words), minimum_probability = 1e-4)
    for index, dist in topics:
        row[index] = dist
    matrix.append(row)
matrix = np.array(matrix)

<h2> Calculating the shapley values </h2>

In [258]:
import numpy as np
import Powerset as ps
import pandas as pd

def normalize_rows(x: np.ndarray): # function to normalize rows in a two-dimensional materix
    return x/np.linalg.norm(x, ord=2, axis=1, keepdims=True)

    
# shapley values function
def shapley_values(loading_matrix):
    
    loading_matrix=normalize_rows(loading_matrix)

    no_analysts=np.shape(np.dot(loading_matrix,loading_matrix.T))[1] # number of analysts
    list_analysts=[x for x in range(no_analysts)]
    data=pd.DataFrame(columns={'Analyst','InfoContribution'})

    for k in range(no_analysts):
        list_minusone=[x for x in list_analysts if x!=k] # list without the analyst
        all_sets=[x for x in ps.powerset(list_minusone) if x]

        shapley_value=[]

        for coalition in all_sets:
            
            other_coal=loading_matrix[coalition,:].sum(axis=0)
            other_coal=other_coal/np.linalg.norm(other_coal,ord=2,axis=0,keepdims=True)
            
            contribution=1-np.dot(other_coal,loading_matrix[k,:])

            shapley_value.append(contribution)

            #print(coalition, np.dot(other_coal,loading_matrix[k,:]), contribution)

        #print(np.array(shapley_value).mean())
        data=data.append({'Analyst': k,'InfoContribution': np.array(shapley_value).mean()},ignore_index=True)
        
    return data


# get informational diversity measure
def diversity(loading_matrix):
    ld_matrix_norm=normalize_rows(loading_matrix) # normalize all row vectors
    cosine_matrix=np.dot(ld_matrix_norm,ld_matrix_norm.T) # compute dot products across normalized rows
    avg_similarity=cosine_matrix[np.triu_indices(np.shape(cosine_matrix)[1],k=1)].mean()
    
    if np.shape(loading_matrix)[0]==1:
        return 0
    else:
        return 1-avg_similarity

In [260]:
matrix

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 9.99864340e-01, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.08690758e-04, 9.99707639e-01, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.59938391e-02,
        9.78265762e-01, 0.00000000e+00, 0.00000000e+00, 5.67171536e-03],
       [1.81101466e-04, 2.49074027e-01, 1.95501678e-04, 2.63312645e-03,
        4.71199840e-01, 5.83241973e-03, 3.16843987e-02, 2.39199623e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.99956667e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 7.84895062e-01, 2.11509666e-03, 9.79568064e-03,
        3.11307702e-02, 1.25849947e-01, 0.00000000e+00, 4.60288264e-02],
       [0.00000000e+00, 3.21828248e-03, 0.00000000e+00, 0.00000000e+00,
        9.92820442e-01, 0.00000000e+00, 0.00000000e+00, 3.

In [261]:
len(matrix)

29

In [259]:
shapley_values(matrix)

MemoryError: 

In [56]:
directory = r".\PDFParsing\clean_txt_flat"

In [57]:
files = []
temp = os.path.join(directory)

In [58]:
temp

'.\\PDFParsing\\clean_txt_flat'

In [59]:
list_files = os.listdir(temp)

In [60]:
list_files

['2018-01-01-BDX.N-Becton Dic-Wells Fargo Securiti-BDX Raising Our Estimates Following BCR Deal Clos...-80447381.txt',
 '2018-01-01-CSX.OQ-CSX Corp-Morningstar, Inc.-Morningstar  We expect CSX to continue improving ...-80445915.txt',
 '2018-01-01-ILMN.OQ-Illumina I-Wells Fargo Securiti-ILMN 2018 Could Resolve Controversies--Upgrading ...-80446616.txt',
 '2018-01-01-INCY.OQ-Incyte Cor-RBC Capital Markets-Incyte Corporation  - Upgrade to OP RewardRisk I...-80446815.txt',
 '2018-01-01-LEN.N-Lennar Cor-Wells Fargo Securiti-LEN Upgrading LEN.  Attractive Without CAA, Compe...-80446024.txt',
 '2018-01-01-LNC.N-Lincoln Na-Deutsche Bank-Lincoln  Initiating coverage with a Buy Rating-80444120.txt',
 '2018-01-01-MET.N-Deutsche Bank-MetLife  Initiating Coverage with a Hold Recommendation-80444125.txt',
 '2018-01-01-PRU.N-Prudential-Deutsche Bank-Prudential  Initiating Coverage with a Hold Ratin...-80444039.txt',
 '2018-01-01-STZ.N-Susquehanna Financia-3Q18 Beat Likely, But Upside is Limited this 