Setup

In [1]:
import os

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import nltk
import string
import re
from nltk import word_tokenize, FreqDist,regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer,PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.cluster import AgglomerativeClustering, KMeans, AffinityPropagation
from sklearn.cluster import DBSCAN, OPTICS,MeanShift
from scipy.cluster.hierarchy import dendrogram,linkage

In [5]:
from bs4 import BeautifulSoup as bs
from itertools import takewhile
import requests
import json

In [6]:
%run Code/Functionality.py

EDA

In [None]:
# Tally the universe by the first character of each SIC code.
sic_first_char = Universe.SIC.map(lambda code: str(code)[:1])
sic_first_char.value_counts()

Stock Price Correlations

Dataframes for Correlations of Daily Changes (Absolute & Relative to Benchmark)

In [None]:
# Pull market data for every ticker in the universe index. The helper returns
# three objects; abs_data and rel_data are indexed positionally ([0], [1], [2])
# by the cells that follow.
daily_prices,abs_data,rel_data=collect_market_data(Universe.index)

Quick look at the correlations between daily changes, both absolute and relative to the S&P.

In [None]:
# Display element 1 of abs_data (the same element the heatmap cell below
# plots as the correlation matrix).
abs_data[1]

In [None]:
# Display element 0 of rel_data.
# NOTE(review): the absolute view above shows element [1] while this shows
# element [0], and the relative heatmap below plots rel_data[1] — confirm
# which element was intended here.
rel_data[0]

Look @ Some Heatmaps of Correlations

In [None]:
# Lower-triangle heatmap of the matrix held in abs_data[1], saved as a PDF.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 15))
mtrx = abs_data[1]

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.zeros_like(mtrx)
upper_rows, upper_cols = np.triu_indices_from(mtrx)
mask[upper_rows, upper_cols] = True

sns.heatmap(mtrx, mask=mask, ax=ax, cmap='RdBu', vmin=-1, vmax=1)
ax.set_title('Correlations btwn 2011 & 2020')
# Clear the axis labels after the heatmap has been drawn.
ax.set_xlabel(None)
ax.set_ylabel(None)

pdf_options = dict(
    orientation='landscape',
    pad_inches=0.0,
    bbox_inches='tight',
    format='pdf',
)
fig.savefig(image_path + 'Corr Matrix.pdf', **pdf_options)

In [None]:
# Lower-triangle heatmap of the matrix held in rel_data[1], saved as a PDF.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 15))
mtrx = rel_data[1]

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.zeros_like(mtrx)
upper_rows, upper_cols = np.triu_indices_from(mtrx)
mask[upper_rows, upper_cols] = True

sns.heatmap(mtrx, mask=mask, ax=ax, cmap='RdBu', vmin=-1, vmax=1)
ax.set_title('Relative Correlations btwn 2011 & 2020')

pdf_options = dict(
    orientation='landscape',
    pad_inches=0.0,
    bbox_inches='tight',
    format='pdf',
)
fig.savefig(image_path + 'Corr Matrix_Relative.pdf', **pdf_options)

In [None]:
# One masked heatmap per entry of `coverage`, laid out on a 2x5 grid that
# shares both axes; each panel uses the slice of rel_data[2] keyed by
# '<year>-12-31'.
fig, ax = plt.subplots(
    ncols=5, nrows=2, figsize=(25, 15), sharex=True, sharey=True,
)
fig.suptitle('Rltve. Corr. by Year (2011-2020)')
axes = ax.reshape(-1)

for panel_idx, y in enumerate(coverage):
    a = axes[panel_idx]
    corr_mat = rel_data[2].xs(y + '-12-31')

    # Hide the upper triangle (diagonal included).
    mask = np.zeros_like(corr_mat)
    tri_rows, tri_cols = np.triu_indices_from(mask)
    mask[tri_rows, tri_cols] = True

    a.set_title(y)
    sns.heatmap(
        corr_mat, mask=mask, ax=a,
        vmin=-1, vmax=1, cmap='RdBu', cbar=False,
    )

fig.savefig(
    image_path + 'Annual Corr Matrices_Relative.pdf',
    orientation='landscape',
    pad_inches=0.0,
    bbox_inches='tight',
    format='pdf',
)

Given the number of time series in this dataset, it is hard to draw any inferences from these heatmaps.  Let's look at a different visualization technique: a network graph whose edges connect stocks with an absolute correlation above 0.25.

In [None]:
# Draw one graph per entry of `coverage` on a 2x5 grid. Nodes are the index of
# filter_decile(Sigma, 0.8); an edge joins two nodes when the absolute value of
# their entry in that year's matrix exceeds 0.25 (green if positive, red if
# negative).
# NOTE(review): `graphx` is used with the networkx Graph/draw API — presumably
# `import networkx as graphx` in the %run helper module; confirm.
fig,axes=plt.subplots(ncols=5,nrows=2,figsize=(25,15))
axes=axes.reshape(-1)
matrix=rel_data[2]
for i,yr in enumerate(coverage):
    Sigma=matrix.xs(yr+'-12-31')
    Tops=filter_decile(Sigma,0.8)
    G=graphx.Graph()
    G.add_nodes_from(Tops.index)
    a=axes[i]
    a.set_title(yr)
    # Visit every unordered pair exactly once. The graph is undirected, so the
    # previous nested loop over (a, b) and (b, a) added each edge twice and
    # appended two colour entries per edge.
    # Assumes Sigma is symmetric (a correlation matrix) — TODO confirm.
    nodes=list(G.nodes)
    for pos,tick_a in enumerate(nodes):
        for tick_b in nodes[pos+1:]:
            p=Sigma[tick_a][tick_b]
            if abs(p)>0.25:
                clr='green' if p>0 else 'red'
                G.add_edge(tick_a,tick_b,color=clr,weight=abs(p))
    # Read colours back in the graph's own edge order so that edge_color lines
    # up one-to-one with the edges that draw() renders.
    colors=[clr for _,_,clr in G.edges(data='color')]
    graphx.draw(G,with_labels=True,edge_color=colors,node_size=20,ax=a)
fig.savefig(image_path+'Annual Universe Structure.pdf',
            orientation='landscape',
            pad_inches=0.0,
            bbox_inches='tight',
            format='pdf'
           )

Moving on to the text...

In [None]:
# Run the per-stock visualization for every ticker in Filed's index.
for ticker in Filed.index:
    visualize_stock(ticker)

In [None]:
# Run the whole-year visualization for 2016 through 2020 (years as strings).
for yr in [str(year) for year in range(2016, 2021)]:
    entire_year(yr)

In [None]:
# Visualize each predefined group (the sic_* objects are defined outside this
# notebook section) under the label passed as the second argument.
visualize_group(sic_7372,'Group 7372')
visualize_group(sic_ex7,'Group ex7372')
visualize_group(sic_4,'Group 4')
visualize_group(sic_3,'Group 3')
visualize_group(sic_56,'Groups 5+6')

Let's look at some simple clustering algorithms on the 2020 subset.  This will provide some insight into the different options we have for clustering.

Create some feature engineers with different vectorizer parameters (all TF-IDF).

In [9]:
# Build one extractor per max_features setting over the 2020 corpus, keyed by
# the setting rendered as a string.
corpus = collect_texts_year('2020')
engineers = {}
for n in [10, 26, 50, 100, 200, 500, 1000, 1500, 2000, 5000]:
    # `parms` is rebound on every pass, so after the loop it holds the last
    # configuration (max_features=5000).
    parms = {'max_df': 0.99, 'max_features': n, 'min_df': 0.01}
    label = 'max_features' + str(n)
    engineers[str(n)] = create_extractor(corpus.values(), parms, label)

Define an agglomerative (hierarchical) clustering model

In [None]:
# Agglomerative clustering model with 20 clusters; it is refit once per
# feature extractor in the next cell.
Agg=AgglomerativeClustering(n_clusters=20)

In [None]:
# For every extractor: vectorize the corpus, save a dendrogram of the result,
# then record the labels assigned by the 20-cluster agglomerative model.
groupings = pd.DataFrame(index=corpus.keys(), columns=engineers.keys())
leaf_labels = list(corpus.keys())

for max_ftr, engineer in engineers.items():
    X = engineer.transform(corpus.values())
    X_vect = pd.DataFrame.sparse.from_spmatrix(X)

    fig, ax = plt.subplots(figsize=(25, 10))
    ax.set_title(max_ftr)
    tree = linkage(X_vect)
    dendrogram(
        tree,
        p=25,
        truncate_mode='level',
        labels=leaf_labels,
        leaf_rotation=45.,
        leaf_font_size=20.,
    )
    plt.savefig(
        image_path + 'Dendograms/2020_MaxF' + max_ftr + '.pdf',
        orientation='landscape',
        pad_inches=0.0,
        bbox_inches='tight',
        format='pdf',
    )

    Agg.fit(X.toarray())
    groupings[max_ftr] = Agg.labels_

groupings.to_excel(parent_path + 'Data/Results/Different_Max_Features.xlsx')

In [None]:
# Cluster-size table: rows are the label values 0..19, one column per
# extractor, filled from the label counts in `groupings`.
distributions = pd.DataFrame(index=range(20), columns=engineers.keys())
for column in engineers.keys():
    label_counts = groupings[column].value_counts()
    distributions[column] = label_counts

In [None]:
# NOTE(review): this loop only overwrites X on every pass and never uses it
# inside the loop, so when it finishes X holds the transform produced by the
# last entry of `engineers`. It looks like leftover scratch work — confirm
# whether the cell can be removed.
for feature,engineer in engineers.items():
    X=engineer.transform(corpus.values())

In [None]:
# K-means models for several cluster counts, keyed by the count as a string.
# Each model is seeded with STATE (the same seed the AffinityPropagation cell
# uses) so that repeated runs of the notebook give the same labels.
cluster_counts=(3,5,8,10,15,25)
clusterers={str(k):KMeans(n_clusters=k,random_state=STATE) for k in cluster_counts}
# Keep the individual names available for any cell that refers to them.
K3,K5,K8,K10,K15,K25=(clusterers[str(k)] for k in cluster_counts)

In [None]:
# Fit every k-means model on the same feature matrix and store one column of
# labels per cluster count.
# The columns are the clusterer keys: initialising them from engineers.keys()
# left unrelated, all-NaN columns ('26', '50', ...) in the exported sheet.
groupings=pd.DataFrame(index=corpus.keys(),columns=clusterers.keys())
# NOTE(review): `parms` is whatever an earlier cell last assigned; on a
# top-to-bottom run that is the max_features=5000 configuration left by the
# extractor-building loop — confirm this is the intended setting.
engineer=create_extractor(corpus.values(),parms)
X=engineer.transform(corpus.values())
# Densify once instead of once per model.
X_dense=X.toarray()
for n,clusterer in clusterers.items():
    clusterer.fit(X_dense)
    groupings[n]=clusterer.labels_
groupings.to_excel(parent_path+'Data/Results/KNNs.xlsx')

In [None]:
# Affinity propagation on a 1000-feature extraction of the corpus; the
# ticker/label pairs are exported to Excel.
parms = {
    'max_df': 0.99,
    'max_features': 1000,
    'min_df': 0.01,
}
engineer = create_extractor(corpus.values(), parms)
X = engineer.transform(corpus.values())

Aff = AffinityPropagation(max_iter=500, random_state=STATE)
Aff.fit(X.toarray())

ticker_label_pairs = zip(corpus.keys(), Aff.labels_)
Aff_Clusters = pd.DataFrame(ticker_label_pairs, columns=['Ticker', 'Category'])
Aff_Clusters.to_excel(parent_path + 'Data/Results/Affinity_Labels.xlsx')

In [None]:
# Summary table: one row per cluster, taken from the row of each cluster
# centre, indexed by category and annotated with the cluster size.
centers = Aff.cluster_centers_indices_.tolist()
category_sizes = Aff_Clusters.Category.value_counts()

Aff_Summary = Aff_Clusters.loc[centers].set_index(['Category'])
Aff_Summary['Count'] = category_sizes
Aff_Summary.to_excel(parent_path + 'Data/Results/Affinity_Summary.xlsx')

In [None]:
# Load the corpora compared below: everything, the 2018-2020 span, and each
# of those three years on its own.
corpus_full = collect_texts_all()
corpus_3yr = collect_texts_years(['2020', '2019', '2018'])
corpus_20, corpus_19, corpus_18 = (
    collect_texts_year(single_year) for single_year in ('2020', '2019', '2018')
)

In [None]:

# Compare how the corpus used to fit the extractor affects 15-cluster k-means
# labels for the 2020 texts. Each extractor is fitted on a different corpus;
# all of them then transform the same 2020 texts.
parms={'max_df':0.99,'max_features':1000,'min_df':0.01}
engineer1=create_extractor(corpus_full.values(),parms,'Alltime')
# Fixed: the '3_Years' extractor was being fitted on corpus_full, which made
# it a duplicate of the 'Alltime' extractor.
engineer2=create_extractor(corpus_3yr.values(),parms,'3_Years')
engineer3=create_extractor(corpus_20.values(),parms,'2020s')
engineer4=create_extractor(corpus_19.values(),parms,'2019s')
engineer5=create_extractor(corpus_18.values(),parms,'2018s')

X1,X2,X3,X4,X5=(
    extractor.transform(corpus_20.values())
    for extractor in (engineer1,engineer2,engineer3,engineer4,engineer5)
)

clusterer=clusterers['15']

# The same model is refit for each matrix; labels_ is read immediately after
# each fit, before the next fit replaces it.
clustering1,clustering2,clustering3,clustering4,clustering5=(
    clusterer.fit(features.toarray()).labels_
    for features in (X1,X2,X3,X4,X5)
)

# Fixed: clustering5 was missing from the zip (5 data columns against 6 column
# names raises a ValueError), and the tickers now come from corpus_20, the
# corpus that was actually transformed, rather than a leftover `corpus`.
DF=pd.DataFrame(zip(corpus_20.keys(),clustering1,clustering2,clustering3,clustering4,clustering5),
                columns=['Ticker','Category-Full','Category-3yr','Category-20','Category-19','Category-18'],
               )
DF.to_excel(parent_path+'Data/Results/Comp_Dictionary.xlsx')

Clustering with agglomerative clustering, done two ways.

In [None]:
# Tickers whose Filed flag is set in every one of the last five years.
five_years = [str(year) for year in range(2020, 2015, -1)]

# AND the yearly flag columns together, newest year first.
filed_all_years = Filed[five_years[0]]
for later_col in five_years[1:]:
    filed_all_years = filed_all_years & Filed[later_col]

stocks = Filed[filed_all_years].index

Way 1: Cluster entire corpus simultaneously.

In [None]:
# full_corpus: texts collected for the five years (used to fit the extractor).
# corpus: texts collected for the selected stocks over the same years; its keys
# are later split on '_' into a stock part and a year part.
full_corpus=collect_texts_years(five_years)
corpus=collect_texts(stocks,five_years)

In [None]:
# Extractor fitted on the full five-year corpus, plus a 20-cluster
# agglomerative model.
parms = {
    'max_df': 0.99,
    'max_features': 1000,
    'min_df': 0.01,
}
engineer = create_extractor(full_corpus.values(), parms)
clusterer = AgglomerativeClustering(n_clusters=20)

In [None]:
# Vectorize every stock-year document at once and cluster them together.
# The model needs a dense array, hence toarray().
X=engineer.transform(corpus.values())
clusterer.fit(X.toarray())

In [None]:
# Table of cluster labels for every stock-year document. The corpus keys are
# split on '_' into a stock part and a year part.
DF_All=pd.DataFrame(zip(clusterer.labels_,corpus.keys()),columns=['Group','Stock_Year'])
key_parts=DF_All.Stock_Year.str.split('_')
DF_All['Stock']=key_parts.str[0]
DF_All['Year']=key_parts.str[1]
# Fixed: set_index returns a new frame; the result was previously discarded,
# so the export never received the (Stock, Year) index.
DF_All=DF_All.set_index(['Stock','Year'])
DF_All.to_excel(parent_path+'Data/Results/5Yr_Simultaneous.xlsx')

Way 2:  Split the corpus into 5 parts (one for each year) prior to applying clustering.

In [None]:
# Empty stocks-by-year table for the per-year labels, and an extractor fitted
# on the full multi-year corpus.
parms = {
    'max_df': 0.99,
    'max_features': 1000,
    'min_df': 0.01,
}
DF_byYear = pd.DataFrame(index=stocks, columns=five_years)
engineer = create_extractor(full_corpus.values(), parms)

In [None]:
# Cluster each year separately and store that year's labels as a column.
# Labels are assigned positionally — presumably collect_texts returns documents
# in the same order as `stocks`; verify against the helper.
for y in five_years:
    corpus = collect_texts(stocks, [y])
    X = engineer.transform(corpus.values())
    DF_byYear[y] = clusterer.fit(X.toarray()).labels_

In [None]:
# Export the per-year cluster labels (one row per stock, one column per year).
DF_byYear.to_excel(parent_path+'Data/Results/5Yr_Annuals.xlsx')

Calculate Weightings to generate similarity matrix.

In [None]:
# Per-year weights (they sum to 1.0, with 2020 weighted most heavily) used to
# turn the per-year labels into a similarity matrix, which is then exported.
weights = {
    '2020': 0.3,
    '2019': 0.2,
    '2018': 0.2,
    '2017': 0.15,
    '2016': 0.15,
}
S = groups_to_similarity(DF_byYear, weights)
S.to_excel(parent_path + 'Data/Results/5yr_Similarity.xlsx')

The second methodology again, but with a 10-year horizon

In [None]:
# Tickers whose Filed flag is set in every year from 2020 back to 2011, and
# the corpus collected for those ten years.
ten_years = [str(year) for year in range(2020, 2010, -1)]

# AND the yearly flag columns together, newest year first.
filed_all_years = Filed[ten_years[0]]
for later_col in ten_years[1:]:
    filed_all_years = filed_all_years & Filed[later_col]

stocks = Filed[filed_all_years].index
full_corpus = collect_texts_years(ten_years)

In [None]:
# Extractor fitted on the ten-year corpus, plus a fresh 20-cluster
# agglomerative model.
parms = {
    'max_df': 0.99,
    'max_features': 1000,
    'min_df': 0.01,
}
engineer = create_extractor(full_corpus.values(), parms)
clusterer = AgglomerativeClustering(n_clusters=20)

In [None]:
# Cluster each of the ten years separately, store the labels per year, and
# export the table.
# Labels are assigned positionally — presumably collect_texts returns documents
# in the same order as `stocks`; verify against the helper.
DF_byYear = pd.DataFrame(index=stocks, columns=ten_years)
for y in ten_years:
    corpus = collect_texts(stocks, [y])
    X = engineer.transform(corpus.values())
    DF_byYear[y] = clusterer.fit(X.toarray()).labels_

DF_byYear.to_excel(parent_path + 'Data/Results/10Yr_Annuals.xlsx')

In [None]:
# Per-year weights (they sum to 1.0, with recent years weighted more heavily)
# used to turn the per-year labels into a similarity matrix, which is exported.
weights = {
    '2020': 0.20,
    '2019': 0.20,
    '2018': 0.15,
    '2017': 0.10,
    '2016': 0.10,
    '2015': 0.05,
    '2014': 0.05,
    '2013': 0.05,
    '2012': 0.05,
    '2011': 0.05,
}
S = groups_to_similarity(DF_byYear, weights)
S.to_excel(parent_path + 'Data/Results/10yr_Similarity.xlsx')