In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import random
from random import sample
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib as mpl
import ast
from scipy import spatial
import csv
from tqdm.notebook import tqdm
%matplotlib inline

In [2]:
df_total = pd.read_excel('data/all_data.xlsx', sheet_name='percentChange',engine='openpyxl')
stocks = pd.read_csv('data/stocks.csv',)
links_total = pd.read_csv('data/links_total.csv',)

In [3]:
all_sectors = {}
for sect in df_total['Sector']:
    if(sect in all_sectors):
        all_sectors[sect]+=1
    else:
        all_sectors[sect]=1
sorted_sectors = sorted(all_sectors.items(),key=lambda x:x[1],reverse=True)

### Identifying Popular Sectors
The snippet below prints all the sectors from our raw data, these include banking, software, pharmaceuticals and many more.
With these sectors now identified, we can begin exploring the relationship between them, and see how stocks within the same sectors relate to eachother.

In [None]:
for sector in sorted_sectors:
    print(sector)

In [97]:
top_sectors = [
'Banks',
'Biotechnology',
'Oil, Gas & Consumable Fuels',
'Equity Real Estate Investment Trusts ...',
'Metals & Mining',
'Machinery',
'Electronic Equipment, Instruments & C...',
'Capital Markets',
'Health Care Equipment & Supplies',
'Insurance',
'Semiconductors & Semiconductor Equipment',
'Software',
'Specialty Retail',
'IT Services',
'Commercial Services & Supplies',
'Pharmaceuticals',
'Chemicals',
'Hotels, Restaurants & Leisure',
'Communications Equipment',
'Health Care Providers & Services',
'Food Products',
'Media',
'Aerospace & Defense',
'Energy Equipment & Services',
'Electrical Equipment',
'Textiles, Apparel & Luxury Goods',
'Electric Utilities',
'Trading Companies & Distributors',
'Real Estate Management & Development',
'Entertainment',
'Technology Hardware, Storage & Periph...',
'Construction & Engineering',
'Beverages',
'Airlines',
'Automobiles'
]

### Getting Stocks by sector
After we have obtained the top sectors, the snippet of code below goes through the data and finds all the stocks corresponding to our top sectors

In [67]:
stocks_by_sector = []
for sector in top_sectors:
    stocks = df_total.loc[df_total['Sector'] == sector]
    value = (sector,stocks)
    stocks_by_sector.append(value)

### Calculating Average Intra-Sector Correlation
The following code blocks calculate the average weight of a link between all the stocks in a given sector. For example, in the Bank sector, the functions will go and obtain the links from one bank stock to every other bank stock, and add them to the total. Then this total will be divided by the number of links, to obtain the average weight. This is in hopes of answering our third and fourth research questions.

In [68]:
def calculateIntraSectorWeight(sector):
    max_corr = (0,"stock","stock")
    min_corr = (0,"stock","stock")
    sum_of_links = 0
    link_count = 0
    sector_index = top_sectors.index(sector)
    df = stocks_by_sector[sector_index][1]
    num_of_stocks = df.shape[0]
    for i in tqdm(range(num_of_stocks)):
        first_stock = df.iloc[i,2:]
        for j in range(i+1,num_of_stocks):
            second_stock = df.iloc[j,2:]
            result = 1-spatial.distance.cosine(first_stock,second_stock)
            if(result>max_corr[0]):
                max_corr = (result,df.iloc[i,0],df.iloc[j,0])
            if(result<min_corr[0]):
                min_corr = (result,df.iloc[i,0],df.iloc[j,0])
            sum_of_links += result
            link_count += 1
    
    average_link = sum_of_links / link_count
    return average_link,max_corr,min_corr

In [69]:
def getAllIntraSectorCorrelations(df):
    for count,sector in enumerate(top_sectors):
        avg, max_corr, min_corr = calculateIntraSectorWeight(sector)
        df.loc[count] = [sector,avg,(max_corr[0],max_corr[1],max_corr[2]),(min_corr[0],min_corr[1],min_corr[2])]

In [70]:
df_intra = pd.DataFrame(columns=["Sector","Average Correlation","Max Correlation","Min Correlation"])
getAllIntraSectorCorrelations(df_intra)

  0%|          | 0/271 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/128 [00:00<?, ?it/s]

  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/93 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/87 [00:00<?, ?it/s]

  0%|          | 0/87 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

In [71]:
df_intra.sort_values('Average Correlation', ascending=False)

Unnamed: 0,Sector,Average Correlation,Max Correlation,Min Correlation
23,Energy Equipment & Services,0.475037,"(0.8882941728141304, HAL, SLB)","(0, stock, stock)"
33,Airlines,0.439068,"(0.8928999467636979, CEA, ZNH)","(0, stock, stock)"
3,Equity Real Estate Investment Trusts ...,0.419302,"(0.9126866521624862, KIM, REG)","(-0.12823799536807146, IHT, SBAC)"
26,Electric Utilities,0.409064,"(0.8963693868337042, LNT, XEL)","(-0.1286715503196716, EDN, MGF)"
17,"Hotels, Restaurants & Leisure",0.390153,"(0.9866427501699437, CCL, CUK)","(-0.06058279447573067, CPHC, TAST)"
5,Machinery,0.379853,"(0.8504897147877548, VKI, VKQ)","(-0.2018622560312524, HURC, VKQ)"
0,Banks,0.352028,"(0.9390549555893505, BBD, ITUB)","(-0.25630984505676646, EGF, MCBC)"
27,Trading Companies & Distributors,0.343413,"(0.7824808642361912, HRI, URI)","(-0.06427591992427106, GIC, IGC)"
16,Chemicals,0.32663,"(0.7563468929285467, CE, EMN)","(-0.20268707348397674, ADES, NEV)"
9,Insurance,0.321942,"(0.8855548429127549, LNC, PRU)","(-0.18794377816264296, UNAM, WTM)"


### Calculating Average Intra-Sector Correlation
The next few blocks of code perform a similar calculation as above, however this time we are comparing the stocks in different sectors to eachother, rather than looking within the same sector.

In [98]:
def calculateIntraSectorCorrelations(sector1,sector2):
    sector1_index = top_sectors.index(sector1)
    df1 = stocks_by_sector[sector1_index][1]
    sector1_stock_count = df1.shape[0]
    
    sector2_index = top_sectors.index(sector2)
    df2 = stocks_by_sector[sector2_index][1]
    sector2_stock_count = df2.shape[0]
    
    link_count = 0
    sum_of_links = 0
    max_corr = (0,"stock","stock")
    min_corr = (0,"stock","stock")
    for i in range(sector1_stock_count):
        first_stock = df1.iloc[i,2:]
        for j in range(sector2_stock_count):
            second_stock = df2.iloc[j,2:]
            result = 1-spatial.distance.cosine(first_stock,second_stock)
            if(result>max_corr[0]):
                max_corr = (result,df1.iloc[i,0],df2.iloc[j,0])
            if(result<min_corr[0]):
                min_corr = (result,df1.iloc[i,0],df2.iloc[j,0])
            sum_of_links += result
            link_count += 1
    
    average_link = sum_of_links / link_count
    return average_link,max_corr,min_corr

In [99]:
def getAllInterSectorCorrelations(df):
    num_of_sectors = len(top_sectors)
    count = 1
    for i in tqdm(range(num_of_sectors)):
        sector1 = top_sectors[i]
        for j in range(i+1,num_of_sectors):
            sector2 = top_sectors[j]
            avg_sector_corr, max_corr, min_corr = calculateIntraSectorCorrelations(sector1,sector2)
            # print(count,sector1,sector2)
            df.loc[count] = [sector1,sector2,avg_sector_corr,
                             (max_corr[0],max_corr[1],max_corr[2]),
                             (min_corr[0],min_corr[1],min_corr[2])]
            count += 1

In [95]:
df_inter = pd.DataFrame(columns=["Sector1","Sector2","Average Correlation","Max Correlation","Min Correlation"])
getAllInterSectorCorrelations(df_inter)

  0%|          | 0/3 [00:00<?, ?it/s]

In [96]:
df_inter.sort_values('Average Correlation',ascending=False)

Unnamed: 0,Sector1,Sector2,Average Correlation,Max Correlation,Min Correlation
2,Banks,"Oil, Gas & Consumable Fuels",0.252826,"(0.7892128026074418, ITUB, PBR)","(-0.2554323254657924, FDBC, RRC)"
3,Biotechnology,"Oil, Gas & Consumable Fuels",0.136366,"(0.8914344905610084, GLV, GLO)","(-0.3043409627580509, ATHX, GLP)"
1,Banks,Biotechnology,0.135676,"(0.8726843265470963, BFZ, MYI)","(-0.37291324134746207, BFZ, VNDA)"
