### CPSC 672 Network Project

In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import random
from random import sample
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib as mpl
import ast
from scipy import spatial
import csv
%matplotlib inline


### Data

The following few cells read in our data from a excel spreadsheet and organize them into a few arrays:
- stock_symbols contains all the actual stock names
- dates contains all the column headings (which are dates) from Feb 2010 to Dec 2021

In [3]:
df = pd.read_excel('data/all_data.xlsx', sheet_name='percentChange')


In [9]:
stock_symbols = []
for stock in df['Stock']:
    stock_symbols.append(stock)

dates = df.columns.values
dates = dates[2:]

In [12]:
def getSimilarities():
    similarities = []
    for i in range(len(df)):
        for j in range(i+1,len(df)):
            result = 1 - spatial.distance.cosine(df.iloc[i,2:], df.iloc[j,2:])
            similarities.append((stock_symbols[i], stock_symbols[j], result))
    return similarities


### Calculating links

To get the links between any two stocks we have to obtain the correlation with the following formula:

                            corr(a,b) = min(a,b) / max(a,b)
the getLinks() function does just that by comparing every stock, with every other stock, for any input date (column)

In [6]:
def getLinks(date):
    links = []
    elems = len(df[date])
    for i in range(elems):
        for j in range(i+1,elems):
            correlation = min(df[date][i],df[date][j]) / max(df[date][i],df[date][j])
            links.append((stock_symbols[i],stock_symbols[j],correlation))
    return links

In [7]:
test = getLinks(dates[0])
print(test[0:100])

  correlation = min(df[date][i],df[date][j]) / max(df[date][i],df[date][j])
  correlation = min(df[date][i],df[date][j]) / max(df[date][i],df[date][j])


[('AA', 'AACG', 0.16791033158814497), ('AA', 'AAIC', 0.22314164484340382), ('AA', 'AAL', 0.11770347274617851), ('AA', 'AAME', -0.6426857304598126), ('AA', 'AAON', 0.48810090722509364), ('AA', 'AAP', 0.7642582545378946), ('AA', 'AAPL', 0.6846894458558584), ('AA', 'AATC', 0.1353535037171861), ('AA', 'AAU', 0.5565028387219726), ('AA', 'AAWW', 0.19523641705922312), ('AA', 'AB', 0.8797991624915225), ('AA', 'ABB', 0.36202377724819096), ('AA', 'ABC', 0.6390325884338439), ('AA', 'ABCB', 0.38258321799206124), ('AA', 'ABEO', -0.8805263490230542), ('AA', 'ABEV', 0.9070907018368913), ('AA', 'ABG', 0.8851265776324588), ('AA', 'ABIO', 0.3811132103595605), ('AA', 'ABM', 0.8203321052573571), ('AA', 'ABMD', 0.16192998621115146), ('AA', 'ABR', 0.3198292169604546), ('AA', 'ABST', 0.31678481473975956), ('AA', 'ABT', 0.565295253800409), ('AA', 'ABUS', -0.5955554375610249), ('AA', 'ABVC', -3.1904788024007598), ('AA', 'ACAD', 0.9328360057137889), ('AA', 'ACC', 0.580280111223393), ('AA', 'ACCO', -1.5372298567

### Exporting
At this point all the stock symbols are exported to a csv,
similarly the links for a particular date are also exported to a csv.

In [8]:
with open('data/stocks.csv','w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["ID	label"])
    for stock in stock_symbols:
        writer.writerow([str(stock_symbols.index(stock))+"\t"+stock])

In [9]:
with open('data/links.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Source	Target	Weight"])
    for link in test:
        if(link[2] > 0.80 and link[2] <= 1):
            writer.writerow([str(stock_symbols.index(link[0]))+"\t"+str(stock_symbols.index(link[1]))+"\t"+str(link[2])])

In [None]:
 cosine_similarities = getSimilarities()
with open('data/links_cosine.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Source	Target	Weight"])
    for link in cosine_similarities:
        if(link[2] > 0.80 and link[2] <= 1):
            writer.writerow([str(stock_symbols.index(link[0]))+"\t"+str(stock_symbols.index(link[1]))+"\t"+str(link[2])])