# In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import itertools as it
import sys
import os
import glob
import seaborn
from operator import itemgetter
from itertools import groupby


# miscellaneous functions
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive chunks of ``n`` items and return
    the endpoints of each chunk.

    Args:
        iterable: Any iterable; it is consumed once, in order.
        n: Number of items per chunk.
        fillvalue: Value used to pad the final chunk when the number of items
            is not a multiple of ``n``.

    Returns:
        A list of ``[first, last]`` pairs, one per chunk. When the final chunk
        had to be padded, its ``last`` entry is ``fillvalue`` (``None`` by
        default) rather than the last real item.
    """
    # ``izip_longest`` only exists on Python 2; Python 3 renamed it to
    # ``zip_longest``. Resolve whichever one this interpreter provides.
    zip_longest = getattr(it, 'zip_longest', None) or it.izip_longest
    # n references to the *same* iterator make zip_longest pull n consecutive
    # items into each tuple.
    args = [iter(iterable)] * n
    return [[chunk[0], chunk[-1]]
            for chunk in zip_longest(*args, fillvalue=fillvalue)]


def reform_data(stks, idx):
    """Build a date-by-ticker table of day-to-day price changes for the
    tickers listed in an index-constituent file.

    Args:
        stks: Path to the stock CSV. It must contain ``TICKER``, ``date`` and
            ``PRC`` columns. The year is read from the first four characters
            of the third ``_``-separated field of the file name.
        idx: Path to the index-constituent CSV. It must contain ``co_tic``,
            ``from`` and ``thru`` columns. The index name is the part of the
            file name before the first ``_``.

    Returns:
        A tuple ``(new_df, stockyear, indexname, dates)`` where ``new_df`` is
        indexed by date with one column per ticker and holds the first
        difference of the price table (its first row is therefore NaN), and
        ``dates`` is the sorted array of dates present for the selected
        tickers.

    NOTE(review): ``date``, ``from`` and ``thru`` are compared with ``>`` and
    ``<``, so they are assumed to be mutually comparable (e.g. numeric
    YYYYMMDD values) -- confirm against the data files.
    """
    stockyear = stks.split(os.path.sep)[-1].split('_')[2][0:4]
    indexname = idx.split(os.path.sep)[-1].split('_')[0]

    # reading / organizing the datasets
    # ``date`` and ``from`` are ordinary columns, so sort by value. The sort
    # must not be done in place: in-place sorting returns None. A stable sort
    # keeps file order among equal keys, which ``keep='last'`` relies on below.
    stockslist = pd.read_csv(stks).sort_values('date', kind='mergesort')
    index = pd.read_csv(idx).sort_values('from', kind='mergesort')
    index = index.set_index(['co_tic'])

    # creating lists and relevant dataset
    indexlist = stockslist[stockslist['TICKER'].isin(index.index.values)]
    stocks = indexlist['TICKER'].unique()
    dates = np.sort(indexlist['date'].unique())
    indexlist = indexlist.set_index(['TICKER', 'date'])
    indexlist = indexlist[~indexlist.index.duplicated(keep='last')]
    prices = indexlist['PRC']

    new_df = pd.DataFrame(0.0, index=stocks, columns=dates)
    # new_df's rows are ``stocks`` in order, so the enumerate position is the
    # row position.
    for stocknum, stock in enumerate(stocks):
        # A list key always yields a DataFrame, so a ticker with one or with
        # several from/thru rows is handled the same way.
        spans = index.loc[[stock], ['from', 'thru']]
        # iterates through all the relevant dates for each stock (instead of
        # all the dates)
        for date, price in prices.loc[stock].items():
            # makes sure to only account for dates since it was added
            started = date > spans['from']
            if not started.any():
                continue
            # and only go until the date removed if applicable (a missing
            # ``thru`` means there is no removal date)
            not_ended = spans['thru'].isna() | (date < spans['thru'])
            datenum = new_df.columns.get_loc(date)
            if (started & not_ended).any():
                new_df.iat[stocknum, datenum] = price
            # otherwise, instead of leaving zero in its place, carry the
            # previous date's value forward; the first column has no
            # predecessor and is left untouched
            elif datenum > 0:
                new_df.iat[stocknum, datenum] = new_df.iat[stocknum, datenum - 1]

    # then, we take the difference between consecutive dates to account for
    # behavior, not price
    new_df = new_df.T.diff()

    return new_df, stockyear, indexname, dates

def correlations(dates, new_df, window=25):
    """Compute one Pearson correlation matrix per consecutive block of dates.

    Args:
        dates: Ordered date labels; they are chunked with ``grouper``.
        new_df: DataFrame whose index is sliced by those date labels and whose
            columns are correlated with each other.
        window: Number of dates per block (defaults to 25, the value that was
            previously hard-coded).

    Returns:
        A list of correlation DataFrames, one per block, in date order.
    """
    # grouper yields [first, last] labels per block. For a padded final block
    # ``last`` is None, which makes the label slice run to the end of new_df.
    return [
        new_df.loc[start:end].corr(method='pearson')
        for start, end in grouper(dates, window)
    ]

def networks(corr_list, indexname, stockyear, show_networks=False, threshold=0.75):
    """Write one GML network per correlation matrix, optionally plotting it.

    For every matrix in ``corr_list`` an edge is kept between two different
    variables when their correlation is strictly greater than ``threshold``.
    The resulting graph is written to
    ``Graphs/<indexname>_<stockyear>/network_set_<i>.gml``.

    Args:
        corr_list: Sequence of square correlation DataFrames.
        indexname: Used to build the output directory name.
        stockyear: Used to build the output directory name.
        show_networks: When true, also draw each graph with a spring layout
            and show it.
        threshold: Minimum (exclusive) correlation for an edge. The default
            of 0.75 was chosen somewhat arbitrarily.

    Returns:
        None.
    """
    out_dir = "Graphs/{}_{}".format(indexname, stockyear)
    # write_gml does not create missing directories, so make sure it exists.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for i, corr_df in enumerate(corr_list):
        # flatten the matrix into (var1, var2, value) rows
        links = corr_df.stack().reset_index()
        links.columns = ['var1', 'var2', 'value']
        # keep strong correlations only and drop the self-correlation diagonal
        links_filtered = links.loc[
            (links['value'] > threshold) & (links['var1'] != links['var2'])
        ]

        G = nx.from_pandas_edgelist(links_filtered, 'var1', 'var2')
        nx.write_gml(G, "{}/network_set_{}.gml".format(out_dir, i))

        if show_networks:
            plt.figure(i)
            nx.draw_spring(G, with_labels=True, node_color='orange', node_size=400, edge_color='black', linewidths=1, font_size=9)
            plt.show()
