In [9]:
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn import preprocessing 
from sklearn.model_selection import train_test_split

# Path of the CSV file loaded by process_data() and split_industries().
# Relative, so it is resolved against the notebook's working directory.
FAMA_49CRSP = 'FAMA_49CRSP.csv'

In [10]:
def encode(df, col):
    """
    Replace the values of df[col] with integer labels from a LabelEncoder.

    The column is overwritten in place, so the caller's DataFrame is
    modified; the same DataFrame object is also returned for convenience.
    The fitted encoder is not kept, so the original values cannot be
    recovered from the labels afterwards.
    """
    df[col] = preprocessing.LabelEncoder().fit_transform(df[col])
    return df

def process_data():
    """
    Load the FAMA_49CRSP CSV and separate features from the target.

    'public_date' is read as a string and 'FFI49_desc' is label-encoded
    via encode(). Both return columns ('indret_ew', 'indret_vw') are
    removed from the feature frame.

    Returns (features, target), where target is the 'indret_ew' column.
    """
    raw = pd.read_csv(FAMA_49CRSP, header=0, dtype={'public_date': str})
    encoded = encode(raw, 'FFI49_desc')

    # Grab the target before it is dropped from the feature matrix.
    target = encoded['indret_ew']
    features = encoded.drop(columns=['indret_ew', 'indret_vw'])

    return features, target

def split_data(x, y):
    """
    Splits data into 0.64 train / 0.16 dev / 0.2 test without shuffling.

    Two successive 80/20 splits are applied: the first carves off the
    test set, the second carves the dev set out of what remains.

    NOTE: the return order is
        x_train, x_test, x_dev, y_dev, y_train, y_test
    (not the usual train/dev/test grouping) -- unpack accordingly.
    """
    split_kwargs = dict(test_size=0.2, shuffle=False)

    x_rest, x_test, y_rest, y_test = train_test_split(x, y, **split_kwargs)
    x_train, x_dev, y_train, y_dev = train_test_split(x_rest, y_rest, **split_kwargs)

    return x_train, x_test, x_dev, y_dev, y_train, y_test

In [17]:
def split_industries(path=None):
    """
    Split the industry CSV into one DataFrame per 'FFI49_desc' value.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, the module-level FAMA_49CRSP
        path is used.

    Returns
    -------
    dict
        Maps each distinct non-null 'FFI49_desc' value to the DataFrame
        holding exactly the rows with that value. Rows keep their
        original order and index labels.
    """
    if path is None:
        # Resolved at call time so the default follows the config constant.
        path = FAMA_49CRSP

    df = pd.read_csv(path)

    # One pass over the frame instead of one boolean scan per industry.
    return {ind: df_ind for ind, df_ind in df.groupby('FFI49_desc')}

In [18]:
# Run the per-industry split against the default CSV.
split_industries()

{'HSHLD', 'RTAIL', 'AUTOS', 'MINES', 'TELCM', 'TRANS', 'BEER', 'CHEMS', 'FABPR', 'BANKS', 'AGRIC', 'INSUR', 'COAL', 'OIL', 'UTIL', 'MEDEQ', 'TOYS', 'OTHER', 'CNSTR', 'ELCEQ', 'DRUGS', 'SOFTW', 'BUSSV', 'CHIPS', 'MEALS', 'GOLD', 'MACH', 'RLEST', 'FUN', 'HLTH', 'FIN', 'STEEL', 'PAPER', 'HARDW', 'SODA', 'SHIPS', 'CLTHS', 'FOOD', 'PERSV', 'RUBBR', 'SMOKE', 'TXTLS', 'WHLSL', 'GUNS', 'BOXES', 'LABEQ', 'BOOKS', 'BLDMT', 'AERO'}


In [13]:
def graph(x, y_real, y_pred=None, industry=0):
    """
    Plot the series of a single industry against its dates.

    x        : DataFrame with 'FFI49_desc' and 'public_date' columns;
               'public_date' values are sliced as 'YYYYMMDD' strings.
    y_real   : values aligned with the rows of x; drawn in green.
    y_pred   : optional values aligned with the rows of x; drawn in red.
    industry : value matched against x['FFI49_desc'] to select rows
               (presumably the integer label produced by encode()).

    NOTE(review): this uses `plt`, but no matplotlib import appears in the
    visible cells -- `import matplotlib.pyplot as plt` must have been run
    for this function to work; confirm and add it to the import cell.
    """
    in_industry = x['FFI49_desc'] == industry

    real_values = y_real[in_industry]
    pred_values = None if y_pred is None else y_pred[in_industry]

    # Turn each 'YYYYMMDD' string into a datetime for the time axis.
    dates = [
        datetime(year=int(stamp[0:4]), month=int(stamp[4:6]), day=int(stamp[6:8]))
        for stamp in x[in_industry]['public_date']
    ]

    plt.plot(dates, real_values, color='tab:green')
    if pred_values is not None:
        plt.plot(dates, pred_values, color='tab:red')
    plt.show()

In [11]:
# Load the feature frame and the 'indret_ew' target, then split them.
# split_data() returns (x_train, x_test, x_dev, y_dev, y_train, y_test);
# the unpacking below must keep that exact order.
df, ew_indret = process_data()
x_train, x_test, x_dev, y_dev, y_train, y_test = split_data(df, ew_indret)