In [16]:
import numpy as np
import pandas as pd
import sklearn

from sklearn import preprocessing 
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

FAMA_49CRSP = 'FAMA_49CRSP.csv'

In [2]:
def read_csv(filename, n_ind):
    """
    Read in CSV file of financial ratios and returns a pandas dataframe.
    df is sorted by date in ascending order. 
    """
    df = pd.read_csv(filename)
    df = df.sort_values(by='public_date', ascending=True)
    return df

In [3]:
def pct_format(col): 
    temp = [float(x[:-1]) if x is not np.nan else 0 for x in col]
    return pd.Series(temp)

In [4]:
def write_csv(filename, n_ind):
    """
    Read in CSV file of financial ratios and returns a pandas dataframe.
    df is sorted by date in ascending order. 
    """
    df = pd.read_csv(filename)
    df = df.sort_values(by='public_date', ascending=True)
    df['divyield_Median'] = pct_format(df['divyield_Median'])
    
    df = df.drop(labels=['pe_op_basic_Median', 'pe_op_dil_Median', 'PEG_1yrforward_Median', 'PEG_ltgforward_Median'], axis=1)
    df = df.fillna(df.mean())
    
    df = df[:-n_ind]
    
    df.to_csv('FAMA_49CRSP.csv')

In [5]:
write_csv('ee6d2f60cdafb550.csv', 49)

In [6]:
def encode(df, col):
    """
    Takes in pandas dataframe and encodes df.col as numbers. 
    """
    encoder = preprocessing.LabelEncoder()
    industry_desc_encode = encoder.fit_transform(df[col])
    df[col] = industry_desc_encode
    return df 

In [7]:
def calc_rolling_avg(col, n_years):
    """
    3 years rolling percent change, averaged ie. (y1-y2 + (y3-y2)change)/2 
    """
    return col.rolling(n_years).mean()

In [8]:
def calc_year_on_year(col, p):
    """
    3year on year change as a prediction feature, raw pct change
    """
    return col.diff(periods=p)

In [9]:
def chop(col, index):
    return col.iloc[index:]

In [10]:
def discretize(series, n_bins):
    #add -1 and 1 so the bins will take on bins to be equal and set to max -1 and 1
    endpoints = pd.Series([-1,1])
    series = series.append(endpoints)

    #make a new output (bucket by percentage?)
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal',strategy = 'uniform')
    series = np.asarray(series).reshape((-1,1))

    labels_binned = discretizer.fit_transform(series)

    return labels_binned[:-2]

In [31]:
def process_data():
    """
    Reads in a CSV file and encodes text columns. 
    Removes indret_ew, indret_vw from df. 
    Returns df (X matrix) and ew_indret (Y values )
    """
    df = read_csv(FAMA_49CRSP, n_ind=49)
    df = encode(df, 'FFI49_desc')
    
    ew_indret, vw_indret = df.indret_ew, df.indret_vw 
    
    df = df.drop(labels=['indret_ew', 'indret_vw'], axis=1)
    
    rolling_avg = calc_rolling_avg(ew_indret, 3)
    year_on_year = calc_year_on_year(ew_indret, 3) 
    
    df, ew_indret, year_on_year, rolling_avg = chop(df, 3), chop(ew_indret, 3), chop(year_on_year, 3), chop(rolling_avg, 3) 
    
    ew_binned = discretize(ew_indret, 8)
    
    return df, ew_indret, ew_binned, year_on_year, rolling_avg

In [32]:
def split_data(x, y): 
    """
    Splits data into 0.64 Train, 0.16 dev, 0.2 Test
    """
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, shuffle = False)
    x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size = 0.2, shuffle = False)
    
    return x_train, x_test, x_dev, y_dev, y_train, y_test

In [42]:
def train_Ridge(alpha):
    print(alpha)
    df, ew_indret, ew_binned, year_on_year, rolling_avg = process_data()

    x_train, x_test, x_dev, y_dev, y_train, y_test = split_data(df, ew_indret) 

    reg = linear_model.Ridge(alpha=alpha)
    reg.fit(x_train, y_train) 

    return reg

In [59]:
def dev(reg):
    y_dev_predict = reg.predict(x_dev)
    mse = metrics.mean_squared_error(y_dev, y_dev_predict)
    mse_scaled = mse / y_dev.mean()
    r2 = metrics.r2_score(y_dev, y_dev_predict)
    print('R2: {}\tMSE scaled: {}'.format(r2, mse_scaled))

In [60]:
for i in np.arange(0, 1.1, 0.1):
    reg = train(i)
    dev(reg)

0.0
R2: -0.3961511277792107	MSE scaled: 0.5867924289456709
0.1
R2: -0.3938302214426084	MSE scaled: 0.5858169684532408
0.2
R2: -0.39182582154631795	MSE scaled: 0.5849745333756046
0.30000000000000004
R2: -0.39006710580351855	MSE scaled: 0.5842353576073023
0.4
R2: -0.38850485744428753	MSE scaled: 0.5835787556885775
0.5
R2: -0.38710342963649413	MSE scaled: 0.5829897455084009
0.6000000000000001
R2: -0.38583612081469676	MSE scaled: 0.5824571045879658
0.7000000000000001
R2: -0.3846823684439151	MSE scaled: 0.581972190639481
0.8
R2: -0.3836259686894523	MSE scaled: 0.5815281933060085
0.9
R2: -0.38265390320410875	MSE scaled: 0.5811196410684392
1.0
R2: -0.3817555402329953	MSE scaled: 0.5807420654755068


In [61]:
def train_ElasticNet(alpha):
    print(alpha)
    df, ew_indret, ew_binned, year_on_year, rolling_avg = process_data()

    x_train, x_test, x_dev, y_dev, y_train, y_test = split_data(df, ew_indret) 

    reg = linear_model.ElasticNet(alpha=alpha)
    reg.fit(x_train, y_train) 

    return reg

In [62]:
for alpha in np.arange(0, 100, 5):
    reg = train_ElasticNet(alpha)
    dev(reg)

0


  
  positive)


R2: -0.3780714032047665	MSE scaled: 0.5791936487802442
5
R2: -0.00297161113390243	MSE scaled: 0.42154186330599547
10
R2: -0.002875449947583819	MSE scaled: 0.4215014474405727
15
R2: -0.002780901815227832	MSE scaled: 0.4214617095303039
20
R2: -0.002687966736832026	MSE scaled: 0.42142264957518794
25
R2: -0.002596644712392626	MSE scaled: 0.4213842675752233
30
R2: -0.002506935741906524	MSE scaled: 0.4213465635304086
35
R2: -0.002418839825370611	MSE scaled: 0.42130953744074284
40
R2: -0.0023323569627820007	MSE scaled: 0.4212731893062243
45
R2: -0.002247487154136918	MSE scaled: 0.4212375191268518
50
R2: -0.0021642303994324763	MSE scaled: 0.42120252690262405
55
R2: -0.002082586698665345	MSE scaled: 0.4211682126335396
60
R2: -0.00200255605183286	MSE scaled: 0.4211345763195972
65
R2: -0.001924138458931246	MSE scaled: 0.42110161796079554
70
R2: -0.0018473339199576166	MSE scaled: 0.4210693375571331
75
R2: -0.0017721424349088633	MSE scaled: 0.42103773510860887
80
R2: -0.0016985640037814331	MSE scal