In [23]:
import numpy as np
import pandas as pd

from sklearn import preprocessing 
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn import svm
from sklearn.kernel_ridge import KernelRidge


# File name of the cleaned CSV: write_csv saves to this name by default and
# process_data loads from it.
FAMA_49CRSP = 'FAMA_49CRSP.csv'

In [4]:
# https://keras.io/regularizers/
# https://keras.io/callbacks/
# https://en.wikipedia.org/wiki/Kriging
# https://gist.github.com/stared/dfb4dfaf6d9a8501cd1cc8b8cb806d2e
# https://www.tensorflow.org/guide/summaries_and_tensorboard
# https://stackoverflow.com/questions/50978117/how-to-plot-loss-curve-in-tensorflow-without-using-tensorboard
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/Callback
# https://www.tensorflow.org/api_docs/python/tf/keras/utils/normalize
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
# https://datascience.stackexchange.com/questions/989/svm-using-scikit-learn-runs-endlessly-and-never-completes-execution

In [5]:
def read_csv(filename, n_ind):
    """
    Load the CSV of financial ratios at `filename` into a pandas dataframe.
    Rows that contain any missing value are dropped, and the remaining rows
    are ordered by `public_date`, ascending.
    `n_ind` is accepted for the caller's convenience but is not used here.
    """
    return (
        pd.read_csv(filename)
        .dropna()
        .sort_values(by='public_date', ascending=True)
    )

In [6]:
def pct_format(col):
    """
    Convert a column of percent strings (e.g. '1.5%') into floats (1.5).

    The last character of every non-missing entry is stripped before the
    float conversion; missing entries (None or any NaN) become 0.
    When `col` is a pandas Series the result keeps its index, so assigning it
    back to the dataframe it came from stays row-aligned even after sorting.
    For any other iterable the result has a default integer index.
    """
    # pd.isna recognises every NaN/None; an identity check against np.nan
    # misses NaN objects that are not the np.nan singleton.
    values = [0 if pd.isna(x) else float(x[:-1]) for x in col]
    index = col.index if isinstance(col, pd.Series) else None
    return pd.Series(values, index=index)

In [7]:
def write_csv(filename, output_filename='FAMA_49CRSP.csv'):
    """
    Clean the raw CSV at `filename` and save the result to `output_filename`.

    The rows are sorted by `public_date` in ascending order and the
    percent-string column `divyield_Median` is converted to floats with
    pct_format. Nothing is returned; the cleaned frame is only written to disk.
    """
    df = pd.read_csv(filename)
    df = df.sort_values(by='public_date', ascending=True)
    # Assign by position: sorting permutes the frame's index, so a Series
    # carrying a fresh 0..n-1 index would be aligned to the wrong rows.
    df['divyield_Median'] = pct_format(df['divyield_Median']).values

    # NOTE(review): the index is written as an extra unnamed leading column,
    # which will show up as a column when the file is read back - confirm
    # that this is intended.
    df.to_csv(output_filename)

In [8]:
# write_csv('ee6d2f60cdafb550.csv')

In [9]:
def encode(df, col):
    """
    Replace the values of df[col] with the integer labels produced by a
    freshly fitted LabelEncoder.
    The dataframe is modified in place and is also returned.
    """
    df[col] = preprocessing.LabelEncoder().fit_transform(df[col])
    return df

In [10]:
def calc_rolling_avg(col, n_years):
    """
    Rolling mean of `col` over a window of `n_years` consecutive entries.
    The first n_years - 1 entries of the result are NaN (incomplete window).
    """
    window = col.rolling(n_years)
    return window.mean()

In [11]:
def calc_year_on_year(col, p):
    """
    Raw change of `col` relative to the entry `p` rows earlier
    (a difference, not a ratio), intended as a prediction feature.
    The first `p` entries of the result are NaN.
    """
    lagged_change = col.diff(p)
    return lagged_change

In [12]:
def chop(col, index):
    """
    Return the rows of `col` (Series or dataframe) from position `index`
    onward; the original index labels are kept.
    """
    tail = slice(index, None)
    return col.iloc[tail]

In [13]:
def discretize(series, n_bins):
    """
    Bucket the values of a pandas Series into `n_bins` equal-width bins.

    The values -1 and 1 are appended before fitting so that the bin range
    covers at least [-1, 1]; the two padding rows are removed from the result.
    Returns an array of shape (len(series), 1) holding the ordinal bin index
    of each entry.
    """
    endpoints = pd.Series([-1, 1])
    # Series.append was removed in pandas 2.0; pd.concat gives the same result.
    padded = pd.concat([series, endpoints])

    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    column = np.asarray(padded).reshape((-1, 1))

    labels_binned = discretizer.fit_transform(column)

    # drop the two padding rows that were appended above
    return labels_binned[:-2]

In [14]:
def process_data(n_periods=3, n_bins=8):
    """
    Load the cleaned CSV, encode the text column and build targets/features.

    `FFI49_desc` is label-encoded and `indret_ew` / `indret_vw` are removed
    from the feature frame. From `indret_ew` a rolling mean and a lagged
    difference are derived, both over `n_periods` rows. The first `n_periods`
    rows of every output are dropped, because the lagged difference is
    undefined there. `n_bins` is the number of equal-width bins used for the
    discretised target.

    Returns (df, ew_indret, ew_binned, year_on_year, rolling_avg):
    the feature frame (X), the equal-weighted return (Y), its binned version,
    the lagged difference and the rolling mean.

    NOTE(review): the rolling mean and difference run over consecutive rows of
    the date-sorted frame; if the file holds several industries per date the
    windows mix industries - confirm this is intended.
    """
    df = read_csv(FAMA_49CRSP, n_ind=49)
    df = encode(df, 'FFI49_desc')

    ew_indret = df.indret_ew
    df = df.drop(labels=['indret_ew', 'indret_vw'], axis=1)

    rolling_avg = calc_rolling_avg(ew_indret, n_periods)
    year_on_year = calc_year_on_year(ew_indret, n_periods)

    # Trim every output by the same amount so the rows stay aligned.
    df, ew_indret, year_on_year, rolling_avg = (
        chop(part, n_periods)
        for part in (df, ew_indret, year_on_year, rolling_avg)
    )

    ew_binned = discretize(ew_indret, n_bins)

    return df, ew_indret, ew_binned, year_on_year, rolling_avg

In [15]:
def split_data(x, y):
    """
    Order-preserving (unshuffled) split into 0.64 train, 0.16 dev, 0.2 test.
    Returns x_train, x_test, x_dev, y_dev, y_train, y_test - in that order.
    """
    # last 20% of the rows form the test set
    x_rest, x_test, y_rest, y_test = train_test_split(
        x, y, test_size=0.2, shuffle=False)
    # last 20% of what remains forms the dev set
    x_train, x_dev, y_train, y_dev = train_test_split(
        x_rest, y_rest, test_size=0.2, shuffle=False)

    return x_train, x_test, x_dev, y_dev, y_train, y_test

In [21]:
def dev(model, x_dev, y_dev):
    """
    Predict on the dev set with `model` and print the R2 score and the
    mean squared error of the predictions. Returns nothing.
    """
    # TODO: also try RMSE?
    predictions = model.predict(x_dev)
    scores = (
        metrics.r2_score(y_dev, predictions),
        metrics.mean_squared_error(y_dev, predictions),
    )
    print('R2: {}\tMSE: {}'.format(*scores))

In [37]:
# Ordinary Least Squares: MSE 0.0064 
# ElasticNet: MSE 0.0047
# Ridge: MSE 0.0065
# Kernel Ridge Regression: MSE 0.0050
# Lasso: MSE 0.00456

def train_Ridge(alpha, x_train, y_train):
    """
    Fit a Ridge regression with regularisation strength `alpha` on the
    training data and return the fitted model.
    """
    ridge = linear_model.Ridge(alpha=alpha)
    # fit() returns the estimator itself
    return ridge.fit(x_train, y_train)

def train_Lasso(alpha, x_train, y_train):
    """
    Fit a Lasso regression with regularisation strength `alpha` on the
    training data and return the fitted model.
    """
    lasso = linear_model.Lasso(alpha=alpha)
    # fit() returns the estimator itself
    return lasso.fit(x_train, y_train)
    
def train_ElasticNet(alpha, x_train, y_train):
    """
    Fit an ElasticNet regression with regularisation strength `alpha` on the
    training data and return the fitted model.
    """
    elastic = linear_model.ElasticNet(alpha=alpha)
    # fit() returns the estimator itself
    return elastic.fit(x_train, y_train)

def train_SVR(x_train, y_train, x_dev=None, y_dev=None):
    """
    Fit a polynomial-kernel SVR on standardised training data
    (the fit does not converge on unscaled data).

    x_dev / y_dev are optional. When x_dev is given it is scaled with the
    mean and standard deviation learned from x_train, so the returned dev
    features live in the same space the model was trained in.

    Returns (reg, x_dev_scaled, y_dev); x_dev_scaled is None when no dev
    features were passed, and y_dev is returned unchanged.
    """
    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)
    if x_dev is not None:
        # reuse the training statistics instead of re-fitting on the dev set
        x_dev = scaler.transform(x_dev)

    reg = svm.SVR(kernel='poly', gamma='auto')
    reg.fit(x_train, y_train)

    return reg, x_dev, y_dev

def train_KRR(alpha, x_train, y_train):
    """
    Fit a kernel ridge regression with regularisation strength `alpha`.
    Expects already scaled training data (to avoid a singular matrix problem?).
    Returns the fitted model.
    """
    model = KernelRidge(alpha=alpha)
    # fit() returns the estimator itself
    return model.fit(x_train, y_train)

In [25]:
# Build the feature frame and targets, then split features and the raw
# equal-weighted return into train / dev / test sets.
# ew_binned, year_on_year and rolling_avg are computed here but are not used
# by the model cells shown below.
df, ew_indret, ew_binned, year_on_year, rolling_avg = process_data()
x_train, x_test, x_dev, y_dev, y_train, y_test = split_data(df, ew_indret) 

In [28]:
# Kernel ridge regression (alpha=0.3) on standardised features, scored on dev.
# NOTE(review): train and dev are each scaled with their own mean/std here;
# consider scaling dev with the statistics of the training set instead.
KRR = train_KRR(0.3, preprocessing.scale(x_train), y_train)
dev(KRR, preprocessing.scale(x_dev), y_dev)

  """Entry point for launching an IPython kernel.


R2: -0.1043792112373716	MSE: 0.005006491459622191


  


In [36]:
# Lasso regression (alpha=0.1) on the unscaled features, scored on dev.
lasso = train_Lasso(0.1, x_train, y_train)
dev(lasso, x_dev, y_dev)

R2: -0.0061314052506031835	MSE: 0.004561103863953613
