In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
plt.style.use("ggplot")

In [18]:
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier

# Preprocessing

In [3]:
data_raw = pd.read_csv("D:\\monthlywithmomentum.csv", index_col=0)
y_raw = pd.read_csv("y.csv")

In [4]:
y_raw.head()

Unnamed: 0,date,TICKER,Y
0,19860331,NOAX,1
1,19860331,ALD,1
2,19860331,AC,1
3,19860331,BU,0
4,19860331,BG,0


In [5]:
data_raw.head()

Unnamed: 0,PERMNO,date,SHRCD,TICKER,COMNAM,CUSIP,PRC,VOL,RET,CFACPR,...,momentum63,momentum84,momentum105,momentum126,momentum147,momentum168,momentum189,momentum210,momentum231,momentum252
0,10062,19860331,11.0,NOAX,NORTHEAST OHIO AXLE INC,29409K60,-10.25,78.0,-0.012048,0.143,...,,,,,,,,,,
1,10062,19860430,11.0,NOAX,NORTHEAST OHIO AXLE INC,29409K60,-13.875,508.0,0.353659,0.143,...,,,,,,,,,,
2,10062,19860530,11.0,NOAX,N E O A X INC,29409K60,13.75,335.0,-0.009009,0.143,...,-0.231741,,,,,,,,,
3,10062,19860630,11.0,NOAX,N E O A X INC,29409K60,15.25,366.0,0.109091,0.143,...,-0.050081,-0.141722,,,,,,,,
4,10062,19860731,11.0,NOAX,N E O A X INC,29409K60,13.25,547.0,-0.131148,0.143,...,0.553784,0.10326,0.007698,,,,,,,


In [None]:
# drop na


In [6]:
if not isinstance(data_raw['date'][0], str):
    data_raw["date"] = data_raw["date"].apply(lambda x: str(x)[:4]+'-'+str(x)[4:6]+'-'+str(x)[6:])
data_raw["dateDT"] = pd.to_datetime(data_raw["date"])

if not isinstance(y_raw['date'][0], str):
    y_raw["date"] = y_raw["date"].apply(lambda x: str(x)[:4]+'-'+str(x)[4:6]+'-'+str(x)[6:])
y_raw["dateDT"] = pd.to_datetime(y_raw["date"])

In [7]:
data_raw = data_raw.sort_values(by=["dateDT", "TICKER"])
data_raw.index = range(data_raw.shape[0])

In [8]:
y_raw = y_raw.sort_values(by=["dateDT", "TICKER"])
y_raw.index = range(y_raw.shape[0])

In [9]:
data_raw["id"] = data_raw.index

In [10]:
# split train, val, test
train_val_split = int(data_raw.shape[0]*0.4)
val_test_split = int(data_raw.shape[0]*0.6)
data_train = data_raw.loc[data_raw["dateDT"] <= data_raw["dateDT"][train_val_split]]
train_cut = data_train.shape[0]
data_val = data_raw.iloc[train_cut:].loc[data_raw["dateDT"] <= data_raw["dateDT"][val_test_split]]
val_cut = train_cut + data_val.shape[0]
data_test = data_raw.iloc[val_cut:]

In [11]:
y_train = y_raw.iloc[:train_cut]
y_val = y_raw.iloc[train_cut:val_cut]
y_test = y_raw.iloc[val_cut:]

In [16]:
feature_all = ['PRC', 'VOL', 'RET']

# add momentum terms
for i in range(1,22):
    feature_all.append("momentum" + str(i))
for i in range(42,253,21):
    feature_all.append("momentum" + str(i))

feature_all

['PRC',
 'VOL',
 'RET',
 'momentum1',
 'momentum2',
 'momentum3',
 'momentum4',
 'momentum5',
 'momentum6',
 'momentum7',
 'momentum8',
 'momentum9',
 'momentum10',
 'momentum11',
 'momentum12',
 'momentum13',
 'momentum14',
 'momentum15',
 'momentum16',
 'momentum17',
 'momentum18',
 'momentum19',
 'momentum20',
 'momentum21',
 'momentum42',
 'momentum63',
 'momentum84',
 'momentum105',
 'momentum126',
 'momentum147',
 'momentum168',
 'momentum189',
 'momentum210',
 'momentum231',
 'momentum252']

Use the following

`data_train`, `y_train`

`data_val`, `y_val`

`data_test`, `y_test`

# Descriptive Analysis

In [13]:
y_raw.shape

(436627, 4)

In [14]:
data_raw.shape

(436626, 47)

# Basic Models

In [21]:
lasso1 = LassoCV()
lasso1.fit(data_train[feature_all], y_train['Y'])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Unnamed: 0,PRC,VOL,RET,momentum1,momentum2,momentum3,momentum4,momentum5,momentum6,momentum7,...,momentum63,momentum84,momentum105,momentum126,momentum147,momentum168,momentum189,momentum210,momentum231,momentum252
0,62.500,1849.0,0.016260,-1.650000e-03,1.381400e-02,-0.004308,0.016696,0.028234,0.057164,0.083342,...,,,,,,,,,,
1,44.000,1714.0,0.034884,-1.234600e-02,-2.131500e-02,-0.025252,-0.015151,-0.021589,-0.044845,-0.042192,...,,,,,,,,,,
2,57.000,2567.0,0.104116,7.634000e-03,7.634000e-03,0.022449,0.026008,-0.007260,-0.007260,0.005157,...,,,,,,,,,,
3,6.875,931.0,0.123636,4.761900e-02,1.726190e-01,0.201190,0.159068,0.191326,0.191326,0.209508,...,,,,,,,,,,
4,4.750,681.0,0.085714,1.387779e-17,1.387779e-17,0.062500,0.076486,0.076486,0.058065,0.012553,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174881,45.250,90201.0,-0.044855,-2.236400e-02,-3.379300e-02,-0.048499,-0.056974,-0.061266,-0.072377,-0.063335,...,-0.024197,0.068339,0.118700,-0.012585,-0.049900,0.078218,0.105064,0.062211,0.228299,0.172478
174882,18.625,42098.0,0.079710,-5.551115e-17,1.050800e-02,-0.014582,-0.014582,-0.299392,-0.304797,-0.283005,...,-0.279544,-0.367380,-0.190722,-0.154563,-0.216454,-0.194816,-0.003092,-0.194424,-0.523713,-0.464841
174883,25.250,1904.0,-0.015000,-5.286000e-03,-1.186600e-02,-0.011866,-0.013000,0.002385,-0.002845,-0.002845,...,-0.133538,-0.109338,-0.070551,0.012269,0.003178,0.011662,0.039194,-0.074502,-0.147189,-0.258738
174884,20.875,5398.0,-0.132165,3.322000e-03,-4.532700e-02,-0.057827,-0.082827,-0.082827,-0.082827,-0.019669,...,-0.001369,0.369823,0.350176,0.329556,0.316725,0.369604,0.567452,0.660557,0.711171,0.579261
