In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pickle
%matplotlib inline

In [2]:
df = pd.read_pickle("final_df.pkl")

In [3]:
df.drop([
 'date',
 'calendardate',
 'name',
 'rating_cnt_strong_buys',
 'rating_cnt_mod_buys',
 'rating_cnt_holds',
 'rating_cnt_mod_sells',
 'rating_cnt_strong_sells',
 'rating_cnt_with',
 'rating_cnt_without',
 'rating_change',
 'quart',
 'year'
 ], axis=1, inplace=True)

In [4]:
one_hot_features = ['exchange', 'sector', 'industry']

In [5]:
for feature in one_hot_features:
    cat_X = df.loc[:, [feature]]
    ohe = OneHotEncoder(drop='first', sparse=False)
    ohe.fit(cat_X)
    ohe_X = ohe.transform(cat_X)
    columns = ohe.get_feature_names([feature])
    ohe_X_df = pd.DataFrame(ohe_X, columns=columns, index=cat_X.index)
    df = df.join(ohe_X_df)

In [6]:
tickers = list(df.ticker.unique())
test_tickers = random.sample(tickers, int(df.ticker.nunique() * .2))

In [7]:
df_train = df[~df.ticker.isin(test_tickers)]

In [8]:
df_test = df[df.ticker.isin(test_tickers)]

In [9]:
print(len(df_train))
print(len(df_test))

862311
219282


In [10]:
df_train = df_train.sample(frac=1).reset_index(drop=True)

In [11]:
df_train.drop([
 'ticker',
 'exchange',
 'sector',
 'industry'
 ], axis=1, inplace=True)

In [12]:
df_test.drop([
 'ticker',
 'exchange',
 'sector',
 'industry'
 ], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [13]:
X_train = df_train.copy()
del X_train["rating_mean_recom"]
y_train = df_train["rating_mean_recom"]

In [14]:
X_test = df_test.copy()
del X_test["rating_mean_recom"]
y_test = df_test["rating_mean_recom"]

In [15]:
X_train.shape

(862311, 324)

In [16]:
ss = StandardScaler()

ss.fit(X_train)

x_train_scaled = ss.transform(X_train)

x_test_scaled = ss.transform(X_test)

In [17]:
lasso_model = LassoCV(cv=5, max_iter = 1000000)
X_train_model = lasso_model.fit(X_train, y_train)
X_train_model.score(X_train, y_train)

0.01694514600616459

In [18]:
def mae(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true))

In [19]:
train_set_pred = lasso_model.predict(X_train)

In [20]:
mae(y_train, train_set_pred)

0.7873973815441566

In [21]:
test_set_pred = lasso_model.predict(X_test)

In [22]:
mae(y_test, test_set_pred)

0.8276126192882866

In [23]:
coef = list(zip(X_train.columns, lasso_model.coef_))
coef

[('close', 0.0),
 ('volume', 0.0),
 ('accoci', 4.0271616022415335e-12),
 ('assets', -7.591576104721973e-14),
 ('assetsavg', 1.7897795041977845e-12),
 ('assetsc', -2.3602574875508943e-12),
 ('assetsnc', -1.926631954353851e-11),
 ('assetturnover', 0.0),
 ('bvps', 0.0),
 ('capex', -0.0),
 ('cashneq', -0.0),
 ('cor', -1.3534828818582494e-12),
 ('consolinc', -1.9634787323198573e-11),
 ('currentratio', -0.0),
 ('de', 0.0),
 ('debt', -1.1793758139700494e-12),
 ('debtc', 2.3700527349112327e-11),
 ('debtnc', -5.397963587650339e-12),
 ('deferredrev', -1.3124721799629117e-11),
 ('depamor', -0.0),
 ('deposits', -3.9115110689665247e-13),
 ('dps', 0.0),
 ('ebit', 0.0),
 ('ebitda', 0.0),
 ('ebitdamargin', 0.0),
 ('ebt', -0.0),
 ('eps', -0.0),
 ('epsdil', -0.0),
 ('equity', -4.574619038943343e-12),
 ('equityavg', 1.47364220704182e-11),
 ('fcf', 0.0),
 ('fcfps', 0.0),
 ('gp', 0.0),
 ('grossmargin', 0.0),
 ('intangibles', -0.0),
 ('intexp', 0.0),
 ('invcap', -8.101722322590143e-13),
 ('invcapavg', -1.83

In [24]:
sorted(coef, key=lambda x: x[1])

[('ncfcommon', -2.2429129822301557e-11),
 ('consolinc', -1.9634787323198573e-11),
 ('assetsnc', -1.926631954353851e-11),
 ('taxassets', -1.800200174374934e-11),
 ('ncfdiv', -1.6072864689034188e-11),
 ('deferredrev', -1.3124721799629117e-11),
 ('ncfinv', -1.0928592374590187e-11),
 ('workingcapital', -7.0506533012967994e-12),
 ('ncff', -6.66164393814874e-12),
 ('debtnc', -5.397963587650339e-12),
 ('equity', -4.574619038943343e-12),
 ('payables', -3.72932445719767e-12),
 ('ncfdebt', -2.5449048878345677e-12),
 ('assetsc', -2.3602574875508943e-12),
 ('invcapavg', -1.831185859122967e-12),
 ('ev', -1.3845367110439652e-12),
 ('cor', -1.3534828818582494e-12),
 ('tangibles', -1.3383522692036386e-12),
 ('debt', -1.1793758139700494e-12),
 ('invcap', -8.101722322590143e-13),
 ('dcf', -6.815839673917418e-13),
 ('investmentsnc', -4.62326672054739e-13),
 ('deposits', -3.9115110689665247e-13),
 ('assets', -7.591576104721973e-14),
 ('close', 0.0),
 ('volume', 0.0),
 ('assetturnover', 0.0),
 ('bvps', 0.0

In [25]:
filename = 'lasso_model.pkl'
pickle.dump(lasso_model, open(filename, 'wb'))