In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler, StandardScaler

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
def add_shifts(df_, col_id, col_calc_pl, new_col, shift):
    for id_ in df_[col_id].unique():
        df_by_id = df_[df_[col_id] == id_]
        df_.loc[df_[col_id] == id_, new_col] = df_by_id[col_calc_pl] - df_by_id[col_calc_pl].shift(shift)
        
    return df_

In [3]:
def get_non_42_cols(df_):
    return [elem for elem in df_.columns if (re.search(r'\d+$', elem) is not None) and (int(elem[-2:].strip()) != 42)]

In [4]:
def interpolate_nan_values(df_):
    list_df = []
    for tick in df_['ticker'].unique():
        df_by_ticker = df_[df_['ticker'] == tick]
        float_columns = df_by_ticker.select_dtypes(float).columns
        df_by_ticker[float_columns] = df_by_ticker[float_columns].interpolate(method='pad', axis=1)
        list_df.append(df_by_ticker)
    return pd.concat(list_df)

In [5]:
df_categorical = pd.read_csv('../data/db_bsm_categorical.csv')

In [6]:
df_financial = pd.read_csv('../data/db_bsm_financial.csv')

In [7]:
list_cols_to_drop = get_non_42_cols(df_financial)

In [8]:
df_financial = df_financial.drop(list_cols_to_drop, 1)

In [9]:
df_financial = df_financial.replace(0, np.nan)

In [10]:
df_financial_no_empty = interpolate_nan_values(df_financial)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [11]:
df_financial_no_empty = add_shifts(df_financial_no_empty, 'ticker', 'close', 'daily_pl', 1)

In [12]:
df_categorical = df_categorical.dropna()

In [13]:
df_categorical = df_categorical.drop_duplicates(subset=['ticker'], keep='first')

In [14]:
df_fin_cat = pd.merge(left=df_financial_no_empty, right=df_categorical, how='inner', on='ticker')

In [15]:
df_fin_cat = df_fin_cat.dropna()

In [16]:
target = df_fin_cat['sector_gics'].values
features = df_fin_cat.select_dtypes(float).values

In [17]:
features_rs = RobustScaler().fit_transform(features)
features_ss = StandardScaler().fit_transform(features)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=32)
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(features_rs, target, test_size=0.1, random_state=32)
X_train_ss, X_test_ss, y_train_ss, y_test_ss = train_test_split(features_ss, target, test_size=0.1, random_state=32)

In [19]:
knn = GridSearchCV(KNeighborsClassifier(),
                   param_grid={'n_neighbors': range(1, 11)}, 
                   scoring='recall_macro', 
                   cv=5)
knn_rs = GridSearchCV(KNeighborsClassifier(),
                   param_grid={'n_neighbors': range(1, 11)}, 
                   scoring='recall_macro', 
                   cv=5)
knn_ss = GridSearchCV(KNeighborsClassifier(),
                   param_grid={'n_neighbors': range(1, 11)}, 
                   scoring='recall_macro', 
                   cv=5)

In [None]:
knn.fit(X_train, y_train)
print('knn done')

In [None]:
knn_rs.fit(X_train_rs, y_train_rs)
print('knn_rs done')
knn_ss.fit(X_train_ss, y_train_ss)
print('knn_ss done')

knn done


In [None]:
knn = KNeighborsClassifier(**knn.best_params_)
knn.fit(X_train, y_train)

knn_rs = KNeighborsClassifier(**knn_rs.best_params_)
knn_rs.fit(X_train_rs, y_train_rs)

knn_ss = KNeighborsClassifier(**knn_ss.best_params_)
knn_ss.fit(X_train_ss, y_train_ss)

In [None]:
random_forest = GridSearchCV(RandomForestClassifier(criterion='entropy'),
                   param_grid={'max_depth': range(1, 21)}, 
                   scoring='recall_macro', 
                   cv=5)
random_forest_rs = GridSearchCV(RandomForestClassifier(criterion='entropy'),
                   param_grid={'max_depth': range(1, 21)}, 
                   scoring='recall_macro', 
                   cv=5)
random_forest_ss = GridSearchCV(RandomForestClassifier(criterion='entropy'),
                   param_grid={'max_depth': range(1, 21)}, 
                   scoring='recall_macro', 
                   cv=5)

In [None]:
random_forest.fit(X_train, y_train)
print('random_forest done')
random_forest_rs.fit(X_train_rs, y_train_rs)
print('random_forest_rs done')
random_forest_ss.fit(X_train_ss, y_train_ss)
print('random_forest_ss done')

In [None]:
random_forest = RandomForestClassifier(**random_forest.best_params_)
random_forest.fit(X_train, y_train)

random_forest_rs = RandomForestClassifier(**random_forest_rs.best_params_)
random_forest_rs.fit(X_train_rs, y_train_rs)

random_forest_ss = RandomForestClassifier(**random_forest_ss.best_params_)
random_forest_ss.fit(X_train_ss, y_train_ss)

In [None]:
dec_tree = GridSearchCV(DecisionTreeClassifier(criterion='entropy'),
                   param_grid={'max_depth': range(1, 21)}, 
                   scoring='recall_macro', 
                   cv=5)
dec_tree_rs = GridSearchCV(DecisionTreeClassifier(criterion='entropy'),
                   param_grid={'max_depth': range(1, 21)}, 
                   scoring='recall_macro', 
                   cv=5)
dec_tree_ss = GridSearchCV(DecisionTreeClassifier(criterion='entropy'),
                   param_grid={'max_depth': range(1, 21)}, 
                   scoring='recall_macro', 
                   cv=5)

In [None]:
dec_tree.fit(X_train, y_train)
print('dec_tree done')
dec_tree_rs.fit(X_train_rs, y_train_rs)
print('dec_tree_rs done')
dec_tree_ss.fit(X_train_ss, y_train_ss)
print('dec_tree_ss done')

In [None]:
dec_tree = DecisionTreeClassifier(**dec_tree.best_params_)
dec_tree.fit(X_train, y_train)

dec_tree_rs = DecisionTreeClassifier(**dec_tree_rs.best_params_)
dec_tree_rs.fit(X_train_rs, y_train_rs)

dec_tree_ss = DecisionTreeClassifier(**dec_tree_ss.best_params_)
dec_tree_ss.fit(X_train_ss, y_train_ss)