Looking for collinearity and/or multicollinearity 

In [None]:
# Inject a CSS rule so the notebook's `.container` element spans the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#non anaconda native modules, these will need the conda env (listed above) or pip install
import pydash as __
import seaborn as sns

#custom modules
from utils import \
ModelTester, \
to_important_features, \
create_heatmap, \
create_learning_curve, \
plot_learning_curve

# pre-processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import \
StandardScaler, \
LabelEncoder

#classifiers:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# ta-lib
import talib

import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from functools import reduce

In [None]:
# Column names, in file order, that are passed as `names=` when the acquisition file is read.
ACQUISITION_RAW_COLUMN_NAMES = [
    'loan_id',
    'origin_channel',
    'seller_name',
    'original_interest_rate',
    'original_upb',
    'original_loan_term',
    'origination_date_string',
    'first_payment_date_string',
    'original_loan_to_value',
    'original_combined_loan_to_value',
    'number_of_borrowers',
    'original_debt_to_income_ratio',
    'borrower_credit_score_at_origination',
    'first_time_homebuyer_indicator',
    'loan_purpose',
    'property_type',
    'number_of_units',
    'occupancy_type',
    'property_state',
    'zip_code_short',
    'primary_mortgage_insurance_percent',
    'product_type',
    'co_borrower_credit_score_at_origination',
    'mortgage_insurance_type',
    'relocation_mortgage_indicator',
    'sdq',
]

In [None]:
# Load the pipe-delimited acquisition file; column names are supplied from
# ACQUISITION_RAW_COLUMN_NAMES and no column is used as the index.
df = pd.read_csv(
    'Acquisition_2016Q1.txt',
    sep="|",
    names=ACQUISITION_RAW_COLUMN_NAMES,
    index_col=False,
)

In [None]:
# df.head(10)

In [None]:
# Display summary statistics for the freshly loaded frame.
df.describe()

In [None]:
# Display the dtype pandas assigned to each column on load.
df.dtypes

In [None]:
# Cast original_upb to float so that ta-lib can work on it.
# reference material: https://www.geeksforgeeks.org/change-data-type-for-one-or-more-columns-in-pandas-dataframe/
convert_dict = dict(original_upb=float)
df = df.astype(convert_dict)

In [None]:
# Print the dtypes again to confirm that original_upb is now float.
print(df.dtypes)

In [None]:
def add_talib_indicators(col):
    """Compute a fixed set of TA-Lib indicators for one numeric series.

    The indicators are Stochastic RSI (``STOCHRSI``), the MACD histogram
    (``MACD``), the double exponential moving average (``DEMA``) and the
    rate of change (``ROC``), all with TA-Lib's default parameters.

    Parameters
    ----------
    col : array-like
        One-dimensional numeric values. The input is cast to float64 before
        being handed to TA-Lib, so integer data does not need to be
        converted by the caller first.

    Returns
    -------
    pandas.DataFrame
        One row per input value with the columns ``fastk``, ``fastd``,
        ``stochrsi`` (computed here as ``fastd - fastk``), ``macdhist``,
        ``dema`` and ``roc``.
    """
    # Keep the caller's container type (ndarray / Series) when it can be cast
    # directly; fall back to building a float64 array from plain sequences.
    if hasattr(col, 'astype'):
        values = col.astype(np.float64)
    else:
        values = np.asarray(col, dtype=np.float64)

    fastk, fastd = talib.STOCHRSI(values)
    # Only the histogram of the three MACD outputs is used.
    _, _, macdhist = talib.MACD(values)
    dema = talib.DEMA(values)
    roc = talib.ROC(values)
    return pd.DataFrame(
        dict(fastk=fastk,
             fastd=fastd,
             stochrsi=fastd - fastk,
             macdhist=macdhist,
             dema=dema,
             roc=roc
            ))

In [None]:
# Append the TA-Lib indicator columns computed from original_upb, then index the
# frame by loan_id and sort on that index.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
df = (
    pd.concat(
        [df, add_talib_indicators(df.original_upb.values)],
        axis=1,
    )
    .set_index('loan_id')
    .sort_index()
)

# # if unable to install/load talib use, uncomment and use:
# # df = pd.read_table('./data/ETH_USD_data_with_indicators.csv', sep=',', memory_map=True, parse_dates=True, date_parser=date_parser, index_col='date').sort_index()
# df.tail()

In [None]:
# df['relocation_mortgage_indicator'].unique()

In [None]:
# df['relocation_mortgage_indicator'].value_counts()

In [None]:
# Drop every object-dtype column.
# The column labels are passed explicitly (instead of handing a whole DataFrame to
# `drop`) and the result is rebound rather than mutating the frame in place.
df = df.drop(columns=df.select_dtypes(include=['object']).columns)

In [None]:
# Remove the sdq column from the frame.
df = df.drop(columns=['sdq'])

In [None]:
# Cast all remaining columns to float64
# (by this point the object-dtype columns and sdq have already been dropped).

df = df.astype('float64')

In [None]:
# Preview the first five rows of the all-float frame.
df.head(5)

In [None]:
# Every column still present in df is used as a model feature.
features = df.columns.tolist()
# Binary target: 1 where the NEXT row's number_of_units is strictly greater than
# this row's, otherwise 0. The final row has no following row (shift(-1) leaves
# NaN there), so it is dropped from both y and X.
y = np.where(
    df.number_of_units.shift(-1) > df.number_of_units,
    1,
    0,
)[:-1]
# Missing feature values are filled with 0 before the last row is trimmed.
X = df[features].fillna(0)[:-1]
X.tail()

# features = list(filter(lambda x: x not in ['target', 'date', 'symbol' ,'close', 'high', 'open', 'close', 'low'] , df.columns))
# # our target is the following hour, 
# # specifically whether it went up (1) or down (0) from the period close an hour before.
# # we drop the last row since the target will be NaN (the next hour is the future.)
# y = np.where(df.close.shift(-1) > df.close, 1, 0)[:-1]
# X = df[features].fillna(0)[:-1]
# X.tail()

In [None]:
# sns.pairplot(X, height=len(features))
# plt.show()

In [None]:
# Draw a heatmap for X using the project helper `create_heatmap` from utils
# (presumably the pairwise feature correlations, given the notebook's collinearity focus — verify in utils).
create_heatmap(X)

In [None]:
# Classifier classes to evaluate, and the constructor keyword arguments for each.
# clf_args is keyed by the classifier's position in `clfs`, written as a string
# (presumably how ModelTester pairs them up — verify in utils).
clfs = [
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    KNeighborsClassifier,
]
clf_args = {
    '0': {'random_state': 0, 'C': 100, 'penalty': 'l1', 'solver': 'saga', 'n_jobs': 1},
    '1': {'random_state': 0},
    '2': {'random_state': 0, 'n_estimators': 30, 'n_jobs': 1},
    '3': {'n_neighbors': 5, 'p': 2, 'metric': 'minkowski', 'n_jobs': 1},
}
# # Original
# clfs = [LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier, SVC]
# clf_args = {
#     '0': dict(random_state=0, C=100, penalty='l1', solver='saga', n_jobs=1), 
#     '1': dict(random_state=0),
#     '2': dict(random_state=0, n_estimators=30, n_jobs=1),
#     '3': dict(n_neighbors=5, p=2, metric='minkowski', n_jobs=1),
#     '4': dict(random_state=0),
#             }

In [None]:
# Build the tester over the full feature set, normalising X with StandardScaler.
# The '0' entry of clf_args is replaced wholesale (the dict merge is shallow), so
# for this run that entry's kwargs are only solver='newton-cg'.
tester = ModelTester(
    clfs,
    X=X,
    y=y,
    clf_args={**clf_args, '0': {'solver': 'newton-cg'}},
    x_normalizer=StandardScaler,
)

In [None]:
# Run the tests and collect everything `run_tests` yields into a list.
results = list(tester.run_tests(default_args={}))

In [None]:
# Plot the F-scores recorded by the tester (see ModelTester.plot_fscores in utils).
tester.plot_fscores()

In [None]:
# Plot the feature importances recorded by the tester (see ModelTester.plot_importances in utils).
tester.plot_importances()

In [None]:
# select important features which score >= 1 std deviation
# (the exact threshold rule lives in to_important_features — verify in utils).
# `__` is pydash here (imported as `__`). IPython also uses `__` for the
# second-most-recent cell output, so this alias is fragile in an interactive session.
# The reduce folds tester.feature_importances with the reducer returned by
# to_important_features, starting from an empty dict.
min_top_imp = __.reduce_(tester.feature_importances, to_important_features(std_dev_mult=1), {})
# Names of the selected features, used to subset X in the next step.
min_top_imp_names = list(min_top_imp.keys())
min_top_imp

In [None]:
# run the models again using the feature subset on X
# NOTE(review): this run passes clf_args unchanged, whereas the first tester
# replaced entry '0' with dict(solver='newton-cg'). The classifier configured by
# entry '0' therefore differs between the two runs — confirm the comparison is intended.
tester_feature_sub = ModelTester(
     clfs,
     X=X[min_top_imp_names],
     y=y,
     clf_args=clf_args,
     x_normalizer=StandardScaler)

# Collect everything run_tests yields, then plot the F-scores for the subset run.
results_feature_sub = list(tester_feature_sub.run_tests(default_args={}))
tester_feature_sub.plot_fscores() 
# models seem to have done worse on the subset :(,
# which would make some sense considering how even most of the importance scores were

In [None]:
# Build a multiclass target from the row-to-row change in number_of_units:
# take (next row - this row), round to the nearest 10, format the value as a
# string label, then label-encode the labels.
# The last row is dropped because it has no following row to compare against.
# Adding 0.0 turns the -0.0 that rounding produces for small negative differences
# into 0.0, so those rows are labelled '0' instead of forming a separate '-0' class.
y2_str = (
    np.round(df.number_of_units.shift(-1) - df.number_of_units, decimals=-1)[:-1]
    + 0.0
).map('{:,.0f}'.format)
class_le = LabelEncoder()
y2 = class_le.fit_transform(y2_str)
# Count how many rows fall into each label to show the class balance.
unq, counts = np.unique(y2_str, return_counts=True)

print('multiclass')
print(pd.DataFrame([counts], columns=unq))

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X.reset_index().drop('date', axis=1), y, test_size=0.3, random_state=0, stratify=y)

# ss = StandardScaler()
# X_train_std = ss.fit_transform(X_train)
# X_test_std = ss.transform(X_test)

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from pprint import pprint

# rf_random_grid = {
#     'n_estimators': [int(x) for x in np.linspace(start = 30, stop = 500, num = 10)],
#     'max_features': ['auto', 'sqrt'],
#     'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)] + [None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# pprint(rf_random_grid)

In [None]:
# # hyper parameter tuning for randomforest
# rf = RandomForestClassifier(random_state=0)
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = rf_random_grid, n_iter = 100, cv = 2, random_state=0, n_jobs = -1)
# rf_random.fit(X_train_std, y_train)

In [None]:
# rf_random.best_params_

In [None]:
# rf_random.best_score_