## Continue ensembles from v1 on full dataset format commonly used on Kaggle

In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter

# local imports
from prepare import *
from evaluate import *

In [None]:
def qwk3(a1, a2, max_rat=3):
    """ function from kaggle"""
    assert(len(a1) == len(a2))
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e

### Read in initial datasets if needed

In [3]:
#raw_train, raw_train_labels, raw_test, specs, sample = read_raw_csvs()
#raw_train_labels = pd.read_csv('data/train_labels.csv')

### Load large train/test features from Josh's work

In [4]:
reduced_train = pd.read_csv('reduce_train.csv')
reduced_test = pd.read_csv('reduce_test.csv')
reduced_train.shape, reduced_test.shape

((17690, 891), (1000, 891))

### Start throwing model mud at the wall

In [6]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
                                ExtraTreesClassifier, BaggingClassifier, \
                                GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')  #Ridge classifier throws some warnings about ill-conditioned matrix

### Baseline accuracy of 50% (or 25% if balanced)

In [7]:
reduced_train.accuracy_group.value_counts(normalize=True)

3    0.500000
0    0.239062
1    0.136292
2    0.124647
Name: accuracy_group, dtype: float64

# Initialize Models and start testing accuracy

In [8]:
rf = RandomForestClassifier()
rc = RidgeClassifier()
ac = AdaBoostClassifier()
et = ExtraTreesClassifier()
bc = BaggingClassifier()
gbc = GradientBoostingClassifier()

clf = CatBoostClassifier(
    loss_function='MultiClass',
    task_type="CPU",
    learning_rate=0.01,
    iterations=2000,
    od_type="Iter",
    early_stopping_rounds=500,
    random_seed=42
    )

# tried with minimal results
#nb = GaussianNB()
#lr = LogisticRegression()               ## if they are commented out, they weren't performing well (or operator error...)
#sgd = SGDClassifier()

In [9]:
knn = KNeighborsClassifier()
svc = SVC(probability=True, verbose=1)

## Evaluate model performance

In [10]:
quick_eval(reduced_train, rf)

The accuracy of RandomForestClassifier is 0.5508762012436405
The QWK of RandomForestClassifier is 0.3703937710916171
              precision    recall  f1-score   support

           0       0.54      0.51      0.52       826
           1       0.28      0.11      0.15       509
           2       0.18      0.06      0.09       437
           3       0.60      0.82      0.69      1766

    accuracy                           0.55      3538
   macro avg       0.40      0.37      0.36      3538
weighted avg       0.49      0.55      0.50      3538



RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Catboost - accuracy 56%a
catbooster = quick_eval(reduced_train, clf, cv=True)

## Train/test split and attempt purely catboost

In [14]:
X_train, X_test, y_train, y_test = train_test_split(reduced_train.drop('accuracy_group', axis=1)._get_numeric_data(), 
                                                   reduced_train.accuracy_group,
                                                   test_size=.2,
                                                   random_state=42)

In [52]:
from sklearn.metrics import cohen_kappa_score
y_pred = catbooster.predict(X_test)

#confirimg these two functions do the same thing
print(qwk3(y_pred, y_test))  #0.5194
print(cohen_kappa_score(y_pred, y_test, weights='quadratic'))  #0.5194

[0.51940396]
0.5194039619602867


In [21]:
reduced_train.shape, reduced_test.shape

((17690, 891), (1000, 891))

In [27]:
sub_pred = catbooster.predict(reduced_test._get_numeric_data())
len(sub_pred)

1000

## Create submission for testing - scored .443

In [33]:
sample_sub = pd.read_csv('data/sample_submission.csv')

# To create a submission:
submission = pd.DataFrame()
submission['installation_id'] = sample_sub.installation_id
submission['accuracy_group'] = sub_pred
submission.head()

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3.0
1,01242218,3.0
2,017c5718,3.0
3,01a44906,3.0
4,01bc6cb6,3.0


In [35]:
submission.accuracy_group.value_counts(normalize=True)

3.0    0.762
0.0    0.220
1.0    0.018
Name: accuracy_group, dtype: float64

In [36]:
submission.to_csv('preds.csv', index=False)

In [45]:
submission.set_index('installation_id').to_csv('preds16.csv', index=False)

In [49]:
submission.to_csv('preds3.csv', index=False)

In [None]:
quick_eval(reduced_train, rf, cv=True)  #.563cv acc
quick_eval(reduced_train, rc, cv=True)  #.528cv acc
quick_eval(reduced_train, ac, cv=True)  #.529cv acc
quick_eval(reduced_train, et, cv=True)  #.5399cv acc
quick_eval(reduced_train, bc, cv=True)  #.511cv acc
quick_eval(reduced_train, gbc, cv=True)  #.435 cv acc
quick_eval(reduced_train, clf, cv=True) #

# These were not impressive, dropping from future
#quick_eval(reduced_train, lr, cv=True)
#quick_eval(reduced_train, sgd, cv=True)
#quick_eval(reduced_train, nb, cv=True)
#quick_eval(reduced_train, knn, scale=True, cv=True)
#quick_eval(reduced_train, svc, scale=True, cv=True)

## more models, to include sklearn neural net

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier

In [None]:
mlp = MLPClassifier()
gpc = GaussianProcessClassifier() #takes a long time
#rbf = RBF()
dt = DecisionTreeClassifier()

In [None]:
#quick_eval(reduced_train, mlp, cv=True)   # 0.295
#quick_eval(reduced_train, gpc) #   .498
#quick_eval(reduced_train, rbf, cv=True)   poor
#quick_eval(reduced_train, dt, cv=True)     #.411

In [None]:
bc_rf_10 = BaggingClassifier(
    base_estimator=RandomForestClassifier(max_depth=10),
    n_estimators=20)

bc_rf_50 = BaggingClassifier(
    base_estimator=RandomForestClassifier(max_depth=50),
    n_estimators=20)

bc_rf_5 = BaggingClassifier(
    base_estimator=RandomForestClassifier(max_depth=5),
    n_estimators=20)

bc_gbc = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(),
    n_estimators=20)

bc_abc = BaggingClassifier(
    base_estimator=AdaBoostClassifier(),
    n_estimators=20)


quick_eval(reduced_train, bc_rf_5)
quick_eval(reduced_train, bc_rf_10)    # 0.549 with 20 estimators max depth of 10  (.377 with balanced)    0.551 with 200 
quick_eval(reduced_train, bc_rf_50) 
quick_eval(reduced_train, bc_gbc)   # 0.576 with 20 estimators (.389 with balanced)                     0.578 with 200
quick_eval(reduced_train, bc_abc)   # 0.563 with 20 estimators  (.374 with balanced)                    0.561 with 200

The accuracy of BaggingClassifier is 0.5387224420576597
The QWK of BaggingClassifier is 0.18304098870845487
              precision    recall  f1-score   support

           0       0.69      0.20      0.31       826
           1       0.00      0.00      0.00       509
           2       0.00      0.00      0.00       437
           3       0.53      0.98      0.69      1766

    accuracy                           0.54      3538
   macro avg       0.30      0.30      0.25      3538
weighted avg       0.42      0.54      0.42      3538

The accuracy of BaggingClassifier is 0.5729225551158846
The QWK of BaggingClassifier is 0.3474125031192433
              precision    recall  f1-score   support

           0       0.62      0.43      0.51       826
           1       0.15      0.00      0.01       509
           2       0.17      0.00      0.00       437
           3       0.57      0.94      0.71      1766

    accuracy                           0.57      3538
   macro avg       0.38 

## Model Ensembling

In [None]:
vc = VotingClassifier(estimators=[
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    #('svc', svc),
    #('gpc', gpc),
    ('rc', rc),
    #('mlp', mlp),
    #('dt', dt),
    ('bc_rf_5', bc_rf_5),
    ('bc_rf_10', bc_rf_10),
    ('bc_rf_50', bc_rf_50),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
    #('catboost', clf)
    ],
     n_jobs=-1,
     voting='hard')

quick_eval(reduced_train, vc)#, cv=True)     #initial w/o catboost .564

In [None]:
estimators = [
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    ('gpc', gpc),
    #('svc', svc),
    ('rc', rc),
    #('mlp', mlp),
    #('dt', dt),
    ('bc_rf_5', bc_rf_5),
    ('bc_rf_10', bc_rf_10),
    ('bc_rf_50', bc_rf_50),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
]
stacking_clf = StackingClassifier(estimators=estimators, 
                final_estimator=RandomForestClassifier(), n_jobs=-1)        # created, load from pickle
quick_eval(reduced_train, stacking_clf)

In [None]:
##joblib.dump(stacking_clf, 'fitted_stacked_classifier.pkl')
##joblib.dump(vc, 'fitted_voting_classifier.pkl')