## Continue the v1 ensemble experiments on the full dataset, in the format commonly used on Kaggle

In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter
import mlflow

# local imports
from prepare import *
from evaluate import *

In [2]:
# Make 'baseline' the active MLflow experiment for this notebook
# (the recorded output shows it being created when it does not exist yet).
mlflow.set_experiment(experiment_name='baseline')

INFO: 'baseline' does not exist. Creating a new experiment


### Read in initial datasets if needed

In [3]:
# Raw competition CSV loading is disabled: none of the cells shown below use the raw
# frames, only the pre-computed feature files. Uncomment to re-read the raw data.
#raw_train, raw_train_labels, raw_test, specs, sample = read_raw_csvs()
#raw_train_labels = pd.read_csv('data/train_labels.csv')

### Load large train/test features from Josh's work

In [4]:
# Read the pre-computed train and test feature tables from the working directory.
reduced_train, reduced_test = (
    pd.read_csv(csv_name) for csv_name in ('reduce_train.csv', 'reduce_test.csv')
)

# Display both shapes as a sanity check.
# NOTE(review): the recorded output shows 890 train columns vs 891 test columns --
# confirm which columns differ before feeding the test frame to a model.
(reduced_train.shape, reduced_test.shape)

((17690, 890), (1000, 891))

### Start throwing model mud at the wall

In [6]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
                                ExtraTreesClassifier, BaggingClassifier, \
                                GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
from scipy.linalg import LinAlgWarning

# RidgeClassifier throws warnings about an ill-conditioned matrix; scipy emits those
# as LinAlgWarning. Silence only that category so that deprecation, convergence and
# data-conversion warnings raised by the other models remain visible.
IGNORED_WARNING_CATEGORIES = (LinAlgWarning,)
for _category in IGNORED_WARNING_CATEGORIES:
    warnings.filterwarnings('ignore', category=_category)

  return _load(spec)


### Baseline accuracy: always predicting the majority class (group 3) gives 50%; it would be 25% if the four classes were balanced

In [7]:
# Share of training rows in each accuracy group (the class balance of the target).
reduced_train['accuracy_group'].value_counts(normalize=True)

3    0.500000
0    0.239062
1    0.136292
2    0.124647
Name: accuracy_group, dtype: float64

## Initialize models and start testing accuracy

In [8]:
# One seed shared by every stochastic estimator so that reruns give comparable scores.
RANDOM_SEED = 42

# scikit-learn candidates, otherwise left at their default hyperparameters.
rf = RandomForestClassifier(random_state=RANDOM_SEED)
rc = RidgeClassifier()  # left unseeded: random_state only matters for the 'sag'/'saga' solvers
ac = AdaBoostClassifier(random_state=RANDOM_SEED)
et = ExtraTreesClassifier(random_state=RANDOM_SEED)
bc = BaggingClassifier(random_state=RANDOM_SEED)
gbc = GradientBoostingClassifier(random_state=RANDOM_SEED)

# CatBoost multi-class model: slow learning rate with up to 2000 boosting iterations.
# NOTE(review): the iteration-based overfitting detector / early stopping presumably
# only takes effect if the fitting helper passes an eval_set -- confirm in quick_eval.
clf = CatBoostClassifier(
    loss_function='MultiClass',
    task_type="CPU",
    learning_rate=0.01,
    iterations=2000,
    od_type="Iter",
    early_stopping_rounds=500,
    random_seed=RANDOM_SEED
    )

# tried with minimal results
#nb = GaussianNB()
#lr = LogisticRegression()               ## if they are commented out, they weren't performing well (or operator error...)
#sgd = SGDClassifier()

In [9]:
# Distance- and kernel-based candidates.
knn = KNeighborsClassifier()
# probability=True makes SVC estimate class probabilities with an internal
# cross-validation whose data shuffling is random; seed it so results are reproducible.
svc = SVC(probability=True, verbose=1, random_state=42)

## Evaluate model performance

In [None]:
# CatBoost - accuracy 56%
# quick_eval is not defined in this notebook; presumably it comes from the local
# star-imports (prepare / evaluate) -- TODO confirm. Its return value is used below
# as a fitted model (catbooster.predict). The trailing comment keeps the optional
# cv=True argument that was tried.
catbooster = quick_eval(reduced_train, clf)#, cv=True)

## Train/test split and evaluate CatBoost on its own

In [None]:
# Hold out 20% of the training rows for local validation, using only the numeric
# feature columns and a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    reduced_train.drop(columns='accuracy_group')._get_numeric_data(),
    reduced_train['accuracy_group'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import cohen_kappa_score

# Score the CatBoost model on the held-out split.
# NOTE(review): catbooster was produced by quick_eval from reduced_train; confirm it
# was not fitted on these X_test rows, otherwise the scores below are optimistic.
y_pred = catbooster.predict(X_test)

# Confirm that the local qwk3 helper and sklearn's quadratic-weighted kappa agree.
kappa_from_helper = qwk3(y_pred, y_test)
print(kappa_from_helper)  # 0.5194
kappa_from_sklearn = cohen_kappa_score(y_pred, y_test, weights='quadratic')
print(kappa_from_sklearn)  # 0.5194

In [None]:
# Re-display the (rows, columns) shape of the train and test feature tables.
tuple(frame.shape for frame in (reduced_train, reduced_test))

In [None]:
# Predict an accuracy group for every test row from its numeric feature columns.
reduced_test_numeric = reduced_test._get_numeric_data()
sub_pred = catbooster.predict(reduced_test_numeric)

# Number of predictions produced.
len(sub_pred)

## Create a submission for testing (scored 0.443)

In [None]:
# The sample submission provides the installation ids expected in the output file.
sample_sub = pd.read_csv('data/sample_submission.csv')

# Build the submission: ids from the sample file, predictions from the model.
# NOTE(review): this assumes the rows of reduced_test are in the same order as the
# sample submission -- confirm before submitting.
submission = sample_sub[['installation_id']].copy()
submission['accuracy_group'] = sub_pred
submission.head()

In [None]:
# Share of each predicted accuracy group in the submission.
submission['accuracy_group'].value_counts(normalize=True)

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='preds.csv', index=False)