In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, precision_recall_curve, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import boto3
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

In [2]:
def process_vector_df(path):
    pass

In [3]:
s3_client = boto3.client('s3')
bucket_name = 'jarednewstudy'
key = 'train_mels_vec.csv'

In [4]:
obj = s3_client.get_object(Bucket=bucket_name, Key=key)

In [5]:
# df = pd.read_csv(obj['Body'])

In [6]:
tp = pd.read_csv(obj['Body'], iterator=True, chunksize=1000) 

In [7]:
df = pd.concat(tp, ignore_index=True)

In [8]:
def choose_target(df, target, fill_na):
    """
    Returns y as ndarray, X as a dataframe
    """
    df[target] = pd.get_dummies(df)['labels_{}'.format(target)]
    df['fname'] = df['Unnamed: 0']
    df.set_index('fname')
    y = df[target].values
    X = df.drop(columns=['Unnamed: 0', 'labels', 'fname', target])
    if fill_na == '0':
        X = X.fillna(0)
    return X, y

In [9]:
X, y = choose_target(df, 'Purr', '0')

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61686,61687,61688,61689,61690,61691,61692,61693,61694,61695
1221,-7.539231,-6.119341,-12.664371,0.036964,-2.473693,-3.334282,1.435069,-2.374225,-0.252988,2.999558,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3555,-24.306046,-5.735206,0.906864,15.611013,3.248552,-11.283641,-18.717607,-14.73946,-20.346775,-19.977085,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
903,-4.654064,-48.972027,-26.757408,-24.362648,-17.046318,-33.622826,-10.730468,-24.275528,-28.694151,-48.972027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2133,5.101423,-0.411431,3.478077,6.843094,5.282769,-6.176236,0.101955,0.755362,5.908342,3.376797,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,11.556662,10.142279,10.504073,9.533718,9.438493,9.87372,26.29637,4.009405,22.084673,15.802631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61686,61687,61688,61689,61690,61691,61692,61693,61694,61695
0,9.886278,7.386839,9.859134,3.623122,2.141981,7.452931,11.166306,5.476013,2.809983,-2.018272,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-29.162361,-32.420738,-38.64909,-35.599121,-32.5839,-33.295036,-33.223362,-30.60519,-34.374557,-42.983246,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-17.183058,-25.703621,-13.213657,-17.846926,6.637511,-4.118928,-6.611485,-10.425756,-16.695269,-0.808737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-12.784816,-8.405742,-6.91794,-7.080761,-6.357865,-5.945704,-8.577787,-6.233015,-7.823601,-6.587538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14.404525,15.455218,18.002903,13.197304,4.986,-13.429062,-0.201984,2.163121,-2.510244,-3.788836,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

## PCA

In [46]:
pca = PCA(n_components=1000, random_state=77)
X_train_pca = pca.fit_transform(X_train)

In [47]:
X_test_pca = pca.transform(X_test)

## SMOTE
Oversampling ONLY on training data

In [None]:
sm = SMOTE(random_state=12, ratio)

In [42]:
#Scoring

def score_fitted_model(model, X_test, y_test):
    
    y_preds = model.predict(X_test)
    y_probs = model.predict_proba(X_test)
    acc = accuracy_score(y_test, y_preds)
    prec = precision_score(y_test, y_preds)
    rec = recall_score(y_test, y_preds)
    f1 = f1_score(y_test, y_preds)
    conf = confusion_matrix(y_test, y_preds)
    scores = {'model': model.__class__.__name__, 'accuracy': acc, 'precision': prec, 
              'recall': rec, 'f1': f1, 'conf_mat': conf}
    return scores

## Gradient Boosting

In [16]:
gbc = GradientBoostingClassifier(learning_rate=0.1,
                                n_estimators=200,
                                max_features=1000,
                                 random_state=8,
                                 verbose=1
                                )
gbc.fit(X_train_pca, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.1018           48.54s
         2           0.0895           46.08s
         3           0.0820           44.56s
         4           0.0743           44.59s
         5           0.0664           43.71s
         6           0.0584           43.10s
         7           0.0502           42.55s
         8           0.0418           42.15s
         9           0.0342           42.31s
        10           0.0268           42.08s
        20           0.0074           41.36s
        30           0.0043           40.31s
        40           0.0032           38.84s
        50           0.0028           35.93s
        60           0.0024           33.16s
        70           0.0022           30.43s
        80           0.0020           28.28s
        90           0.0018           25.79s
       100           0.0017           23.39s
       200           0.0008            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=1000, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='auto',
                           random_state=8, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=1,
                           warm_start=False)

In [43]:
score_fitted_model(gbc, X_test_pca, y_test)

{'model': 'GradientBoostingClassifier',
 'accuracy': 0.9517296862429606,
 'precision': 0.019230769230769232,
 'recall': 0.1,
 'f1': 0.03225806451612903,
 'conf_mat': array([[1182,   51],
        [   9,    1]])}

## Random Forest

In [33]:
rfc = RandomForestClassifier()

In [34]:
rfc.fit(X_train_pca, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [40]:
score_fitted_model(rfc, X_test_pca, y_test)

{'model': 'RandomForestClassifier',
 'accuracy': 0.9895414320193081,
 'precision': 0.0,
 'recall': 0.0,
 'f1': 0.0}

## Logistic Regression

In [None]:
log = LogisticRegression()

In [44]:
log.fit(X_train_pca, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
score_fitted_model(log, X_test_pca, y_test)

{'model': 'LogisticRegression',
 'accuracy': 0.7658889782783588,
 'precision': 0.01718213058419244,
 'recall': 0.5,
 'f1': 0.03322259136212625,
 'conf_mat': array([[947, 286],
        [  5,   5]])}

In [51]:
# Fit classifier with out-of-bag estimates
params = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5,
          'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}
clf = GradientBoostingClassifier(**params)

clf.fit(X_train_pca, y_train)
acc = clf.score(X_test_pca, y_test)
print("Accuracy: {:.4f}".format(acc))

n_estimators = params['n_estimators']
x = np.arange(n_estimators) + 1


def heldout_score(clf, X_test, y_test):
    """compute deviance scores on ``X_test`` and ``y_test``. """
    score = np.zeros((n_estimators,), dtype=np.float64)
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        score[i] = clf.loss_(y_test, y_pred)
    return score


def cv_estimate(n_folds=3):
    cv = KFold(n=X_train.shape[0], n_folds=n_folds)
    cv_clf = ensemble.GradientBoostingClassifier(**params)
    val_scores = np.zeros((n_estimators,), dtype=np.float64)
    for train, test in cv:
        cv_clf.fit(X_train[train], y_train[train])
        val_scores += heldout_score(cv_clf, X_train[test], y_train[test])
    val_scores /= n_folds
    return val_scores


# Estimate best n_estimator using cross-validation
cv_score = cv_estimate(3)

# Compute best n_estimator for test data
test_score = heldout_score(clf, X_test_pca, y_test)

# negative cumulative sum of oob improvements
cumsum = -np.cumsum(clf.oob_improvement_)

# min loss according to OOB
oob_best_iter = x[np.argmin(cumsum)]

# min loss according to test (normalize such that first loss is 0)
test_score -= test_score[0]
test_best_iter = x[np.argmin(test_score)]

# min loss according to cv (normalize such that first loss is 0)
cv_score -= cv_score[0]
cv_best_iter = x[np.argmin(cv_score)]

# color brew for the three curves
oob_color = list(map(lambda x: x / 256.0, (190, 174, 212)))
test_color = list(map(lambda x: x / 256.0, (127, 201, 127)))
cv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))

# plot curves and vertical lines for best iterations
plt.plot(x, cumsum, label='OOB loss', color=oob_color)
plt.plot(x, test_score, label='Test loss', color=test_color)
plt.plot(x, cv_score, label='CV loss', color=cv_color)
plt.axvline(x=oob_best_iter, color=oob_color)
plt.axvline(x=test_best_iter, color=test_color)
plt.axvline(x=cv_best_iter, color=cv_color)

# add three vertical lines to xticks
xticks = plt.xticks()
xticks_pos = np.array(xticks[0].tolist() +
                      [oob_best_iter, cv_best_iter, test_best_iter])
xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) +
                        ['OOB', 'CV', 'Test'])
ind = np.argsort(xticks_pos)
xticks_pos = xticks_pos[ind]
xticks_label = xticks_label[ind]
plt.xticks(xticks_pos, xticks_label)

plt.legend(loc='upper right')
plt.ylabel('normalized loss')
plt.xlabel('number of iterations')

plt.show()

Accuracy: 0.9912


TypeError: __init__() got an unexpected keyword argument 'n'