In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, precision_recall_curve, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import boto3
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [3]:
s3_client = boto3.client('s3')
bucket_name = 'jarednewstudy'
key = 'train_mels_vec.csv'

In [4]:
obj = s3_client.get_object(Bucket=bucket_name, Key=key)

In [5]:
# df = pd.read_csv(obj['Body'])

In [6]:
# tp = pd.read_csv(obj['Body'], iterator=True, chunksize=1000) 

In [158]:
df = pd.read_csv('data/train_windows_mel.csv', index_col=0)

In [7]:
# df = pd.concat(tp, ignore_index=True)

In [161]:
df.isnull().values.any()

True

In [160]:
df.shape

(19868, 12901)

In [162]:
def choose_target(df, target, fill_na):
    """
    Returns y as ndarray, X as a dataframe
    """
    df[target] = pd.get_dummies(df, columns=['labels'])['labels_{}'.format(target)]
#     df['fname'] = df['Unnamed: 0']
#     df.set_index('fname')
    y = df[target].values
    X = df.drop(columns=['labels', target])
    if fill_na == '0':
        X = X.fillna(0)
    return X, y

In [163]:
X, y = choose_target(df=df, 
                     target='Purr',
                     fill_na='0')

In [164]:
X = X.applymap(lambda x: abs(complex(x)))
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12890,12891,12892,12893,12894,12895,12896,12897,12898,12899
ac9e7a91.wav,1.034165,1.081409,1.219715,1.042868,1.166504,1.158957,1.249667,1.121618,1.119989,1.194605,...,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676
ac9e7a91.wav,1.22984,0.849743,1.173018,0.977654,1.201367,1.03815,0.946473,1.080248,1.092282,1.039641,...,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838
ac9e7a91.wav,1.148532,0.970157,0.807383,1.200776,1.266151,1.138947,0.889605,0.925469,1.076378,1.071888,...,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085
ac9e7a91.wav,1.198112,0.658447,1.180605,1.040019,1.130454,0.947504,1.085755,1.079043,1.090813,1.066453,...,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653
65ae847e.wav,0.286788,0.490074,0.807862,0.545526,0.525213,0.763628,0.648102,0.584791,0.821777,0.646159,...,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137


In [176]:
X.shape

(19868, 12900)

## Upsampling minority class

In [77]:
target = 'Purr'
df_majority = df[df['labels'] != target]
df_minority = df[df['labels'] == target]

In [83]:
majority_count = sum(df[df['labels'] != target]['labels'].value_counts())

In [123]:
df_minority_upsampled = resample(df_minority, 
                                replace=True,
                                n_samples=majority_count,
                                random_state=13)

In [124]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [125]:
X, y = choose_target(df=df_upsampled,
                    target=target,
                    fill_na='0')

In [126]:
X = X.applymap(lambda x: abs(complex(x)))

In [157]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [148]:
X_train.shape

(460, 12900)

In [149]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## PCA

In [177]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [179]:
pca = PCA(n_components=5000, random_state=77)
X_train_pca = pca.fit_transform(X_train)

In [180]:
X_test_pca = pca.transform(X_test)

In [181]:
#Scoring

def score_fitted_model(model, X_test, y_test):
    
    y_preds = model.predict(X_test)
    y_probs = model.predict_proba(X_test)
    acc = accuracy_score(y_test, y_preds)
    prec = precision_score(y_test, y_preds)
    rec = recall_score(y_test, y_preds)
    f1 = f1_score(y_test, y_preds)
    conf = confusion_matrix(y_test, y_preds)
    scores = {'model': model.__class__.__name__, 'accuracy': acc, 'precision': prec, 
              'recall': rec, 'f1': f1, 'conf_mat': conf}
    return scores

## Gradient Boosting

In [182]:
gbc = GradientBoostingClassifier(learning_rate=0.1,
                                n_estimators=500,
                                max_features=5000,
                                 random_state=8,
                                 verbose=1
                                )
gbc.fit(X_train_pca, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.0974          230.73m
         2           0.0896          231.46m
         3           0.0843          232.16m
         4           0.0813          233.24m
         5           0.0768          232.82m
         6           0.0723          231.79m
         7           0.0703          231.61m
         8           0.0683          231.48m
         9           0.0666          231.19m
        10           0.0649          231.01m
        20           0.0543          227.98m
        30           0.0487          223.85m
        40           0.0437          221.59m
        50           0.0394          214.33m
        60           0.0363          208.00m
        70           0.0339          202.12m
        80           0.0333          196.54m
        90           0.0322          191.45m
       100           0.0291          186.25m
       200           0.0202          139.42m
       300           0.0159           92.56m
       40

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=5000, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=8, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=1,
                           warm_start=False)

In [184]:
score_fitted_model(gbc, X_test_pca, y_test)

{'model': 'GradientBoostingClassifier',
 'accuracy': 0.9635594926514999,
 'precision': 0.016,
 'recall': 0.03333333333333333,
 'f1': 0.021621621621621623,
 'conf_mat': array([[4784,  123],
        [  58,    2]])}

In [185]:
from joblib import dump, load

In [187]:
dump (gbc, 'models/gdb_clf.joblib')

['models/gdb_clf.joblib']

## Random Forest

In [196]:
rfc = RandomForestClassifier(n_estimators=500,
                             max_depth=8,
                             min_samples_leaf=8,
                             min_samples_split=10,
                             verbose=1,
                             random_state=419
)

In [197]:
rfc.fit(X_train_pca, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  5.9min finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=419,
                       verbose=1, warm_start=False)

In [198]:
score_fitted_model(rfc, X_test_pca, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
  _warn_prf(average, modifier, msg_start, len(result))


{'model': 'RandomForestClassifier',
 'accuracy': 0.987920273807127,
 'precision': 0.0,
 'recall': 0.0,
 'f1': 0.0,
 'conf_mat': array([[4907,    0],
        [  60,    0]])}

In [200]:
dump(rfc, 'models/rf_clf.joblib')

['models/rf_clf.joblib']

## Logistic Regression

In [201]:
log = LogisticRegression()

In [202]:
log.fit(X_train_pca, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [203]:
score_fitted_model(log, X_test_pca, y_test)

{'model': 'LogisticRegression',
 'accuracy': 0.012281054962754178,
 'precision': 0.012082158679017317,
 'recall': 1.0,
 'f1': 0.023875845602865098,
 'conf_mat': array([[   1, 4906],
        [   0,   60]])}

In [204]:
dump(log, 'models/log_reg.joblib')

['models/log_reg.joblib']

## Downsampling Majority Class