In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, precision_recall_curve, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import boto3
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [3]:
s3_client = boto3.client('s3')
bucket_name = 'jarednewstudy'
key = 'train_mels_vec.csv'

In [4]:
obj = s3_client.get_object(Bucket=bucket_name, Key=key)

In [5]:
# df = pd.read_csv(obj['Body'])

In [6]:
# tp = pd.read_csv(obj['Body'], iterator=True, chunksize=1000) 

In [158]:
df = pd.read_csv('data/train_windows_mel.csv', index_col=0)

In [7]:
# df = pd.concat(tp, ignore_index=True)

In [161]:
df.isnull().values.any()

True

In [160]:
df.shape

(19868, 12901)

In [162]:
def choose_target(df, target, fill_na):
    """
    Returns y as ndarray, X as a dataframe
    """
    df[target] = pd.get_dummies(df, columns=['labels'])['labels_{}'.format(target)]
#     df['fname'] = df['Unnamed: 0']
#     df.set_index('fname')
    y = df[target].values
    X = df.drop(columns=['labels', target])
    if fill_na == '0':
        X = X.fillna(0)
    return X, y

In [163]:
X, y = choose_target(df=df, 
                     target='Purr',
                     fill_na='0')

In [164]:
X = X.applymap(lambda x: abs(complex(x)))
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12890,12891,12892,12893,12894,12895,12896,12897,12898,12899
ac9e7a91.wav,1.034165,1.081409,1.219715,1.042868,1.166504,1.158957,1.249667,1.121618,1.119989,1.194605,...,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676
ac9e7a91.wav,1.22984,0.849743,1.173018,0.977654,1.201367,1.03815,0.946473,1.080248,1.092282,1.039641,...,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838
ac9e7a91.wav,1.148532,0.970157,0.807383,1.200776,1.266151,1.138947,0.889605,0.925469,1.076378,1.071888,...,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085
ac9e7a91.wav,1.198112,0.658447,1.180605,1.040019,1.130454,0.947504,1.085755,1.079043,1.090813,1.066453,...,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653
65ae847e.wav,0.286788,0.490074,0.807862,0.545526,0.525213,0.763628,0.648102,0.584791,0.821777,0.646159,...,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137


In [176]:
X.shape

(19868, 12900)

## Upsampling minority class

In [77]:
target = 'Purr'
df_majority = df[df['labels'] != target]
df_minority = df[df['labels'] == target]

In [83]:
majority_count = sum(df[df['labels'] != target]['labels'].value_counts())

In [123]:
df_minority_upsampled = resample(df_minority, 
                                replace=True,
                                n_samples=majority_count,
                                random_state=13)

In [124]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [125]:
X, y = choose_target(df=df_upsampled,
                    target=target,
                    fill_na='0')

In [126]:
X = X.applymap(lambda x: abs(complex(x)))

In [157]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [148]:
X_train.shape

(460, 12900)

In [149]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## PCA

In [177]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
pca = PCA(n_components=5000, random_state=77)
X_train_pca = pca.fit_transform(X_train)

In [167]:
X_test_pca = pca.transform(X_test)

In [168]:
#Scoring

def score_fitted_model(model, X_test, y_test):
    
    y_preds = model.predict(X_test)
    y_probs = model.predict_proba(X_test)
    acc = accuracy_score(y_test, y_preds)
    prec = precision_score(y_test, y_preds)
    rec = recall_score(y_test, y_preds)
    f1 = f1_score(y_test, y_preds)
    conf = confusion_matrix(y_test, y_preds)
    scores = {'model': model.__class__.__name__, 'accuracy': acc, 'precision': prec, 
              'recall': rec, 'f1': f1, 'conf_mat': conf}
    return scores

## Gradient Boosting

In [169]:
gbc = GradientBoostingClassifier(learning_rate=0.1,
                                n_estimators=200,
                                max_features=5000,
                                 random_state=8,
                                 verbose=1
                                )
gbc.fit(X_train_pca, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.1972            9.32s
         2           1.0643            6.80s
         3           0.9472            6.58s
         4           0.8469            6.41s
         5           0.7538            6.97s
         6           0.6678            6.50s
         7           0.5923            6.16s
         8           0.5267            6.14s
         9           0.4749            5.88s
        10           0.4249            5.70s
        20           0.1537            4.65s
        30           0.0567            3.85s
        40           0.0216            3.42s
        50           0.0081            3.12s
        60           0.0031            2.84s
        70           0.0012            2.46s
        80           0.0006            2.11s
        90           0.0006            1.73s
       100           0.0006            1.42s
       200           0.0006            0.00s


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=200, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=8, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=1,
                           warm_start=False)

In [171]:
score_fitted_model(gbc, X_test_pca, y_test)

{'model': 'GradientBoostingClassifier',
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'conf_mat': array([[ 90,   0],
        [  0, 102]])}

## Random Forest

In [172]:
rfc = RandomForestClassifier()

In [173]:
rfc.fit(X_train_pca, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [174]:
score_fitted_model(rfc, X_test_pca, y_test)

{'model': 'RandomForestClassifier',
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'conf_mat': array([[ 90,   0],
        [  0, 102]])}

## Logistic Regression

In [144]:
log = LogisticRegression()

In [145]:
log.fit(X_train_pca, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [146]:
score_fitted_model(log, X_test_pca, y_test)

{'model': 'LogisticRegression',
 'accuracy': 0.9791666666666666,
 'precision': 0.9615384615384616,
 'recall': 1.0,
 'f1': 0.9803921568627451,
 'conf_mat': array([[ 88,   4],
        [  0, 100]])}

## Downsampling Majority Class