In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, precision_recall_curve, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import boto3
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from joblib import dump, load

In [3]:
# S3 client plus the bucket/key of a CSV object to fetch.
# NOTE(review): in the visible notebook the fetched object is only read by commented-out
# cells; the active load further down uses a local CSV instead.
s3_client = boto3.client('s3')
bucket_name = 'jarednewstudy'
key = 'train_mels_vec.csv'

In [4]:
# Request the object identified by `bucket_name` / `key` from S3 (network call).
# NOTE(review): presumably needs AWS credentials in the environment -- confirm before a clean re-run.
obj = s3_client.get_object(Bucket=bucket_name, Key=key)

In [8]:
# df = pd.read_csv(obj['Body'])

In [9]:
# tp = pd.read_csv(obj['Body'], iterator=True, chunksize=1000) 

In [5]:
# Read the training table from a local CSV, using the file's first column as the row index.
df = pd.read_csv('data/train_windows_mel.csv', index_col=0)

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12891,12892,12893,12894,12895,12896,12897,12898,12899,labels
ac9e7a91.wav,(0.8720727-0.55586475j),(0.97243+0.47310236j),(1.1108465-0.5037104j),(0.9333715-0.46517918j),(1.0634736-0.47932836j),(1.1580651+0.045449547j),(1.1319557+0.5294742j),(1.1067612-0.18195131j),(1.1150475+0.105091475j),(1.0884783+0.49223566j),...,(-1.7656741+0.0027507467j),(-1.7656741+0.0027507467j),(-1.7656741+0.0027507467j),(-1.7656741+0.0027507467j),(-1.7656741+0.0027507467j),(-1.7656741+0.0027507467j),(-1.7656741+0.0027507467j),(-1.7656741+0.0027507467j),(-1.7656741+0.0027507467j),Church_bell
ac9e7a91.wav,(1.1620889-0.40256172j),(0.8195885+0.22436227j),(1.1583741+0.18477306j),(0.92741376-0.30937093j),(1.1496263-0.34877148j),(0.9815187+0.3381944j),(0.78291595+0.53184026j),(1.0595443-0.21048172j),(1.0112655-0.4128221j),(1.0171237+0.21520227j),...,(-1.7668371+0.001747158j),(-1.7668371+0.001747158j),(-1.7668371+0.001747158j),(-1.7668371+0.001747158j),(-1.7668371+0.001747158j),(-1.7668371+0.001747158j),(-1.7668371+0.001747158j),(-1.7668371+0.001747158j),(-1.7668371+0.001747158j),Church_bell
ac9e7a91.wav,(1.0819808-0.38528323j),(0.89443386-0.37575608j),(0.8043155+0.07030975j),(1.1942259-0.12525356j),(1.2379514+0.26573363j),(1.0011575-0.5430332j),(0.88896024+0.033864737j),(0.92523164-0.02095847j),(0.95133233-0.5035442j),(1.0703399+0.05758296j),...,(-1.7560852-0.00046687393j),(-1.7560852-0.00046687393j),(-1.7560852-0.00046687393j),(-1.7560852-0.00046687393j),(-1.7560852-0.00046687393j),(-1.7560852-0.00046687393j),(-1.7560852-0.00046687393j),(-1.7560852-0.00046687393j),(-1.7560852-0.00046687393j),Church_bell
ac9e7a91.wav,(1.1904478-0.13529912j),(0.56683093-0.3350452j),(1.1070894+0.41009906j),(1.0396317+0.028394233j),(1.0836594-0.3218837j),(0.86174834+0.39389622j),(1.0672377+0.19966662j),(1.067286-0.15885366j),(1.0418937+0.3230034j),(1.0345826+0.2587691j),...,(-1.7456528+0.0003073491j),(-1.7456528+0.0003073491j),(-1.7456528+0.0003073491j),(-1.7456528+0.0003073491j),(-1.7456528+0.0003073491j),(-1.7456528+0.0003073491j),(-1.7456528+0.0003073491j),(-1.7456528+0.0003073491j),(-1.7456528+0.0003073491j),Church_bell
65ae847e.wav,(0.28678146+0.001967726j),(0.46207947+0.16326383j),(0.5549207+0.58711433j),(0.48897272-0.24187677j),(0.4599209+0.25361636j),(0.5127542-0.56587243j),(0.5938512-0.25957084j),(0.5694763+0.13295428j),(0.5385659+0.62069625j),(0.58470094-0.2750396j),...,(-2.0571358+0.001967726j),(-2.0571358+0.001967726j),(-2.0571358+0.001967726j),(-2.0571358+0.001967726j),(-2.0571358+0.001967726j),(-2.0571358+0.001967726j),(-2.0571358+0.001967726j),(-2.0571358+0.001967726j),(-2.0571358+0.001967726j),Frying_(food)


In [11]:
# df = pd.concat(tp, ignore_index=True)

In [12]:
# df.isnull().values.any()

In [None]:
# dump(df, 'data/train_df.joblib')

In [160]:
df.shape

(19868, 12901)

In [7]:
def choose_target(df, target, fill_na):
    """
    Build a one-vs-rest dataset for a single class of the ``labels`` column.

    Unlike earlier revisions, ``df`` is NOT modified: no indicator column is
    written back to it, so repeated calls with different targets cannot leak a
    previous target's indicator into the returned features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``labels`` column; every other column is treated as
        a feature.
    target : str
        Label value to use as the positive class.
    fill_na : str
        Pass ``'0'`` to replace missing feature values with 0; any other value
        leaves missing values untouched.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without the ``labels`` column (and without a column named
        ``target``, should one already be present).
    y : numpy.ndarray
        Integer array: 1 where ``labels == target``, 0 elsewhere.

    Raises
    ------
    KeyError
        If ``target`` never occurs in ``df['labels']`` (same exception type the
        previous ``get_dummies``-based lookup produced).
    """
    is_target = df['labels'] == target
    if not is_target.any():
        raise KeyError("target {!r} not found in df['labels']".format(target))
    y = is_target.astype(int).values

    # 'labels' is known to exist at this point; errors='ignore' only covers the
    # optional pre-existing indicator column named after the target.
    X = df.drop(columns=['labels', target], errors='ignore')
    if fill_na == '0':
        X = X.fillna(0)
    return X, y

In [8]:
# Binary "Purr vs. rest" dataset; missing feature values are replaced with 0.
X, y = choose_target(df=df, target='Purr', fill_na='0')

In [9]:
# Convert every cell to a Python complex number and keep only its magnitude (absolute value).
# NOTE(review): applymap invokes the lambda once per cell in Python, so this is slow on a wide frame.
X = X.applymap(lambda x: abs(complex(x)))
# Preview the first rows of the converted features.
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12890,12891,12892,12893,12894,12895,12896,12897,12898,12899
ac9e7a91.wav,1.034165,1.081409,1.219715,1.042868,1.166504,1.158957,1.249667,1.121618,1.119989,1.194605,...,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676,1.765676
ac9e7a91.wav,1.22984,0.849743,1.173018,0.977654,1.201367,1.03815,0.946473,1.080248,1.092282,1.039641,...,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838,1.766838
ac9e7a91.wav,1.148532,0.970157,0.807383,1.200776,1.266151,1.138947,0.889605,0.925469,1.076378,1.071888,...,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085,1.756085
ac9e7a91.wav,1.198112,0.658447,1.180605,1.040019,1.130454,0.947504,1.085755,1.079043,1.090813,1.066453,...,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653,1.745653
65ae847e.wav,0.286788,0.490074,0.807862,0.545526,0.525213,0.763628,0.648102,0.584791,0.821777,0.646159,...,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137,2.057137


In [10]:
X.shape

(19868, 12900)

## Upsampling minority class

In [77]:
target = 'Purr'
# One boolean row mask drives both subsets: rows labelled `target` form the
# minority frame, every other row forms the majority frame.
is_minority = df['labels'] == target
df_majority = df[~is_minority]
df_minority = df[is_minority]

In [83]:
# Number of rows whose (non-missing) label differs from `target`.
majority_count = df.loc[df['labels'] != target, 'labels'].count()

In [123]:
# Draw rows from the minority frame with replacement until it has `majority_count` rows;
# the fixed random_state makes the draw repeatable.
# NOTE(review): this runs before the later train_test_split, so copies of one row can end up
# on both sides of the split -- consider splitting first and resampling only the training part.
df_minority_upsampled = resample(df_minority, 
                                replace=True,
                                n_samples=majority_count,
                                random_state=13)

In [124]:
# Stack the untouched majority rows on top of the upsampled minority rows.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [125]:
# Rebuild X / y from the class-balanced frame (missing feature values -> 0).
X, y = choose_target(df=df_upsampled, target=target, fill_na='0')

In [126]:
# Convert every cell to a Python complex number and keep only its magnitude (absolute value).
X = X.applymap(lambda x: abs(complex(x)))

In [157]:
# Hold out a test split. The split is seeded so results can be reproduced after a kernel
# restart, and stratified so both parts keep the same positive/negative ratio.
# NOTE(review): if X was built from the upsampled frame, duplicated minority rows can still
# land in both parts; splitting before resampling would avoid that.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=13,
)

In [148]:
X_train.shape

(460, 12900)

In [None]:
# Standardise features: the scaler is fitted on the training split only and then applied
# unchanged to the test split. X_train / X_test are rebound to the scaler's output.
# NOTE(review): the PCA cell further down is fit on `X`, not on these scaled arrays -- confirm
# whether scaling is meant to feed into it.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## PCA

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
# Project the features onto 5000 principal components (random_state fixed for repeatability).
# NOTE(review): the PCA is fit on `X` (whatever that currently holds in the kernel), yet the
# result is named X_train_pca and is later paired with both `y` and `y_train`. Confirm which
# rows this is meant to cover; fit on the training split only if a held-out evaluation is intended.
pca = PCA(n_components=5000, random_state=77)
X_train_pca = pca.fit_transform(X)

In [12]:
# Cache the PCA-transformed feature matrix on disk.
dump(X_train_pca, 'data/X_train_pca.joblib')

['data/X_train_pca.joblib']

In [13]:
# Persist the fitted PCA object; a later cell loads it back from this same path.
dump(pca, 'models/pca.joblib')

['models/pca.joblib']

In [180]:
# X_test_pca = pca.transform(X_test)

In [14]:
# Scoring helper: evaluation metrics for an already-fitted classifier

def score_fitted_model(model, X_test, y_test):
    """
    Evaluate an already-fitted binary classifier on held-out data.

    Only ``model.predict`` is called. The previous version also called
    ``predict_proba`` and discarded the result, which doubled inference time
    and failed on estimators that do not expose ``predict_proba``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``.
    X_test : array-like
        Feature matrix handed to ``model.predict``.
    y_test : array-like
        True binary labels, aligned with the rows of ``X_test``.

    Returns
    -------
    dict
        ``'model'`` (estimator class name), ``'accuracy'``, ``'precision'``,
        ``'recall'``, ``'f1'`` and ``'conf_mat'`` (output of
        ``confusion_matrix(y_test, predictions)``).
    """
    y_preds = model.predict(X_test)
    return {
        'model': model.__class__.__name__,
        'accuracy': accuracy_score(y_test, y_preds),
        'precision': precision_score(y_test, y_preds),
        'recall': recall_score(y_test, y_preds),
        'f1': f1_score(y_test, y_preds),
        'conf_mat': confusion_matrix(y_test, y_preds),
    }

## Gradient Boosting

In [16]:
# Gradient-boosting settings collected in one place, then unpacked into the estimator.
gbc_params = {
    'learning_rate': 0.2,
    'n_estimators': 100,
    'max_features': 1000,
    'random_state': 8,
    'verbose': 1,
}
gbc = GradientBoostingClassifier(**gbc_params)
# Fit on the PCA features; with verbose=1 the training loss is printed per iteration.
gbc.fit(X_train_pca, y)

      Iter       Train Loss   Remaining Time 
         1           0.1011           13.60m
         2           0.0994           13.14m
         3           0.0943           12.92m
         4           0.0925           12.83m
         5           0.0911           13.25m
         6           0.0896           14.36m
         7           0.0875           14.29m
         8           0.0820           14.55m
         9           0.0806           14.39m
        10           0.0794           14.20m
        20           0.0676           12.22m
        30           0.0638           11.19m
        40           0.0531            9.75m
        50           0.0505            8.12m
        60           0.0477            6.76m
        70           0.0441            5.10m
        80           0.0405            3.48m
        90           0.0389            1.78m
       100           0.0360            0.00s


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=3,
                           max_features=1000, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=8, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=1,
                           warm_start=False)

In [17]:
# Score the gradient-boosting model on PCA-transformed test features.
# NOTE(review): X_test_pca is only assigned in a cell further down, so this cell fails when the
# notebook is run top to bottom (the saved output is a NameError).
score_fitted_model(gbc, X_test_pca, y_test)

NameError: name 'X_test_pca' is not defined

In [None]:
# Persist the fitted gradient-boosting classifier.
dump(gbc, 'models/gdb_clf.joblib')

## Random Forest

In [196]:
# Random-forest settings collected in one place, then unpacked into the estimator.
rfc_params = {
    'n_estimators': 500,
    'max_depth': 8,
    'min_samples_leaf': 8,
    'min_samples_split': 10,
    'verbose': 1,
    'random_state': 419,
}
rfc = RandomForestClassifier(**rfc_params)

In [197]:
# Train the random forest on the PCA features.
# NOTE(review): X_train_pca is produced from `X` in the PCA cell while y_train comes from
# train_test_split -- verify the two have the same number of rows before re-running.
rfc.fit(X_train_pca, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  5.9min finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=419,
                       verbose=1, warm_start=False)

In [198]:
# Evaluate the random forest with the shared scoring helper.
# NOTE(review): the saved confusion matrix for this cell has an all-zero predicted-positive
# column, i.e. the high accuracy comes from always predicting the negative class.
score_fitted_model(rfc, X_test_pca, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
  _warn_prf(average, modifier, msg_start, len(result))


{'model': 'RandomForestClassifier',
 'accuracy': 0.987920273807127,
 'precision': 0.0,
 'recall': 0.0,
 'f1': 0.0,
 'conf_mat': array([[4907,    0],
        [  60,    0]])}

In [200]:
# Persist the fitted random forest.
dump(rfc, 'models/rf_clf.joblib')

['models/rf_clf.joblib']

## Logistic Regression

In [201]:
# Logistic regression with the library's default settings.
# NOTE(review): the name `log` is easy to mistake for a logarithm function; a more specific
# name would read better.
log = LogisticRegression()

In [202]:
# Fit the logistic regression on the same PCA features and training labels as the random forest.
log.fit(X_train_pca, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [203]:
# Evaluate the logistic regression with the shared scoring helper.
# NOTE(review): the saved confusion matrix shows almost every row predicted positive.
score_fitted_model(log, X_test_pca, y_test)

{'model': 'LogisticRegression',
 'accuracy': 0.012281054962754178,
 'precision': 0.012082158679017317,
 'recall': 1.0,
 'f1': 0.023875845602865098,
 'conf_mat': array([[   1, 4906],
        [   0,   60]])}

In [204]:
# Persist the fitted logistic regression.
dump(log, 'models/log_reg.joblib')

['models/log_reg.joblib']

## Downsampling Majority Class

*To be continued: downsampling of the majority class has not been implemented yet.*

## Testing on a (balanced) subset of test data 

In [29]:
# Read the test table from a zipped CSV, using the file's first column as the row index.
# (pandas is expected to infer the compression from the .zip extension.)
test_df = pd.read_csv('data/test_windows_mel.zip', index_col=0)

In [19]:
test_df.shape

(20, 12901)

In [30]:
# Binary "Purr vs. rest" features/labels for the test table; this rebinds X_test / y_test.
X_test, y_test = choose_target(df=test_df, target='Purr', fill_na='0')

In [31]:
# Convert every cell to a Python complex number and keep only its magnitude (absolute value),
# mirroring the conversion applied to the training features.
X_test = X_test.applymap(lambda x: abs(complex(x)))
# Preview the first rows of the converted test features.
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12890,12891,12892,12893,12894,12895,12896,12897,12898,12899
340da762.wav,1.851779,1.926661,1.790155,1.772461,1.830766,1.76303,1.907218,1.906484,1.847534,1.839827,...,1.564205,1.564205,1.564205,1.564205,1.564205,1.564205,1.564205,1.564205,1.564205,1.564205
340da762.wav,1.326589,1.031765,0.801493,1.126498,1.213463,1.165523,0.780377,1.052408,1.013382,1.098081,...,1.570098,1.570098,1.570098,1.570098,1.570098,1.570098,1.570098,1.570098,1.570098,1.570098
340da762.wav,0.818254,0.852426,1.177918,1.010415,1.08931,1.367235,0.980664,1.284388,1.046358,1.332898,...,1.604056,1.604056,1.604056,1.604056,1.604056,1.604056,1.604056,1.604056,1.604056,1.604056
340da762.wav,1.310807,1.412962,1.514875,1.423629,1.046401,1.455149,1.412662,1.08644,1.268236,1.104407,...,1.635082,1.635082,1.635082,1.635082,1.635082,1.635082,1.635082,1.635082,1.635082,1.635082
2d21dbf4.wav,1.406415,1.221941,1.312833,1.547531,1.440355,1.460948,1.345249,1.528509,1.510984,1.565138,...,1.599996,1.599996,1.599996,1.599996,1.599996,1.599996,1.599996,1.599996,1.599996,1.599996


In [10]:
# Reload the PCA object that an earlier cell dumped to this path.
# NOTE(review): the saved output is a FileNotFoundError -- presumably the relative path depends on
# the working directory and on the dump cell having been run first; confirm.
pca = load('models/pca.joblib')

FileNotFoundError: [Errno 2] No such file or directory: 'models/pca.joblib'

In [32]:
# Apply the already-fitted PCA to the test features (transform only, no re-fitting).
X_test_pca = pca.transform(X_test)

In [27]:
# Load the persisted gradient-boosting classifier (same path the dump cell writes to).
gbc = load('models/gdb_clf.joblib')

In [33]:
# Score the reloaded gradient-boosting model on the PCA-transformed test subset.
score_fitted_model(gbc, X_test_pca, y_test)

{'model': 'GradientBoostingClassifier',
 'accuracy': 0.5173267326732673,
 'precision': 0.52,
 'recall': 0.0663265306122449,
 'f1': 0.11764705882352942,
 'conf_mat': array([[196,  12],
        [183,  13]])}