In [169]:
import glob
import pandas as pd
import numpy as np
import re
import json
import pickle
from matplotlib import pylab as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score, make_scorer
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Folder (relative to the notebook's working directory) holding the chime
# analysis files. The trailing slash matters: the cells that load data build
# their paths by plain string interpolation, not by joining path components.
CURRENT_FOLDER = 'analysis/chime/'

In [3]:
# Path of the chime CSV (presumably MFCC features, judging by the file name),
# built by appending the file name to CURRENT_FOLDER.
chime_mfcc_filename = f'{CURRENT_FOLDER}chime_mfcc.csv'
# Read with pandas defaults: no dtype, index or parsing options are supplied.
raw_df = pd.read_csv(chime_mfcc_filename)

In [4]:
# Pickled container with the prepared data; other cells of this notebook index
# it with the keys 'X_train', 'X' and 'y'.
tvt_filename = f'{CURRENT_FOLDER}train_val_test.pkl'
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load this artifact if it was produced by a trusted step of this project.
with open(tvt_filename, 'rb') as readfile:
    train_val_test = pickle.load(readfile)

In [29]:
# Compare how many principal components are retained for different amounts of
# explained variance on the standardized training features.
scaler = StandardScaler()
xtrain = scaler.fit_transform(train_val_test['X_train'])

pca = PCA()         # n_components=None: keep all components
pca_50 = PCA(0.5)   # float in (0, 1): keep enough components for 50% of the variance
pca_90 = PCA(0.9)   # same, for 90% of the variance

# Only the fitted state (n_components_) of these two models is inspected
# afterwards, so fit() is sufficient; fit_transform() additionally built a
# projected matrix that was immediately thrown away.
pca.fit(xtrain)
pca_50.fit(xtrain)

# Last expression of the cell: the 90%-variance projection is what gets displayed.
pca_90.fit_transform(xtrain)

array([[-34.44255228,  -7.85984383, -14.26408728, ...,  -2.19877924,
         -1.05359889,  -0.79442866],
       [ 42.67418778,   1.22087045,   6.64160394, ...,   2.22967752,
         -0.687279  ,  -3.69279056],
       [-14.63477139,   7.11055933,   5.78524565, ...,  -0.44239342,
         -0.55647615,   1.29830201],
       ...,
       [  2.33764124,  10.46461175,  -8.31714126, ...,   2.90918809,
         -0.97938499,   0.4724944 ],
       [  0.24940182,  21.07588185,   6.53337924, ...,   0.74185434,
         -0.05362823,   0.66528404],
       [  2.24185271,  -3.67147669,  -5.24241008, ...,  -1.88942063,
         -1.93428452,  -0.51426086]])

In [32]:
# Report how many components each fitted PCA kept. All three lines carry the
# same 'pca' label, so the order is what identifies them:
# all components, 50% explained variance, 90% explained variance.
for fitted_pca in (pca, pca_50, pca_90):
    print('pca', fitted_pca.n_components_)

pca 1244
pca 74
pca 318


In [35]:
# Toy records for experimenting with sorting a list of dicts by one of its keys.
test = [{'id': ident, 'v': value} for ident, value in ((1, 5), (2, 2), (3, 3))]

In [37]:
# Order the records by their 'v' field, largest first. sorted() builds a new
# list, so `test` itself keeps its original order.
sorted(
    test,
    key=lambda record: record['v'],
    reverse=True,
)

[{'id': 1, 'v': 5}, {'id': 3, 'v': 3}, {'id': 2, 'v': 2}]

In [None]:
### Verifying that the manual, pipeline, and pipeline+grid approaches produce the same scores and coefficients
The following code blocks use the Iris data set to verify that the pipeline and the grid search scale the X data correctly.


In [5]:
from sklearn.datasets import load_iris

# Small reference data set used as a sanity check for the scaling experiments.
iris = load_iris()
# Feature matrix as a DataFrame whose columns carry the data set's feature names.
X = pd.DataFrame(data=iris.data, columns=iris['feature_names'])
# Target labels.
y = iris.target

In [22]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [129]:
# Manual baseline: standardize by hand, then fit logistic regression for
# several regularization strengths C.
scaler = StandardScaler()
# Scaling statistics are learned from the training rows only and then reused
# for the test rows, so nothing about the test set leaks into the fit.
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

# One loop replaces three copy-pasted stanzas. The printed text is unchanged:
# the first header has no leading blank line, the later ones do.
for position, c in enumerate([1, 100, 10000]):
    lr = LogisticRegression(solver='lbfgs', multi_class='auto', C=c, max_iter=1000)
    lr.fit(X_train_trans, y_train)
    y_pred = lr.predict(X_test_trans)
    print(f'C={c}' if position == 0 else f'\nC={c}')
    print('train score: ', lr.score(X_train_trans, y_train))
    print('test score: ', lr.score(X_test_trans, y_test))
    print('coefs\n', lr.coef_)
    print('f1', f1_score(y_test, y_pred, average='weighted'))



C=1
train score:  0.9666666666666667
test score:  0.9833333333333333
coefs
 [[-0.9228266   0.96518343 -1.66166807 -1.53042151]
 [ 0.49665646 -0.33559257 -0.24038208 -0.69282691]
 [ 0.42617014 -0.62959086  1.90205015  2.22324843]]
f1 0.9833089133089132

C=100
train score:  0.9888888888888889
test score:  0.9833333333333333
coefs
 [[-2.25319202  2.71189877 -6.01187126 -5.75347406]
 [ 1.78171561  0.1138654  -2.50975151 -3.61072626]
 [ 0.47147641 -2.82576417  8.52162277  9.36420032]]
f1 0.9833089133089132

C=10000
train score:  0.9888888888888889
test score:  0.9833333333333333
coefs
 [[ -6.54776542   7.64408805 -21.94565854 -21.4096363 ]
 [  6.08743213   1.4225854  -15.08227974 -20.76259704]
 [  0.46033329  -9.06667346  37.02793829  42.17223334]]
f1 0.9833089133089132


0.9833089133089132

In [121]:
# Scaler and classifier chained in one estimator: calling fit() on the
# pipeline fits the scaler on the data it receives and then the classifier on
# the scaled result, so the raw (unscaled) frames are passed in below.
lr_pipeline = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('logreg', LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)),
])

for c in (1, 100, 10000):
    # Step parameters are addressed as <step name>__<parameter name>.
    lr_pipeline.set_params(logreg__C=c)
    lr_pipeline.fit(X_train, y_train)

    fitted_logreg = lr_pipeline.named_steps['logreg']
    print(f'\nC: {c}')
    print('train score: ', lr_pipeline.score(X_train, y_train))
    print('test score: ', lr_pipeline.score(X_test, y_test))
    print('coefs\n', fitted_logreg.coef_)
    print('f1', f1_score(y_test, lr_pipeline.predict(X_test), average='weighted'))



C: 1
train score:  0.9666666666666667
test score:  0.9833333333333333
coefs
 [[-0.9228266   0.96518343 -1.66166807 -1.53042151]
 [ 0.49665646 -0.33559257 -0.24038208 -0.69282691]
 [ 0.42617014 -0.62959086  1.90205015  2.22324843]]
f1 0.9833089133089132

C: 100
train score:  0.9888888888888889
test score:  0.9833333333333333
coefs
 [[-2.25319202  2.71189877 -6.01187126 -5.75347406]
 [ 1.78171561  0.1138654  -2.50975151 -3.61072626]
 [ 0.47147641 -2.82576417  8.52162277  9.36420032]]
f1 0.9833089133089132

C: 10000
train score:  0.9888888888888889
test score:  0.9833333333333333
coefs
 [[ -6.54776542   7.64408805 -21.94565854 -21.4096363 ]
 [  6.08743213   1.4225854  -15.08227974 -20.76259704]
 [  0.46033329  -9.06667346  37.02793829  42.17223334]]
f1 0.9833089133089132


In [146]:
# Grid search over C with the scaler inside the pipeline, so the scaler is
# re-fitted together with the classifier on each training portion the search uses.
lr_grid_pipeline = Pipeline([
    ('standard_scaler', StandardScaler()),
    # max_iter matches the manual and pipeline cells (1000) so that all three
    # approaches being compared use identically configured solvers.
    ('logreg', LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000))
])
# Candidate regularization strengths, addressed as <step name>__<parameter name>.
params = {
    'logreg__C': [0.1, 0.5, 1, 100, 10000]
}
# Weighted F1 wrapped as a scorer, i.e. a callable taking (estimator, X, y).
f1_metric = make_scorer(f1_score, greater_is_better=True, average='weighted')
grid = GridSearchCV(lr_grid_pipeline, param_grid=params, cv=5, scoring=f1_metric)
grid.fit(X_train, y_train)
# NOTE: grid.score() applies the `scoring` callable, so both "score" lines
# below report weighted F1 -- not accuracy, which is what estimator.score()
# returns for a classifier or pipeline.
print('train score: ', grid.score(X_train, y_train))
print('test score: ', grid.score(X_test, y_test))
print('grid coefs\n', grid.best_estimator_.named_steps['logreg'].coef_)
print('best params: ', grid.best_params_)

train score:  0.9333333333333333
test score:  0.9833089133089132
grid coefs
 [[-0.76463531  0.76649571 -1.28672085 -1.18318996]
 [ 0.36415486 -0.3883894  -0.09370145 -0.49884388]
 [ 0.40048045 -0.37810632  1.3804223   1.68203384]]
best params:  {'logreg__C': 0.5}




In [166]:
# how to use scorer
# A scorer built with make_scorer is called as scorer(estimator, X, y): it
# obtains the estimator's predictions for X and scores them against y.
# NOTE(review): lr_pipeline holds whatever C was fitted last in the cell that
# defines it -- confirm that is the model intended here.
f1_metric(lr_pipeline, X_test, y_test)

0.9833089133089132

In [135]:
# Scale -> PCA -> logistic regression, chained into a single estimator.
# PCA() is left at its defaults here (n_components=None).
pca_pipeline = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('pca', PCA()),
    ('logreg', LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)),
])

In [136]:
# Inspect the pipeline's parameters: the steps themselves plus each step's
# settings -- useful for finding the names to use in set_params / param grids.
pca_pipeline.get_params()

{'memory': None,
 'steps': [('standard_scaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('pca',
   PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
       svd_solver='auto', tol=0.0, whiten=False)),
  ('logreg',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=1000,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'standard_scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pca': PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False),
 'logreg': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
    

In [148]:
def hello(name, **kwargs):
    """Build and return a zero-argument function that returns ``name``.

    If the caller supplies a ``myvar`` keyword, one line consisting of
    ``'hi'``, the name and the ``myvar`` value is printed first. Any other
    keyword arguments are accepted and ignored.
    """

    def hi():
        # Closure: remembers the ``name`` this call of hello() received.
        return name

    # Guard clause: without ``myvar`` there is nothing to print.
    if 'myvar' not in kwargs:
        return hi

    print('hi', hi(), kwargs['myvar'])
    return hi
        

def test(name, **kwargs):
    """Thin pass-through to ``hello``: forwards ``name`` and every keyword
    argument unchanged and returns whatever ``hello`` returns."""
    result = hello(name, **kwargs)
    return result

# 'myvar' travels through test() into hello() via **kwargs, which triggers the print.
t = test('me', myvar='no')
# Calling the returned closure yields the captured name.
t()

hi me no


'me'

In [151]:
# Small integer Series with a repeated maximum value, used as scratch data.
a = pd.Series([1, 2, 3, 4, 5, 10, 10])
# Bare last expression so that running the cell actually renders the Series.
a

0     1
1     2
2     3
3     4
4     5
5    10
6    10
dtype: int64

In [159]:
# understanding stratified vs non_stratified train split
# NOTE(review): X and y are rebound here, replacing the iris data assigned to
# the same names earlier in the notebook; earlier cells that read X / y will
# pick up this data instead if they are re-run after this one.
X = train_val_test['X']
y = train_val_test['y']
# Number of random splits to draw per strategy.
n = 100


def split(X, y, n=100, test_size=0.2, **kwargs):
    """Repeat a random train/test split and record how the label mass divides.

    For each of ``n`` calls to ``train_test_split`` the ratio
    ``sum(y_test) / sum(y_train)`` is computed; for 0/1 labels that is the
    number of positive test labels relative to positive training labels.

    Parameters
    ----------
    X, y : array-likes accepted by ``train_test_split``
        Features and labels. ``y`` must be numeric for the sums to be meaningful.
    n : int, default 100
        Number of splits to draw.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default reproduces the 20%
        hold-out that used to be hard-coded.
    **kwargs
        Further ``train_test_split`` options (e.g. ``stratify=y``). Note that
        a fixed ``random_state`` makes every repetition produce the same split.

    Returns
    -------
    pandas.Series
        One ratio per repetition, in draw order. If the training labels of a
        split sum to zero, that entry is inf/nan rather than a finite ratio.
    """
    y_ratio = []
    for _ in range(n):
        # Only the label halves are needed; the feature halves are discarded.
        _, _, y_train, y_test = train_test_split(X, y, test_size=test_size, **kwargs)
        # np.sum is vectorized, unlike the builtin sum which iterates in Python.
        y_ratio.append(np.sum(y_test) / np.sum(y_train))

    return pd.Series(y_ratio)

# Ratio distribution over n plain random splits ...
split_default = split(X, y, n=n)
# ... and over n splits stratified on the labels.
split_strat = split(X, y, n=n, stratify=y)

In [160]:
# Summary statistics of the sum(y_test)/sum(y_train) ratio for purely random splits.
split_default.describe()

count    100.000000
mean       0.250223
std        0.010063
min        0.229186
25%        0.241966
50%        0.250238
75%        0.257416
max        0.272023
dtype: float64

In [161]:
# Same summary for the stratified splits: here the ratio is the same in every
# repetition (the recorded std is at floating-point noise level).
split_strat.describe()

count    1.000000e+02
mean     2.502379e-01
std      5.579081e-17
min      2.502379e-01
25%      2.502379e-01
50%      2.502379e-01
75%      2.502379e-01
max      2.502379e-01
dtype: float64

In [168]:
# xg classifier
# XGBoost classifier created with the library's default hyperparameters.
xgb = XGBClassifier()

In [170]:
# Random forest with 100 trees.
# NOTE(review): no random_state is set, so fitted results will differ between runs.
rand = RandomForestClassifier(n_estimators=100)