# VAD Training

In [41]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from scipy.io import wavfile

import seaborn as sns
sns.set()
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [23, 10]
plt.rcParams['axes.axisbelow'] = True
from statsmodels.graphics.mosaicplot import mosaic

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBoostClassifier

from texttable import Texttable
from pdpbox import pdp, get_dataset, info_plots
from IPython.core.display import display

ModuleNotFoundError: No module named 'xgboost'

In [20]:
df = pd.read_csv('/Users/ekervella/Dropbox/GitHub/vad/vad_data/csv/103-1240-0001.csv')
df.set_index('timestamp', inplace=True)
df.shape

(255120, 2)

In [21]:
df.head()

Unnamed: 0_level_0,value,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,-192,0
6.3e-05,-146,0
0.000125,-220,0
0.000188,-267,0
0.00025,-377,0


In [22]:
print('Percentage of voice activity: {}%'.format(round(df['target'].sum()/df.shape[0]*100, 2)))

Percentage of voice activity: 87.87%


In [23]:
X = df[['value']]
y = df['target']
X_train, X_test, y_train, y_test= train_test_split(X, y, shuffle=True, test_size=0.25)

In [24]:
def precision(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return(tp/(tp+fp))

def recall(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return(tp/(tp+fn))

def accuracy(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return((tp+tn)/(tn+fp+fn+tp))

def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return(2*p*r/(p+r))

In [25]:
def baseline_predictor(X):
    return([1]*X.shape[0])

In [37]:
print('--- Baseline Predictor ---')
table = Texttable()
rows = []
rows.append(['', 'Precision', 'Recall', 'Accuracy', 'F1 Score'])
rows.append(['On Train (Over Resampled)', round(precision(y_train_b, baseline_predictor(X_train_b)), 3), 
             round(recall(y_train_b, baseline_predictor(X_train_b)), 3), round(accuracy(y_train_b, baseline_predictor(X_train_b)), 3),
             round(f1_score(y_train_b, baseline_predictor(X_train_b)), 3)])
rows.append(['On Test (Imbalanced)', round(precision(y_test, baseline_predictor(X_test)), 3), round(recall(y_test, baseline_predictor(X_test)), 3), 
             round(accuracy(y_test, baseline_predictor(X_test)), 3), round(f1_score(y_test, baseline_predictor(X_test)), 3)])

table.add_rows(rows)
table.set_cols_width([30, 12, 12, 12, 12])
print(table.draw())

--- Baseline Predictor ---
+--------------------------------+--------------+--------------+--------------+--------------+
|                                |  Precision   |    Recall    |   Accuracy   |   F1 Score   |
| On Train (Over Resampled)      | 0.500        | 1            | 0.500        | 0.667        |
+--------------------------------+--------------+--------------+--------------+--------------+
| On Test (Imbalanced)           | 0.879        | 1            | 0.879        | 0.936        |
+--------------------------------+--------------+--------------+--------------+--------------+


In [14]:
resampler = RandomOverSampler(random_state=7)
X_train_b, y_train_b = resampler.fit_sample(X_train, y_train)

In [32]:
logreg = LogisticRegression(random_state=7)
logreg.fit(X_train_b, y_train_b)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=7, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [38]:
tree = DecisionTreeClassifier(random_state=7)
tree.fit(X_train_b, y_train_b)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=7,
            splitter='best')

In [39]:
rf = RandomForestClassifier(random_state=7)
rf.fit(X_train_b, y_train_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [None]:
rf = RandomForestClassifier(random_state=7)
rf.fit(X_train_b, y_train_b)

In [31]:
print('--- Logistic Regression ---')
model = logreg
table = Texttable()
rows = []
rows.append(['', 'Precision', 'Recall', 'Accuracy', 'F1 Score'])
rows.append(['On Train (Over Resampled)', round(precision(y_train_b, model.predict(X_train_b)), 3), 
             round(recall(y_train_b, model.predict(X_train_b)), 3), round(accuracy(y_train_b, model.predict(X_train_b)), 3),
             round(f1_score(y_train_b, model.predict(X_train_b)), 3)])
rows.append(['On Test (Imbalanced)', round(precision(y_test, model.predict(X_test)), 3), round(recall(y_test, model.predict(X_test)), 3), 
             round(accuracy(y_test, model.predict(X_test)), 3), round(f1_score(y_test, model.predict(X_test)), 3)])

table.add_rows(rows)
table.set_cols_width([30, 12, 12, 12, 12])
print(table.draw())

--- Logistic Regression ---
+--------------------------------+--------------+--------------+--------------+--------------+
|                                |  Precision   |    Recall    |   Accuracy   |   F1 Score   |
| On Train (Over Resampled)      | 0.494        | 0.471        | 0.494        | 0.482        |
+--------------------------------+--------------+--------------+--------------+--------------+
| On Test (Imbalanced)           | 0.878        | 0.470        | 0.477        | 0.613        |
+--------------------------------+--------------+--------------+--------------+--------------+
