In [28]:
from __future__ import print_function, division, absolute_import
from sklearn.metrics import mean_squared_error, classification_report
from scipy.io import wavfile
from glob import glob
from tqdm import tqdm

import os
import sys
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import noisereduce as nr
import IPython.display as ipd
import matplotlib.pyplot as plt
import xgboost as xgb

In [2]:
# Set directory for source file, verify if we have collected all the necessary data
data_dir ='./ml-fmi-23-2020'
audio_files_train = glob(data_dir + '/train/train/*.wav')
audio_files_validation = glob(data_dir + '/validation/validation/*.wav')
audio_files_test = glob(data_dir + '/test/test/*.wav')

count_labels_mask_train, count_labels_mask_validation = 0, 0

train_labels = []
try:
    train_data = open(data_dir + '/train.txt', "r")
    for line in train_data:
        filename, label = line.split(",")
        index_file, extension = filename.split(".")
        label = int(label[0])
        
        if label == 1: # we want to see how data is distributed
            count_labels_mask_train += 1
        
        train_labels.append([index_file, label])
except OSError as err:
    print("OS error: {}".format(err))
except:
    print("Unexpected error: ", sys.exc_info()[0])
finally:
    train_data.close()

train_labels.sort(key = lambda t:t[0])


validation_labels = []
try:
    validation_data = open(data_dir + '/validation.txt', "r")
    for line in validation_data:
        filename, label = line.split(",")
        index_file, extension = filename.split(".")
        label = int(label[0])
        
        if label == 1:
            count_labels_mask_validation += 1
        
        validation_labels.append([index_file, label])
except OSError as err:
    print("OS error: {}".format(err))
except:
    print("Unexpected error: ", sys.exc_info()[0])
finally:
    validation_data.close()

validation_labels.sort(key = lambda t:t[0])


&emsp; First step consist of visualizing the data, in order to identify audio properties that need preprocessing to ensure consistency across the whole dataset.

In [3]:
# Number of audio files: training + validation
print("Number of train files:", len(audio_files_train))
print("Number of train files labeled with 1:", count_labels_mask_train)
print("Number of validation files:", len(audio_files_validation))
print("Number of validation files labeled with 1:", count_labels_mask_validation)

Number of train files: 8000
Number of train files labeled with 1: 4076
Number of validation files: 1000
Number of validation files labeled with 1: 528


In [4]:
#Extract some basic info about the file
info = sf.info(audio_files_train[42])
print(info)

./ml-fmi-23-2020/train/train\100043.wav
samplerate: 16000 Hz
channels: 1
duration: 16000 samples
format: WAV (Microsoft) [WAV]
subtype: Signed 16 bit PCM [PCM_16]


In [5]:
def clean_mute_sound(audio, sample_rate, threshold):
    audio = pd.Series(audio)
    audio = audio.apply(np.abs)
    audio = audio.rolling(window=int(sample_rate/32), min_periods=1, center=True)
    audio = audio.mean()
    
    envelope = []
    for audio_window in audio:
        if audio_window > threshold:
            envelope.append(True)
        else:
            envelope.append(False)
            
    return envelope

In [6]:
def reduce_noise(audio):
    audio_noise_reduced = nr.reduce_noise(audio_clip = audio, noise_clip = audio, prop_decrease=0.28, verbose=False, pad_clipping = True)
    return audio_noise_reduced

&emsp; Extracting features:

In [7]:
def extract_features(audio_files, index_audio_file):
    audio, sample_rate = librosa.load(audio_files[index_audio_file]) #it returns an audio time series and the sampling rate
    
    audio = reduce_noise(audio)
    
    mask = clean_mute_sound(audio, sample_rate, 0.0041)
    audio = audio[mask]
    
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=80)
    mfcc_scaled = np.mean(mfccs.T, axis = 0)
    
    return mfcc_scaled  

In [8]:
# Perform the reading through all the 'train' audio files and extract the features(in this case mffc spectogram)
features = []
for index_audio_file in tqdm(range(0, len(audio_files_train))):
    data = extract_features(audio_files_train, index_audio_file)
    
    features.append([data, train_labels[index_audio_file][1]])
    
# Convert into a Panda dataframe
features_df = pd.DataFrame(features, columns=['feature','label'])

100%|██████████████████████████████████████████████████████████████████████████████| 8000/8000 [08:53<00:00, 14.98it/s]


In [9]:
# Perform the reading through all the 'validation' audio files and extract the features
features = []
for index_audio_file in tqdm(range(0, len(audio_files_validation))):
    data = extract_features(audio_files_validation, index_audio_file)
    
    features.append([data, validation_labels[index_audio_file][1]])
    
features_df_v = pd.DataFrame(features, columns=['feature','label'])

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:08<00:00, 14.66it/s]


In [30]:
X_train = np.array(features_df.feature.tolist())
y_train = np.array(features_df.label.tolist())

X_validation = np.array(features_df_v.feature.tolist())
y_validation = np.array(features_df_v.label.tolist())

In [31]:
xgb_model = xgb.XGBClassifier(max_depth = 6, 
                              min_child_weight = 5,
                              n_estimators=500, 
                              learning_rate=0.0995, 
                              eval_metric="rmse", 
                              subsample = 0.8,
                              colsample_bytree = 0.8,
                              early_stopping = 10)

In [32]:
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, early_stopping=10,
              eval_metric='rmse', gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.0995,
              max_delta_step=0, max_depth=6, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=500, n_jobs=0,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method=None, validate_parameters=False, verbosity=None)

In [33]:
y_predicted = xgb_model.predict(X_validation)

In [34]:
rmse = np.sqrt(mean_squared_error(y_validation, y_predicted))
print("RMSE: %f" %(rmse))

RMSE: 0.462601


In [35]:
tp = len([y for i, y in enumerate(y_validation) if y == 1 and y_predicted[i] == 1])
tn = len([y for i, y in enumerate(y_validation) if y == 0 and y_predicted[i] == 0])
fp = len([y for i, y in enumerate(y_validation) if y == 0 and y_predicted[i] == 1])
fn = len([y for i, y in enumerate(y_validation) if y == 1 and y_predicted[i] == 0])

In [36]:
print('accuracy: %.5f%%' % ((tp + tn) / (tp + tn + fp + fn) * 100))
print('recall: %.5f%%' %(tp / (tp + fn) * 100))
print('specificity: %.5f%%' %(tn / (tn + fp) * 100))
print('precision: %.5f%%' % (tp / (tp + fp) *100))
print('fpr: %.5f%%' %(fp / (fp + tn) * 100))

accuracy: 78.60000%
recall: 80.49242%
specificity: 76.48305%
precision: 79.29104%
fpr: 23.51695%


In [37]:
classes = ['with mask', 'without mask']
print(classification_report(y_validation, y_predicted, target_names = classes))

              precision    recall  f1-score   support

   with mask       0.78      0.76      0.77       472
without mask       0.79      0.80      0.80       528

    accuracy                           0.79      1000
   macro avg       0.79      0.78      0.79      1000
weighted avg       0.79      0.79      0.79      1000



In [27]:
confusion_matrix = [[tn,fp],[fn,tp]]
print("Confusion matrix:\n")
for i in range(len(confusion_matrix)):
    print(confusion_matrix[i])

Confusion matrix:

[359, 113]
[115, 413]


In [18]:
X_train = np.concatenate((X_train, X_validation))
y_train = np.concatenate((y_train, y_validation))
print("Test + validation: ", len(X_train))

data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

Test + validation:  9000


In [454]:
xgb_params = {'base_score':0.5, 'colsample_bylevel':1,
              'colsample_bynode':1, 'colsample_bytree':1,
              'learning_rate':0.0995, 'max_delta_step':0, 'max_depth':3,
              'min_child_weight':1, 'n_estimators':500, 'early_stopping':10,
              'objective':'binary:logistic', 'random_state':0, 'reg_alpha':0,
              'reg_lambda':1, 'scale_pos_weight':1, 'subsample':1}

In [455]:
#Grid Search in order to find the best params for max_depth and min_childs_weights => tune them together in order to find 
#a good tradeof between bias and variance
gridsearch_params_md_mcw = [
    (max_depth, min_child_weight)
    for max_depth in range(3,7)
    for min_child_weight in range(1,6)
]

In [456]:
min_rmse = float("Inf")
best_params = None

for max_depth, min_child_weight in gridsearch_params_md_mcw:
    print("Cross validation with max_depth={}, min_child_weight={}".format(max_depth, min_child_weight))
    
    # update parameters
    xgb_params['max_depth'] = max_depth
    xgb_params['min_child_weight'] = min_child_weight
    
    # cross validation
    cv_results = xgb.cv(dtrain = data_dmatrix, params = xgb_params, nfold=3, 
                        num_boost_round=501, metrics="rmse", as_pandas=True, seed=23)

    
    # Update best Root Mean Squared Error
    rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(rmse, boost_rounds))
    if rmse < min_rmse:
        min_rmse = rmse
        best_params = (max_depth,min_child_weight)
        
print("Best params: {}, {}, : {}".format(best_params[0], best_params[1], min_rmse))

#Update parameters:
xgb_params['max_depth'] = best_params[0]
xgb_params['min_child_weight'] = best_params[1]

Cross validation with max_depth=3, min_child_weight=1
	RMSE 0.40556666666666663 for 500 rounds
Cross validation with max_depth=3, min_child_weight=2
	RMSE 0.40529 for 500 rounds
Cross validation with max_depth=3, min_child_weight=3
	RMSE 0.40567833333333336 for 500 rounds
Cross validation with max_depth=3, min_child_weight=4
	RMSE 0.406731 for 500 rounds
Cross validation with max_depth=3, min_child_weight=5
	RMSE 0.40609433333333333 for 500 rounds
Cross validation with max_depth=4, min_child_weight=1
	RMSE 0.39546466666666663 for 499 rounds
Cross validation with max_depth=4, min_child_weight=2
	RMSE 0.393871 for 500 rounds
Cross validation with max_depth=4, min_child_weight=3
	RMSE 0.39450999999999997 for 485 rounds
Cross validation with max_depth=4, min_child_weight=4
	RMSE 0.39565233333333333 for 500 rounds
Cross validation with max_depth=4, min_child_weight=5
	RMSE 0.39463699999999996 for 499 rounds
Cross validation with max_depth=5, min_child_weight=1
	RMSE 0.3882046666666667 for 4

In [457]:
gridsearch_params_sample = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(8,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [458]:
min_rmse = float("Inf")
best_params = None

for subsample, colsample in reversed(gridsearch_params_sample):
    print("Cross validation with subsample={}, colsample={}".format(subsample, colsample))

    xgb_params['subsample'] = subsample
    xgb_params['colsample_bytree'] = colsample

    # cross validation
    cv_results = xgb.cv(dtrain = data_dmatrix, params = xgb_params, nfold=3, 
                        num_boost_round=501, metrics="rmse", as_pandas=True, seed=23)

    # Update best Root Mean Squared Error
    rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(rmse, boost_rounds))
    if rmse < min_rmse:
        min_rmse = rmse
        best_params = (subsample,colsample)
        
print("Best params: {}, {}, : {}".format(best_params[0], best_params[1], min_rmse))

#Update parameters:
xgb_params['subsample'] = subsample
xgb_params['colsample_bytree'] = colsample

Cross validation with subsample=1.0, colsample=1.0
	RMSE 0.38519233333333336 for 498 rounds
Cross validation with subsample=1.0, colsample=0.9
	RMSE 0.38478833333333334 for 499 rounds
Cross validation with subsample=1.0, colsample=0.8
	RMSE 0.38412266666666667 for 485 rounds
Cross validation with subsample=1.0, colsample=0.7
	RMSE 0.3848306666666667 for 500 rounds
Cross validation with subsample=0.9, colsample=1.0
	RMSE 0.38149166666666673 for 483 rounds
Cross validation with subsample=0.9, colsample=0.9
	RMSE 0.3836386666666667 for 490 rounds
Cross validation with subsample=0.9, colsample=0.8
	RMSE 0.38215 for 497 rounds
Cross validation with subsample=0.9, colsample=0.7
	RMSE 0.381818 for 499 rounds
Cross validation with subsample=0.8, colsample=1.0
	RMSE 0.3811653333333333 for 494 rounds
Cross validation with subsample=0.8, colsample=0.9
	RMSE 0.3817426666666666 for 485 rounds
Cross validation with subsample=0.8, colsample=0.8
	RMSE 0.38047633333333336 for 479 rounds
Cross validatio

In [468]:
xgb_model = xgb.XGBClassifier(max_depth = 6, 
                              min_child_weight = 5,
                              n_estimators=500, 
                              learning_rate=0.0995, 
                              eval_metric="rmse", 
                              subsample = 0.8,
                              colsample_bytree = 0.8,
                              early_stopping = 10)

In [469]:
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, early_stopping=10,
              eval_metric='rmse', gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.0995,
              max_delta_step=0, max_depth=6, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=500, n_jobs=0,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method=None, validate_parameters=False, verbosity=None)

In [470]:
# Perform the reading through all the 'test' audio files and extract the features(in this case mffc spectogram)
features = []
for index_audio_file in tqdm(range(0, len(audio_files_test))):
    data = extract_features(audio_files_test, index_audio_file)    
    features.append(data)
    
X_test = np.array(features)

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [03:15<00:00, 15.32it/s]


In [471]:
y_pred = xgb_model.predict(X_test)

In [472]:
# Now we will save the the index of the file with the corresponding label in order to send the submission as in the sample
submission = dict()
for i in range(0, len(audio_files_test)):
    dir_name, file_name = audio_files_test[i].split('./ml-fmi-23-2020/test/test')
    index_file, extension = file_name.split(".")
    submission[index_file[1:]] = y_pred[i]

In [473]:
test_files = []
try:
    test = open(data_dir + '/test.txt', "r")
    for line in test:
        file, extension = line.split(".")
        test_files.append(file)
except OSError as err:
    print("OS error: {}".format(err))
except:
    print("Unexpected error: ", sys.exc_info()[0])
finally:
    test.close()

In [474]:
# Create submission file
import csv

with open('submission_xgb_clean_noise_reduce_ht_fd2.csv', 'w', newline='') as fout:
    csv_writer = csv.writer(fout)
    csv_writer.writerow(['name','label'])
    for i in range(0, len(test_files)):
        csv_writer.writerow([test_files[i] + '.wav',str(submission[test_files[i]])])
