# Data Modeling

### Import Libraries

In [1]:
import pandas as pd

### Load Refined Dataset

In [2]:
# Read the principal-component table from disk into a DataFrame.
essentialdata = pd.read_csv(filepath_or_buffer='Data/PrincipalComponentData.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
essentialdata.head(n=5)

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,PCA42,PCA43,PCA44,PCA45,PCA46,PCA47,PCA48,PCA49,Music,Filename
0,-3.816012,6.572316,0.34725,0.829393,0.001834,0.81093,-1.836805,-2.008,1.262345,-0.050803,...,-1.443265,-0.239589,0.280315,1.14798,0.725233,0.549861,-0.363714,-0.101971,0,acomic2.wav
1,-3.963032,7.845675,0.78138,1.211285,-0.733665,-0.082552,-0.792448,-0.87892,0.424085,0.460887,...,-0.891295,-0.376305,0.490656,-0.065805,0.109681,0.276101,-0.359412,-0.553112,0,acomic.wav
2,5.148364,0.119405,0.155236,-3.167039,-3.202758,-0.554421,0.60048,1.951873,-0.675283,1.532033,...,0.325509,-1.531609,0.667297,0.379448,-0.788391,-0.175263,-0.809928,-0.317829,0,allison.wav
3,1.639687,0.36749,0.441764,-2.503467,0.725221,-1.860704,-0.336967,3.233178,-4.816601,-0.002228,...,0.253526,-0.314578,-0.395561,0.358883,-0.664901,0.108055,0.4165,0.213533,0,amal.wav
4,2.632253,0.906935,0.199317,-2.542599,-1.278079,-0.435696,-1.576462,1.763553,-2.774685,-1.105315,...,0.393921,-0.031789,0.556298,0.322125,0.009665,-0.541398,-0.555361,0.351066,0,austria.wav


### Separate Features and Labels

In [4]:
# Build the feature matrix from every column except the target ('Music') and
# the identifier ('Filename').
# 'Unnamed: 0' is the row index that was written into the CSV; it is dropped
# as well because it is not a measured feature and, since it simply mirrors the
# row order, it could leak label information into the models.
# errors='ignore' keeps the cell working if the CSV is ever re-exported
# without that index column.
features = (
    essentialdata
    .drop(columns=['Music', 'Filename'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .values
)

# Kept as a 2-D (n_samples, 1) array on purpose: a later cell flattens it
# after the train/test split.
labels = essentialdata.loc[:, ['Music']].values

### Perform Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Flatten the (n_samples, 1) label columns into 1-D vectors.
# ravel() is used instead of `.T[0]` so the cell is idempotent: re-running it
# on an already flat array leaves it unchanged, whereas `.T[0]` would collapse
# it to a single scalar.
y_train = y_train.ravel()
y_test = y_test.ravel()

### Evaluate The Performance of Different Models

In [7]:
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the given training data.

    Args:
        clf: Any object exposing a ``fit(X, y)`` method; it is fitted in place.
        X_train: Training feature matrix, passed through to ``clf.fit``.
        y_train: Training labels, passed through to ``clf.fit``.

    Returns:
        None. The value returned by ``clf.fit`` is discarded.
    """
    fit_args = (X_train, y_train)
    clf.fit(*fit_args)
    return None
    
def predict_labels(clf, features, target):
    """Score ``clf`` on ``features`` against the true labels ``target``.

    Args:
        clf: Fitted object exposing ``predict(features)``.
        features: Feature matrix to predict on.
        target: Ground-truth labels for ``features``.

    Returns:
        The value of ``f1_score(target, predictions)`` using f1_score's
        default settings.
    """
    return f1_score(target, clf.predict(features))


def train_predict(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` and print its F1 score on the training and test sets.

    Args:
        clf: Classifier to train; fitted in place via ``train_classifier``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        None. Results are reported through ``print`` only.
    """
    model_name = clf.__class__.__name__
    n_train = len(X_train)
    print ("Training a {} using a training set size of {}. . .".format(model_name, n_train))

    train_classifier(clf, X_train, y_train)

    # Score and report the training set before touching the test set, so the
    # prediction/print order stays train -> test.
    train_f1 = predict_labels(clf, X_train, y_train)
    print ("F1 score for training set: {:.4f}.".format(train_f1))

    test_f1 = predict_labels(clf, X_test, y_test)
    print ("F1 score for test set: {:.4f}.".format(test_f1))

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm

# Candidate models to compare.
# The decision tree and AdaBoost use randomness while fitting, so they get a
# fixed random_state; without it their scores can change on every
# Restart & Run All. The seed (42) matches the one used for the train/test split.
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(random_state=42)
clf_C = AdaBoostClassifier(random_state=42)
clf_D = svm.SVC(gamma='scale')

# Train each candidate on the same split and print its train/test F1 scores.
for candidate in (clf_A, clf_B, clf_C, clf_D):
    train_predict(candidate, X_train, y_train, X_test, y_test)

Training a GaussianNB using a training set size of 102. . .
F1 score for training set: 0.9515.
F1 score for test set: 0.9286.
Training a DecisionTreeClassifier using a training set size of 102. . .
F1 score for training set: 1.0000.
F1 score for test set: 0.8462.
Training a AdaBoostClassifier using a training set size of 102. . .
F1 score for training set: 1.0000.
F1 score for test set: 0.8667.
Training a SVC using a training set size of 102. . .
F1 score for training set: 1.0000.
F1 score for test set: 0.9655.


### Pickle Trained Models

In [9]:
import pickle

# Serialize every fitted classifier to 'PickledModels/<ClassName>'.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes; the previous bare open() calls left all four
# handles open.
# NOTE: the 'PickledModels/' directory must already exist.
# NOTE(review): pickle files can execute arbitrary code when loaded — only
# unpickle these artifacts from a trusted location.
for fitted_model in (clf_A, clf_B, clf_C, clf_D):
    with open('PickledModels/' + fitted_model.__class__.__name__, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)