# Data Modeling

### Import Libraries

In [37]:
import pandas as pd

### Import Refined Dataset

In [38]:
# Load the PCA-reduced dataset. As the preview below shows, the file holds a saved
# row index ("Unnamed: 0"), the PCA component columns, the Music label and the Filename.
essentialdata = pd.read_csv('Data/PrincipalComponentData.csv')

In [39]:
# Show the first five rows as a quick sanity check of the loaded columns.
essentialdata.head()

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,PCA42,PCA43,PCA44,PCA45,PCA46,PCA47,PCA48,PCA49,Music,Filename
0,-5.38267,-1.408019,-0.26719,1.104658,-0.055857,0.69788,0.503811,0.601028,0.603681,0.220391,...,-0.090331,-0.126543,0.222985,-0.391295,-0.04096,-0.064627,-0.165569,0.035157,0,acomic2.wav
1,-3.227412,-0.671966,-0.557932,0.16952,-0.513011,-0.485725,0.572506,-0.994831,-0.407965,-0.23318,...,0.23729,-0.309418,0.033326,-0.077643,0.323396,-0.069389,-0.499215,-0.010356,0,acomic.wav
2,-0.621593,-1.390643,-0.514943,3.923674,-1.027973,0.380351,0.872491,1.137823,-0.217091,1.49245,...,-0.024066,0.10347,0.013676,0.630781,-0.253838,-0.108359,0.087015,0.180932,0,allison.wav
3,-4.323109,-0.370215,-0.605461,0.589852,-0.232428,0.339021,0.809531,-1.550904,0.513967,0.533095,...,0.068455,0.037394,-0.621647,-0.004052,-0.449807,-0.160071,-0.157622,-0.013166,0,amal.wav
4,-0.50345,-0.882949,-0.474098,1.554466,-0.351477,-1.245828,0.19047,-0.707988,0.382783,-0.583487,...,-0.617341,0.195878,0.170842,-0.6819,0.198254,-0.261608,-0.103652,0.250543,0,austria.wav


### Separate Features and Labels

In [40]:
# Columns that must never reach the model as inputs:
#   - 'Music'      : the target label
#   - 'Filename'   : an identifier string, not a numeric feature
#   - 'Unnamed: 0' : the row index that was saved into the CSV. Leaving it in
#                    leaks row order into the feature matrix (and the preview
#                    shows the rows are grouped by label), so it is dropped too.
# errors='ignore' keeps the cell working if the CSV is later re-exported
# without the index column.
non_feature_columns = ['Music', 'Filename', 'Unnamed: 0']
features = essentialdata.drop(columns=non_feature_columns, errors='ignore').values

# Selecting with a list keeps a single-column frame, so `labels` is an (n, 1)
# array here; it is flattened to 1-D after the train/test split.
labels = essentialdata.loc[:, ['Music']].values

### Perform Train-Test Split

In [41]:
from sklearn.model_selection import train_test_split
# random_state=42 fixes the shuffle so the same split is produced on every run.
# NOTE(review): test_size=0.6 holds out 60% of the rows for testing, so the models
# are trained on the smaller share (51 rows according to the logged output below)
# — confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.6, random_state=42)

In [42]:
# Flatten the (n, 1) label columns into the 1-D vectors scikit-learn expects.
# ravel() is used instead of `.T[0]` because it is idempotent: re-running this
# cell on an already-flat array leaves it unchanged, whereas `.T[0]` would
# collapse it to a single scalar.
y_train = y_train.ravel()
y_test = y_test.ravel()

### Evaluate The Performance of Different Models

In [43]:
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` in place on the supplied training data.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)``
        The estimator to train; it is mutated by the call.
    X_train : training feature rows.
    y_train : training targets, one per row of ``X_train``.

    Returns
    -------
    None
        Whatever ``clf.fit`` returns is discarded.
    """
    clf.fit(X_train, y_train)
    return None
    
def predict_labels(clf, features, target):
    """Return the F1 score of ``clf``'s predictions on ``features``.

    Parameters
    ----------
    clf : object exposing ``predict(X)``
        An already-fitted estimator.
    features : feature rows to predict on.
    target : true labels that the predictions are scored against.

    Returns
    -------
    The value of ``f1_score(target, predictions)``.
    """
    return f1_score(target, clf.predict(features))


def train_predict(clf, X_train, y_train, X_test, y_test):
    """Train ``clf`` and print its F1 score on the training and test sets.

    The estimator is fitted via ``train_classifier`` and then scored with
    ``predict_labels`` on the training data first and the test data second;
    each score is printed as soon as it is computed. Nothing is returned.

    Parameters
    ----------
    clf : estimator to fit and evaluate (mutated by fitting).
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    """
    model_name = clf.__class__.__name__
    n_train = len(X_train)
    print("Training a {} using a training set size of {}. . .".format(model_name, n_train))
    train_classifier(clf, X_train, y_train)

    # (message template, features, labels) for each split, in reporting order.
    reports = (
        ("F1 score for training set: {:.4f}.", X_train, y_train),
        ("F1 score for test set: {:.4f}.", X_test, y_test),
    )
    for template, split_X, split_y in reports:
        print(template.format(predict_labels(clf, split_X, split_y)))

In [44]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm

# Candidate classifiers to compare. The tree-based models are randomised
# internally, so they are seeded with the same value as the train/test split
# to make the scores below reproducible across kernel restarts.
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(random_state=42)
clf_C = AdaBoostClassifier(random_state=42)
clf_D = svm.SVC(gamma='scale')

# Fit each model on the training split and print its train/test F1 scores.
for clf in (clf_A, clf_B, clf_C, clf_D):
    train_predict(clf, X_train, y_train, X_test, y_test)

Training a GaussianNB using a training set size of 51. . .
F1 score for training set: 0.7907.
F1 score for test set: 0.7123.
Training a DecisionTreeClassifier using a training set size of 51. . .
F1 score for training set: 1.0000.
F1 score for test set: 0.7013.
Training a AdaBoostClassifier using a training set size of 51. . .
F1 score for training set: 1.0000.
F1 score for test set: 0.6667.
Training a SVC using a training set size of 51. . .
F1 score for training set: 0.8500.
F1 score for test set: 0.7027.


### Pickle Trained Models

In [45]:
import os
import pickle

# Persist every fitted classifier under PickledModels/, one file per model,
# named after the estimator class (e.g. PickledModels/GaussianNB).
# Create the directory up front so a fresh checkout does not fail on open().
os.makedirs('PickledModels', exist_ok=True)

for clf in (clf_A, clf_B, clf_C, clf_D):
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises; a bare open(...) passed inline is never closed explicitly.
    with open('PickledModels/' + clf.__class__.__name__, 'wb') as model_file:
        pickle.dump(clf, model_file)