# Data Modeling

### Import Libraries

In [48]:
import pandas as pd

### Import Refined Dataset

In [49]:
# Read the principal-component CSV and discard every row that has a missing value.
essentialdata = pd.read_csv(
    'Data/PrincipalComponentData.csv',
    low_memory=False,
).dropna()

In [50]:
# Display (rows, columns) of the loaded frame as a quick sanity check.
essentialdata.shape

(128, 52)

### Separate Features and Labels

In [51]:
# Feature matrix: every column except the 'Music' target and the 'Filename' column.
features = essentialdata.drop(columns=['Music', 'Filename']).values
# Target selected with a list so the result stays a single-column 2-D array.
labels = essentialdata[['Music']].values

### Perform Train-Test Split

In [52]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

In [53]:
# The labels come out of the split as single-column 2-D arrays; flatten them to
# 1-D for the estimators and for f1_score. ravel() leaves an already-flat array
# unchanged, so re-running this cell is safe (the previous `.T[0]` collapsed a
# 1-D array to a single scalar on a second run).
y_train = y_train.ravel()
y_test = y_test.ravel()

### Evaluate the Performance of Different Models

In [54]:
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` in place on the given training data.

    Args:
        clf: Any object exposing ``fit(X, y)``.
        X_train: Training features, passed to ``fit`` unchanged.
        y_train: Training labels, passed to ``fit`` unchanged.

    Returns:
        None. Whatever ``fit`` returns is discarded.
    """
    _ = clf.fit(X_train, y_train)
    
def predict_labels(clf, features, target):
    """Score ``clf``'s predictions for ``features`` against ``target``.

    Args:
        clf: A fitted object exposing ``predict(features)``.
        features: Samples to predict on.
        target: True labels for ``features``.

    Returns:
        The value of ``f1_score(target, predictions)`` with its default settings.
    """
    return f1_score(target, clf.predict(features))


def train_predict(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` and print its F1 score on the training and test sets.

    Args:
        clf: Classifier to train; it is fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        None. Results are reported via ``print`` only.
    """
    announcement = "Training a {} using a training set size of {}. . ."
    print(announcement.format(clf.__class__.__name__, len(X_train)))

    train_classifier(clf, X_train, y_train)

    # Score and report one split at a time so output order matches evaluation order.
    train_f1 = predict_labels(clf, X_train, y_train)
    print("F1 score for training set: {:.4f}.".format(train_f1))

    test_f1 = predict_labels(clf, X_test, y_test)
    print("F1 score for test set: {:.4f}.".format(test_f1))

In [55]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm

# Candidate classifiers. The tree and boosting estimators draw random numbers
# while fitting, so they are seeded (same seed as the train/test split) to make
# the reported scores reproducible under Restart & Run All.
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(random_state=42)
clf_C = AdaBoostClassifier(random_state=42)
clf_D = svm.SVC(gamma='scale')

# Train each candidate on the same split and print its train/test F1 scores.
for clf in (clf_A, clf_B, clf_C, clf_D):
    train_predict(clf, X_train, y_train, X_test, y_test)

Training a GaussianNB using a training set size of 89. . .
F1 score for training set: 0.7708.
F1 score for test set: 0.5957.
Training a DecisionTreeClassifier using a training set size of 89. . .
F1 score for training set: 1.0000.
F1 score for test set: 0.5500.
Training a AdaBoostClassifier using a training set size of 89. . .
F1 score for training set: 1.0000.
F1 score for test set: 0.6512.
Training a SVC using a training set size of 89. . .
F1 score for training set: 0.8163.
F1 score for test set: 0.5333.


### Pickle Trained Models

In [56]:
import os
import pickle

# Make sure the output directory exists so the dumps below cannot fail on a
# fresh checkout.
os.makedirs('PickledModels', exist_ok=True)

# Persist each fitted classifier to a file named after its class. The `with`
# block closes (and therefore flushes) every file handle; the previous bare
# open(...) calls left the handles open.
for model in (clf_A, clf_B, clf_C, clf_D):
    with open('PickledModels/' + model.__class__.__name__, 'wb') as model_file:
        pickle.dump(model, model_file)