In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import itertools as it
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

In [2]:
dataset = pd.read_csv('dataset_final.csv')

___________________
## Split Into Train and Test

In [3]:
labels = list(dataset)
X_labels = labels[:-17]
Y_labels = labels[-17:]

data = dataset.as_matrix()[:,:] 

Y = data[:, -17:]
X = data[:,0:-17]

training_set_X, test_set_X, training_set_Y, test_set_Y = train_test_split(
    X, Y, test_size=0.20)

X_train = pd.DataFrame(data=training_set_X[:,:],  
                 columns=X_labels)  

X_test = pd.DataFrame(data=test_set_X[:,:],  
                 columns=X_labels)  

Y_train = pd.DataFrame(data=training_set_Y[:,:],  
                 columns=Y_labels)  

Y_test = pd.DataFrame(data=test_set_Y[:,:],  
                 columns=Y_labels)  

X_train.to_csv("X_train.csv", index = False, na_rep = np.nan)
X_test.to_csv("X_test.csv", index = False, na_rep = np.nan)
Y_train.to_csv("Y_train.csv", index = False, na_rep = np.nan)
Y_test.to_csv("Y_test.csv", index = False, na_rep = np.nan)

_______________
## Impute Data

In [5]:
for column in X_train:
    X_train[column] = X_train[column].apply(pd.to_numeric)

    #if this is a categorical column
    if np.array_equal(X_train[column].unique(),[0,1]):
        X_train[column] = X_train[column].replace(np.nan, X_train[column].value_counts()[0])
        X_test[column] = X_test[column].replace(np.nan, X_train[column].value_counts()[0])

    else: #if numerical column
        X_train[column] = X_train[column].replace(np.nan, X_train[column].mean()) 
        X_test[column] = X_test[column].replace(np.nan, X_train[column].mean()) 
          
X_train.to_csv("ImputedX_train.csv", index = False)
X_test.to_csv("ImputedX_test.csv", index = False)




______________
## Apply PCA

In [20]:
X_train = np.loadtxt("ImputedX_train.csv", delimiter=",", skiprows = 1)
X_test = np.loadtxt("ImputedX_test.csv", delimiter=",", skiprows = 1)

pca = PCA(n_components = 0.99)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [22]:
labels = ['PCA'+str(i) for i in range(len(pca.explained_variance_))]

X_test = pd.DataFrame(data=X_test_pca[:,:],  
                 columns=labels)  


X_train = pd.DataFrame(data=X_train_pca[:,:],  
                 columns=labels)  

X_train.to_csv("X_train_pca.csv", index = False)
X_test.to_csv("X_test_pca.csv", index = False)

______________
## Apply SVM Model

In [8]:
def tuneHyperparameters(classifier, training_set_X, training_set_Y, hyperparameters, number_of_folds):
    """This function runs k-fold cross validation on given set of parameters and returns best combo of parameters
    
    Parameters:
    ----------
    -classifier: base classifer, parameters will be set during this function
    -training_set_X: X data
    -training_set_Y: Y data
    -hyperparameters: dictionary of parameter name to options
    -number_of_folds: number of k-fold folds
    
    Returns: 
    -------
    Dictionary of best parameter values, for example {'C': 2, 'penalty': 'l2'}
    
    Example:
    --------
    parameters = {'C': [3,2,5,6], 'penalty': ['l2']}
    tuneHyperparameters(LogisticRegression(), X, Y, parameters, 5)
    """
    
    kf = KFold(n_splits= number_of_folds)
    kf.get_n_splits(training_set_X)
    
    allNames = sorted(hyperparameters)
    parameter_combos = it.product(*(hyperparameters[Name] for Name in allNames))
    
    metrics = []
    params = []
    for hyperparameter_combo in parameter_combos:
        params.append(hyperparameter_combo)
        for p in range(len(allNames)):
            classifier.set_params(**{allNames[p]: hyperparameter_combo[p]})
        f1_score = []
        for train_index, test_index in kf.split(training_set_X):
            X_train, X_validation = training_set_X[train_index], training_set_X[test_index]
            Y_train, Y_validation = training_set_Y[train_index], training_set_Y[test_index]

            classifier.fit(X_train, Y_train)
            print 'Classifer fit'

            y_pred = classifier.predict(X_validation)
            print 'Classifer predict'
            prec, rec, f1, sup = precision_recall_fscore_support(Y_validation, y_pred, average= "binary")
            print 'f1 calculated'
            print prec, rec, f1
            acc = accuracy_score(Y_validation, y_pred)
            print acc
            f1_score.append(f1)
            print f1
            
        metrics.append(sum(f1_score)/float(len(f1_score))) 
    
    print metrics
    best_params = params[metrics.index(max(metrics))]
    return {allNames[i]: best_params[i] for i in range(len(best_params))}
  

In [11]:
X_train = np.loadtxt("X_train_pca.csv", delimiter=",", skiprows = 1)
X_test = np.loadtxt("X_test_pca.csv", delimiter=",", skiprows = 1)
#X_train = np.loadtxt("ImputedX_train.csv", delimiter=",", skiprows = 1)
#X_test = np.loadtxt("ImputedX_test.csv", delimiter=",", skiprows = 1)
Y_train = np.loadtxt("Y_train.csv", delimiter=",", skiprows = 1)
Y_test = np.loadtxt("Y_test.csv", delimiter=",", skiprows = 1)

X_train = normalize(X_train)
X_test = normalize(X_test)

genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']


for i in range(len(genres)):
    print "genre is:", genres[i]
    Y_curr = Y_train[:,i]
    print tuneHyperparameters(SVC(), X_train, Y_curr, {'class_weight':['balanced'],'C':[0.001,0.01,0.1,1],'kernel':['linear','rbf'], 'gamma':[0.001,0.01,0.1,1]}, 5)
    break

genre is: Action
Classifer fit
Classifer predict
f1 calculated
0.25468164794 0.231292517007 0.242424242424
0.863079896907
0.242424242424
Classifer fit
Classifer predict
f1 calculated
0.262411347518 0.222891566265 0.241042345277
0.849871134021
0.241042345277
Classifer fit
Classifer predict
f1 calculated
0.233676975945 0.201183431953 0.216216216216
0.841121495327
0.216216216216
Classifer fit
Classifer predict
f1 calculated
0.198473282443 0.172757475083 0.184724689165
0.85207863358
0.184724689165
Classifer fit
Classifer predict
f1 calculated
0.239864864865 0.217125382263 0.227929373997
0.844988720593
0.227929373997
Classifer fit
Classifer predict
f1 calculated
0.0 0.0 0.0
0.905283505155
0.0
Classifer fit
Classifer predict
f1 calculated
0.106958762887 1.0 0.193247962747
0.106958762887
0.193247962747
Classifer fit
Classifer predict
f1 calculated
0.108926844989 1.0 0.196454519035
0.108926844989
0.196454519035
Classifer fit
Classifer predict
f1 calculated
0.0970029004189 1.0 0.176850763807
0.