In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import itertools as it
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

In [2]:
dataset = pd.read_csv('dataset_final.csv')

___________________
## Split Into Train and Test

In [3]:
labels = list(dataset)
X_labels = labels[:-17]
Y_labels = labels[-17:]

data = dataset.as_matrix()[:,:] 

Y = data[:, -17:]
X = data[:,0:-17]

training_set_X, test_set_X, training_set_Y, test_set_Y = train_test_split(
    X, Y, test_size=0.20)

X_train = pd.DataFrame(data=training_set_X[:,:],  
                 columns=X_labels)  

X_test = pd.DataFrame(data=test_set_X[:,:],  
                 columns=X_labels)  

Y_train = pd.DataFrame(data=training_set_Y[:,:],  
                 columns=Y_labels)  

Y_test = pd.DataFrame(data=test_set_Y[:,:],  
                 columns=Y_labels)  

X_train.to_csv("X_train.csv", index = False, na_rep = np.nan)
X_test.to_csv("X_test.csv", index = False, na_rep = np.nan)
Y_train.to_csv("Y_train.csv", index = False, na_rep = np.nan)
Y_test.to_csv("Y_test.csv", index = False, na_rep = np.nan)

_______________
## Impute Data

In [4]:
for column in X_train:
    X_train[column] = X_train[column].apply(pd.to_numeric)

    #if this is a categorical column
    if np.array_equal(X_train[column].unique(),[0,1]):
        X_train[column] = X_train[column].replace(np.nan, X_train[column].value_counts()[0])
        X_test[column] = X_test[column].replace(np.nan, X_train[column].value_counts()[0])

    else: #if numerical column
        X_train[column] = X_train[column].replace(np.nan, X_train[column].median()) 
        X_test[column] = X_test[column].replace(np.nan, X_train[column].median()) 
          
X_train.to_csv("ImputedX_train.csv", index = False)
X_test.to_csv("ImputedX_test.csv", index = False)


______________
## Apply PCA

In [9]:
X_train = np.loadtxt("ImputedX_train.csv", delimiter=",", skiprows = 1)
X_test = np.loadtxt("ImputedX_test.csv", delimiter=",", skiprows = 1)
X_train = normalize(X_train, axis = 0)
X_test = normalize(X_test, axis = 0)

pca = PCA(n_components = 0.90)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [10]:
labels = ['PCA'+str(i) for i in range(len(pca.explained_variance_))]

X_test = pd.DataFrame(data=X_test_pca[:,:],  
                 columns=labels)  


X_train = pd.DataFrame(data=X_train_pca[:,:],  
                 columns=labels)  

X_train.to_csv("X_train_pca.csv", index = False)
X_test.to_csv("X_test_pca.csv", index = False)

______________
## Apply SVM Model

In [7]:
def tuneHyperparameters(classifier, training_set_X, training_set_Y, hyperparameters, number_of_folds):
    """This function runs k-fold cross validation on given set of parameters and returns best combo of parameters
    
    Parameters:
    ----------
    -classifier: base classifer, parameters will be set during this function
    -training_set_X: X data
    -training_set_Y: Y data
    -hyperparameters: dictionary of parameter name to options
    -number_of_folds: number of k-fold folds
    
    Returns: 
    -------
    Dictionary of best parameter values, for example {'C': 2, 'penalty': 'l2'}
    
    Example:
    --------
    parameters = {'C': [3,2,5,6], 'penalty': ['l2']}
    tuneHyperparameters(LogisticRegression(), X, Y, parameters, 5)
    """
    
    kf = KFold(n_splits= number_of_folds)
    kf.get_n_splits(training_set_X)
    
    allNames = sorted(hyperparameters)
    parameter_combos = it.product(*(hyperparameters[Name] for Name in allNames))
    
    metrics = []
    params = []
    for hyperparameter_combo in parameter_combos:
        params.append(hyperparameter_combo)
        for p in range(len(allNames)):
            classifier.set_params(**{allNames[p]: hyperparameter_combo[p]})
        f1_score = []
        for train_index, test_index in kf.split(training_set_X):
            X_train, X_validation = training_set_X[train_index], training_set_X[test_index]
            Y_train, Y_validation = training_set_Y[train_index], training_set_Y[test_index]

            classifier.fit(X_train, Y_train)

            y_pred = classifier.predict(X_validation)
            prec, rec, f1, sup = precision_recall_fscore_support(Y_validation, y_pred, average= "binary")
            acc = accuracy_score(Y_validation, y_pred)
            f1_score.append(f1)
            
        metrics.append(sum(f1_score)/float(len(f1_score))) 
    
    best_params = params[metrics.index(max(metrics))]
    print max(metrics)
    return {allNames[i]: best_params[i] for i in range(len(best_params))}
  

In [None]:
X_train = np.loadtxt("X_train_pca.csv", delimiter=",", skiprows = 1)
X_test = np.loadtxt("X_test_pca.csv", delimiter=",", skiprows = 1)
#X_train = np.loadtxt("ImputedX_train.csv", delimiter=",", skiprows = 1)
#X_test = np.loadtxt("ImputedX_test.csv", delimiter=",", skiprows = 1)
Y_train = np.loadtxt("Y_train.csv", delimiter=",", skiprows = 1)
Y_test = np.loadtxt("Y_test.csv", delimiter=",", skiprows = 1)

#X_train = normalize(X_train, axis = 0)
#X_test = normalize(X_test, axis = 0)

genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']


for i in range(len(genres)):
    print "genre is:", genres[i]
    Y_curr = Y_train[:,i]
    #print tuneHyperparameters(SVC(), X_train, Y_curr, {'class_weight':['balanced'],'C':[0.001,0.1,1],'kernel':['linear']}, 5)
    print tuneHyperparameters(SVC(), X_train, Y_curr, {'class_weight':['balanced'],'C':[1,2],'kernel':['linear']}, 5)

# Get Results on Test Set

In [11]:
c_param = [1,1,1,2,1,1,2,1,1,1,1,1,1,1,1,1,0.1]
y_predictions = []

for i in range(len(genres)):
    print "genre is:", genres[i]
    Y_curr = Y_train[:,i]
    Y_test_curr = Y_test[:,i]
    
    svm = SVC(class_weight = 'balanced', C = c_param[i], kernel = 'linear')
    svm.fit(X_train, Y_curr)

    y_pred = svm.predict(X_test)
    prec, rec, f1, sup = precision_recall_fscore_support(Y_test_curr, y_pred, average= "binary")
    print f1
    y_predictions.append(y_pred)
    


genre is: Action
0.265372168285
genre is: Adventure
0.147477360931
genre is: Animation
0.400606980273
genre is: Comedy
0.447845804989
genre is: Crime
0.202496532594
genre is: Documentary
0.487804878049
genre is: Drama
0.57538849223
genre is: Family
0.196428571429
genre is: Fantasy
0.0808344198175
genre is: History
0.0364188163885
genre is: Horror
0.278330019881
genre is: Mystery
0.107142857143
genre is: Romance
0.273704789834
genre is: Science Fiction
0.166666666667
genre is: Thriller
0.242990654206
genre is: War
0.137614678899
genre is: Western
0.615384615385


## Look at Hamming Loss

In [12]:
from sklearn.metrics import hamming_loss

In [19]:
a = np.matrix(y_predictions)
a = np.transpose(a)
 
hamming_loss(Y_test, a)

0.23024371843500493

____________
# Sample Data For Tuning

In [11]:
X_train = np.loadtxt("X_train_pca.csv", delimiter=",", skiprows = 1)
X_test = np.loadtxt("X_test_pca.csv", delimiter=",", skiprows = 1)
Y_train = np.loadtxt("Y_train.csv", delimiter=",", skiprows = 1)
Y_test = np.loadtxt("Y_test.csv", delimiter=",", skiprows = 1)


genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']
sample_size = X_train.shape[0]/5

for i in range(len(genres)):
    print "genre is:", genres[i]
    
    s = np.random.choice(X_train.shape[0], sample_size, replace=False)
    X_train_sample = X_train[s, :]
    Y_curr = Y_train[s, i]
    print tuneHyperparameters(SVC(), X_train_sample, Y_curr, {'class_weight':['balanced'],'C':[0.001,0.1,1,10],'kernel':['rbf'],'gamma':[0.001,0.1,1,5]}, 5)

genre is: Action
0.284886958077
{'kernel': 'rbf', 'C': 1, 'gamma': 5, 'class_weight': 'balanced'}
genre is: Adventure
0.115759726816
{'kernel': 'rbf', 'C': 1, 'gamma': 1, 'class_weight': 'balanced'}
genre is: Animation
0.405414984711
{'kernel': 'rbf', 'C': 10, 'gamma': 0.1, 'class_weight': 'balanced'}
genre is: Comedy
0.405489071525
{'kernel': 'rbf', 'C': 1, 'gamma': 1, 'class_weight': 'balanced'}
genre is: Crime
0.187542073798
{'kernel': 'rbf', 'C': 1, 'gamma': 5, 'class_weight': 'balanced'}
genre is: Documentary
0.468122693662
{'kernel': 'rbf', 'C': 10, 'gamma': 0.1, 'class_weight': 'balanced'}
genre is: Drama
0.552402109197
{'kernel': 'rbf', 'C': 0.001, 'gamma': 0.001, 'class_weight': 'balanced'}
genre is: Family
0.18140493057
{'kernel': 'rbf', 'C': 1, 'gamma': 1, 'class_weight': 'balanced'}
genre is: Fantasy
0.103621650239
{'kernel': 'rbf', 'C': 10, 'gamma': 0.1, 'class_weight': 'balanced'}
genre is: History
0.0392344497608
{'kernel': 'rbf', 'C': 1, 'gamma': 1, 'class_weight': 'bal

In [12]:
c_param = [(1,5),(1,1),(10,0.1),(1,1),(1,5),(10,0.1),(0.001,0.001),(1,1),(10,0.1),(1,1),(10,5),(10,0.1),(10,0.1),(1,1),(10,5),(0.1,5),(10,0.1)]
y_predictions = []

for i in range(len(genres)):
    print "genre is:", genres[i]
    Y_curr = Y_train[:,i]
    Y_test_curr = Y_test[:,i]
    
    svm = SVC(class_weight = 'balanced', C = c_param[i][0], gamma = c_param[i][1], kernel = 'rbf')
    svm.fit(X_train, Y_curr)

    y_pred = svm.predict(X_test)
    prec, rec, f1, sup = precision_recall_fscore_support(Y_test_curr, y_pred, average= "binary")
    print f1
    y_predictions.append(y_pred)
    



genre is: Action
0.232227488152
genre is: Adventure
0.138020833333
genre is: Animation
0.394736842105
genre is: Comedy
0.450346420323
genre is: Crime
0.196923076923
genre is: Documentary
0.479204339964
genre is: Drama
0.542196831992
genre is: Family
0.211731044349
genre is: Fantasy
0.0918114143921
genre is: History
0.0280811232449
genre is: Horror
0.290976058932
genre is: Mystery
0.0974212034384
genre is: Romance
0.267168391345
genre is: Science Fiction
0.168498168498
genre is: Thriller
0.266875981162
genre is: War
0.152249134948
genre is: Western
0.316784869976


In [14]:
from sklearn.metrics import hamming_loss
a = np.matrix(y_predictions)
a = np.transpose(a)
 
hamming_loss(Y_test, a)

0.22881870952869926