In [32]:
#!/bin/python

import numpy as np
import pandas as pd
import os
import pickle
import sys
import scipy
from pathlib import Path
from collections import Counter
import random
import copy

# Machine Learning libraries
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import average_precision_score

#s_trn_file_path = "../surf_bow/surf_2000_trn.csv"
#s_val_file_path = "../surf_bow/surf_2000_val.csv"
#s_test_file_path = "../surf_bow/surf_2000_test.csv"

s_trn_file_path = "../cnn_bow/cnn_trn.csv"
s_val_file_path = "../cnn_bow/cnn_val.csv"
s_test_file_path = "../cnn_bow/cnn_test.csv"

In [33]:
s_train_df = pd.read_csv(s_trn_file_path, index_col='Unnamed: 0')
s_train_df.drop(['name'], axis=1, inplace=True )

train_df = s_train_df
train_df.target.fillna('P000', inplace=True)
train_df.fillna(0.0, inplace=True)

##

### tf_idf conversion

# 1. Save target column, and drop if from dataframe
train_df_target = pd.DataFrame(train_df['target'], columns=['target'])
train_df.drop(['target'], axis=1, inplace=True )

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(train_df)
X_train_tf = tf_transformer.transform(train_df)
train_df = pd.DataFrame(X_train_tf.todense(), columns=train_df.columns.values)

# 3. Add back the target column
train_df = pd.concat([train_df, train_df_target], axis=1)

##

s_test_df = pd.read_csv(s_val_file_path, index_col='Unnamed: 0')
s_test_df.drop(['name'], axis=1, inplace=True )

test_df = s_test_df
test_df.target.fillna('P000', inplace=True)
test_df.fillna(0.0, inplace=True)

##

### tf_idf conversion

# 1. Save target column, and drop if from dataframe
test_df_target = pd.DataFrame(test_df['target'], columns=['target'])
test_df.drop(['target'], axis=1, inplace=True )

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(test_df)
X_train_tf = tf_transformer.transform(test_df)
test_df = pd.DataFrame(X_train_tf.todense(), columns=test_df.columns.values)

# 3. Add back the target column
test_df = pd.concat([test_df, test_df_target], axis=1)

##

# Machine Learning
prediction_var = list(train_df.columns)
prediction_var.remove('target')

# Get input training data
train_X = train_df[prediction_var]

# Get input target variable
train_y = train_df.target

print(train_X.shape)
print(train_X.head())
print(train_y.shape)
print(train_y.head())

(836, 512)
         C0        C1        C2        C3        C4        C5        C6  \
0  0.010122  0.018024  0.000900  0.010409  0.000605  0.005340  0.008419   
1  0.050114  0.013067  0.009877  0.061242  0.038988  0.018305  0.026640   
2  0.009906  0.026293  0.034094  0.054672  0.082611  0.010293  0.056078   
3  0.013478  0.066380  0.017004  0.015395  0.008548  0.069515  0.017058   
4  0.029833  0.095173  0.027000  0.061712  0.041942  0.023729  0.029055   

         C7        C8        C9  ...      C502      C503      C504      C505  \
0  0.008305  0.030344  0.050468  ...  0.003989  0.030597  0.087661  0.008131   
1  0.027461  0.018837  0.041661  ...  0.018111  0.005752  0.048789  0.015635   
2  0.045595  0.024558  0.089477  ...  0.014988  0.028714  0.064525  0.037818   
3  0.028123  0.053700  0.022339  ...  0.011754  0.024436  0.040179  0.059960   
4  0.016515  0.044979  0.047156  ...  0.007343  0.017801  0.015857  0.028200   

       C506      C507      C508      C509      C510      

In [34]:
# Machine Learning
prediction_var = list(test_df.columns)
prediction_var.remove('target')

# Get test data feature
test_X = test_df[prediction_var]

# Get test data target
test_y = test_df.target

print(test_X.shape)
print(test_X.head())
print(test_y.shape)
print(test_y.head())

(400, 512)
         C0        C1        C2        C3        C4        C5        C6  \
0  0.002256  0.014969  0.068157  0.030224  0.043937  0.050992  0.023642   
1  0.026249  0.055684  0.013937  0.044967  0.015883  0.012727  0.025819   
2  0.038058  0.105269  0.053092  0.048149  0.050134  0.022210  0.017858   
3  0.014148  0.046026  0.011328  0.023957  0.015194  0.031181  0.017096   
4  0.016313  0.020703  0.003254  0.023225  0.019445  0.044612  0.015210   

         C7        C8        C9  ...      C502      C503      C504      C505  \
0  0.008669  0.020163  0.027555  ...  0.002847  0.008360  0.019811  0.038092   
1  0.029802  0.018709  0.104177  ...  0.013205  0.017230  0.020660  0.015680   
2  0.016442  0.024401  0.035477  ...  0.036244  0.017640  0.019987  0.040829   
3  0.045868  0.023756  0.017671  ...  0.005807  0.039719  0.048694  0.012453   
4  0.019959  0.074474  0.019414  ...  0.010459  0.019003  0.056144  0.021896   

       C506      C507      C508      C509      C510      

In [35]:
my_dict = {'P000':0.000001, 'P001': 29, 'P002': 14, 'P003':54}

classifiers = [
    svm.SVC(gamma='scale', class_weight=my_dict,decision_function_shape = 'ovr', kernel='rbf'),
    ExtraTreesClassifier(n_estimators=5, class_weight='balanced'),
    RandomForestClassifier(n_estimators=5, class_weight='balanced'),
    AdaBoostClassifier(n_estimators=5),
    GradientBoostingClassifier()
]


for model in classifiers:
    clf = model

    # Fit the model to training
    clf.fit(train_X,train_y)

    # Check prediction accuracy
    prediction = clf.predict(test_X)
    print(type(clf), metrics.f1_score(prediction,test_y,average=None))

  'recall', 'true', average, warn_for)


<class 'sklearn.svm.classes.SVC'> [0.         0.2027027  0.4047619  0.20535714]
<class 'sklearn.ensemble.forest.ExtraTreesClassifier'> [0.93131868 0.33333333 0.42857143 0.15384615]
<class 'sklearn.ensemble.forest.RandomForestClassifier'> [0.93188011 0.23529412 0.33333333 0.16      ]


  'recall', 'true', average, warn_for)


<class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'> [0.92076503 0.         0.33333333 0.        ]
<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> [0.93837535 0.61538462 0.51612903 0.34482759]


In [36]:
# Train on validation also, for the Kaggle submission

s_train_df = pd.read_csv(s_trn_file_path, index_col='Unnamed: 0')
s_train_df.drop(['name'], axis=1, inplace=True)

s_test_df = pd.read_csv(s_val_file_path, index_col='Unnamed: 0')
s_test_df.drop(['name'], axis=1, inplace=True )

train_df = s_train_df
train_df.target.fillna('P000', inplace=True)
train_df.fillna(0.0, inplace=True)

test_df = s_test_df
test_df.target.fillna('P000', inplace=True)
test_df.fillna(0.0, inplace=True)

train_df = train_df.append(test_df, ignore_index=True)

### tf_idf conversion

# 1. Save target column, and drop if from dataframe
train_df_target = pd.DataFrame(train_df['target'], columns=['target'])
train_df.drop(['target'], axis=1, inplace=True )

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(train_df)
X_train_tf = tf_transformer.transform(train_df)
train_df = pd.DataFrame(X_train_tf.todense(), columns=train_df.columns.values)

# 3. Add back the target column
train_df = pd.concat([train_df, train_df_target], axis=1)

# Get input training data
train_X = train_df[prediction_var]

# Get input target variable
train_y = train_df.target

In [37]:
s_test_df = pd.read_csv(s_test_file_path, index_col='Unnamed: 0')

name_list = s_test_df['name']

s_test_df.drop(['name'], axis=1, inplace=True )

test_df = s_test_df
test_df.target.fillna('P000', inplace=True)
test_df.fillna(0.0, inplace=True)

### tf_idf conversion

# 1. Save target column, and drop if from dataframe
test_df_target = pd.DataFrame(test_df['target'], columns=['target'])
test_df.drop(['target'], axis=1, inplace=True )

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(test_df)
X_train_tf = tf_transformer.transform(test_df)
test_df = pd.DataFrame(X_train_tf.todense(), columns=test_df.columns.values)

# 3. Add back the target column
test_df = pd.concat([test_df, test_df_target], axis=1)

# Machine Learning
prediction_var = list(test_df.columns)
prediction_var.remove('target')

# Get test data features
test_X = test_df[prediction_var]

# Get test data target
test_y = test_df.target

print(test_X.shape)
print(test_y.shape)

clf = svm.SVC(gamma='scale', class_weight=my_dict,decision_function_shape = 'ovr', probability=True)
clf.fit(train_X,train_y)
prediction = clf.predict(test_X)

(1699, 512)
(1699,)


In [38]:
for i in range(len(prediction)):
    if(prediction[i] == 'P001'):
        prediction[i] = '1'
    elif(prediction[i] == 'P002'):
        prediction[i] = '2'
    elif(prediction[i] == 'P003'):
        prediction[i] = '3'
    else:
        prediction[i] = random.sample({1,2,3}, 1)[0]

In [39]:
output_df = pd.DataFrame({"VideoID":name_list, "Label":prediction})
output_df = output_df.set_index('VideoID')

dict1 = output_df.to_dict('index')

In [40]:
with open("../../all_test.video", "r") as f:
    video_list = f.readlines()
    
res = pd.DataFrame(columns=['VideoID', 'Label'])

for line in video_list:
    vid = line.strip("\n")
    if(vid in dict1):
        res = res.append({'VideoID': vid, 'Label':dict1[vid]['Label']}, ignore_index=True)
    else:
        res = res.append({'VideoID': vid, 'Label': random.sample({1,2,3}, 1)[0]}, ignore_index=True)

res.to_csv(path_or_buf='../kaggle_prediction.csv', index=False)