In [None]:
#!/bin/python

import numpy as np
import pandas as pd
import os
import pickle
import sys
import scipy
from pathlib import Path
from collections import Counter
import random
import copy

# Machine Learning libraries
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import average_precision_score

# Feature CSVs for the train / validation / test splits (read with
# pd.read_csv further down). The SURF tables are the active ones; the
# commented-out paths switch the notebook to the CNN tables instead.
#m_trn_file_path = "../cnn_bow/cnn_trn.csv"
#m_val_file_path = "../cnn_bow/cnn_val.csv"
#m_test_file_path = "../cnn_bow/cnn_test.csv"

m_trn_file_path, m_val_file_path, m_test_file_path = (
    "../surf_bow/surf_1000_trn.csv",
    "../surf_bow/surf_1000_val.csv",
    "../surf_bow/surf_1000_test.csv",
)

In [None]:
# Load the training and validation feature tables, re-weight the feature
# counts with TF-IDF, and build the training matrices used by later cells.
#
# Names left in the notebook namespace:
#   train_df / test_df -- TF-IDF features plus the 'target' column
#                         (test_df holds the *validation* split in this cell)
#   tf_transformer     -- TfidfTransformer fitted on the training split only
#   prediction_var     -- list of feature column names (everything but 'target')
#   train_X / train_y  -- training features / training labels

# --- Training split ---------------------------------------------------------
m_train_df = pd.read_csv(m_trn_file_path, index_col='Unnamed: 0').drop(columns=['name'])

train_df = m_train_df.copy()
# Rows without a target get the label 'P000'. The result is assigned back
# because a chained `df.target.fillna(..., inplace=True)` is unreliable: pandas
# warns about it and it does not update the frame under copy-on-write.
train_df['target'] = train_df['target'].fillna('P000')
train_df = train_df.fillna(0.0)

### tf_idf conversion

# 1. Save the target column, and drop it from the feature frame
train_df_target = train_df[['target']]
train_counts = train_df.drop(columns=['target'])

# 2. Replace frequencies with tf_idf scores (IDF weights learned on train only)
tf_transformer = TfidfTransformer(use_idf=True).fit(train_counts)
X_train_tf = tf_transformer.transform(train_counts)
# Keep the original row index so the concat below lines features up with
# their targets even when the CSV index is not 0..n-1.
train_df = pd.DataFrame(
    X_train_tf.toarray(),
    columns=train_counts.columns.values,
    index=train_counts.index,
)

# 3. Add back the target column
train_df = pd.concat([train_df, train_df_target], axis=1)

# --- Validation split -------------------------------------------------------
m_test_df = pd.read_csv(m_val_file_path, index_col='Unnamed: 0').drop(columns=['name'])

test_df = m_test_df.copy()
test_df['target'] = test_df['target'].fillna('P000')
test_df = test_df.fillna(0.0)

### tf_idf conversion

# 1. Save the target column, and select the features in training column order
test_df_target = test_df[['target']]
test_counts = test_df[list(train_counts.columns)]

# 2. Apply the transformer fitted on the TRAINING split. Fitting a second
#    transformer on the validation data would give the two splits different
#    IDF weights, so the classifier would be evaluated on features scaled
#    differently from the ones it was trained on.
X_val_tf = tf_transformer.transform(test_counts)
test_df = pd.DataFrame(
    X_val_tf.toarray(),
    columns=test_counts.columns.values,
    index=test_counts.index,
)

# 3. Add back the target column
test_df = pd.concat([test_df, test_df_target], axis=1)

# --- Machine Learning inputs --------------------------------------------------
prediction_var = list(train_df.columns)
prediction_var.remove('target')

# Get input training data
train_X = train_df[prediction_var]

# Get input target variable
train_y = train_df.target

print(train_X.shape)
print(train_y.shape)

In [None]:
# Split the evaluation frame into a feature matrix and a label vector.

# Every column except 'target' is a feature (ValueError if 'target' is absent).
feature_cols = test_df.columns.tolist()
del feature_cols[feature_cols.index('target')]
prediction_var = feature_cols

# Features and labels of the evaluation data
test_X = test_df.loc[:, prediction_var]
test_y = test_df['target']

# Show the shapes of both pieces
for part in (test_X, test_y):
    print(part.shape)

In [None]:
# Fit a class-weighted SVM on the training features and report the average
# precision of each event on the evaluation split.
# (Alternative tried earlier: class_weight='balanced'.)

# Per-class weights handed to SVC(class_weight=...).
# NOTE(review): presumably chosen to offset class imbalance -- confirm.
dict_weights = {'P000':0.0000001, 'P001': 29, 'P002': 14, 'P003':34}

# NOTE(review): probability=True is kept from the original, but only
# decision_function() is used below; per the scikit-learn docs the flag slows
# down fit(), so it can be dropped if predict_proba is never needed.
clf = svm.SVC(gamma='scale', probability=True, class_weight=dict_weights,decision_function_shape = 'ovr')

# Fit the model to training
clf.fit(train_X,train_y)

# One decision score per (sample, class); columns follow clf.classes_.
prediction = clf.decision_function(test_X)

# Look each event's column up in clf.classes_ instead of hard-coding 1/2/3, so
# the scores cannot silently belong to another class when the set of labels
# seen during fit() differs from P000..P003.
class_order = list(clf.classes_)

for event in ('P001', 'P002', 'P003'):
    # Decision scores for this event (not probabilities, despite the name).
    prob_list = prediction[:, class_order.index(event)]
    # Binary ground truth: 1 where the sample belongs to this event.
    x = np.asarray(test_y == event).astype(int)
    print(event + ' &', round(average_precision_score(x,prob_list, pos_label=1),4))

In [10]:
# Train on validation also, for the Canvas submission.
#
# Fits the TF-IDF weighting and the SVM on train + validation combined, scores
# the test split with the same weighting, and writes one score file per event
# under ../scores/.


def _load_feature_table(csv_path):
    """Load one feature CSV and return ``(names, table)``.

    ``names`` is the 'name' column of the file. ``table`` holds the remaining
    columns, with missing targets set to 'P000' and every other missing value
    set to 0.0.
    """
    raw = pd.read_csv(csv_path, index_col='Unnamed: 0')
    names = raw['name']
    table = raw.drop(columns=['name'])
    # Assign back rather than using a chained in-place fillna, which pandas
    # warns about and which does not update the frame under copy-on-write.
    table['target'] = table['target'].fillna('P000')
    return names, table.fillna(0.0)


def _write_event_scores(event, scores, names, video_ids):
    """Write one score per entry of ``video_ids`` to ../scores/<event>_surf.lst.

    ``scores`` and ``names`` are parallel: ``scores[i]`` belongs to the video
    called ``names[i]``. Videos listed in ``video_ids`` that have no score get
    0.0. Output rows follow the order of ``video_ids``.
    """
    score_by_video = dict(zip(names, scores))
    res = [score_by_video.get(line.strip("\n"), 0.0) for line in video_ids]
    # NOTE(review): with to_csv's default header=True the file starts with a
    # header line for the single unnamed column ("0"). Kept as in the original;
    # confirm the submission format expects it, otherwise pass header=False.
    pd.DataFrame(res, columns=None).to_csv(
        path_or_buf="../scores/" + event + "_surf.lst", index=False
    )


# --- Training data: train + validation ---------------------------------------
_, trn_table = _load_feature_table(m_trn_file_path)
_, val_table = _load_feature_table(m_val_file_path)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_df = pd.concat([trn_table, val_table], ignore_index=True)

### tf_idf conversion

# 1. Save the target column, and drop it from the feature frame
train_df_target = train_df[['target']]
train_counts = train_df.drop(columns=['target'])

# Feature columns are derived here rather than taken from an earlier cell, so
# this cell does not depend on leftover notebook state.
prediction_var = list(train_counts.columns)

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(train_counts)
train_X = pd.DataFrame(
    tf_transformer.transform(train_counts).toarray(),
    columns=prediction_var,
    index=train_counts.index,
)

# 3. Training labels, and the combined frame with the target added back
train_y = train_df_target['target']
train_df = pd.concat([train_X, train_df_target], axis=1)

# --- Test data ----------------------------------------------------------------
name_list, test_df = _load_feature_table(m_test_file_path)

# The classifier is trained on TF-IDF features, so the test features must go
# through the same fitted transformer; scoring raw counts would feed the model
# inputs on a different scale from what it was trained on.
test_counts = test_df[prediction_var]
test_X = pd.DataFrame(
    tf_transformer.transform(test_counts).toarray(),
    columns=prediction_var,
    index=test_counts.index,
)

# Get test data target
test_y = test_df['target']

# --- Model ----------------------------------------------------------------------
# dict_weights is the class-weight mapping defined in the evaluation cell above.
clf = svm.SVC(gamma='scale', probability=True, class_weight=dict_weights,decision_function_shape = 'ovr')

# Fit the model to training
clf.fit(train_X,train_y)

# Ordered list of video ids the submission must cover, one per line.
with open("../../all_test.video", "r") as f:
    video_list = f.readlines()

# One decision score per (sample, class); columns follow clf.classes_.
prediction = clf.decision_function(test_X)

# Resolve each event's column through clf.classes_ instead of hard-coding
# 1/2/3, so a score file can never be filled from another class's column.
class_order = list(clf.classes_)

for event in ('P001', 'P002', 'P003'):
    prob_list = prediction[:, class_order.index(event)]
    _write_event_scores(event, prob_list, name_list, video_list)