In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import math

from sklearn.datasets import make_multilabel_classification
from sklearn.datasets import fetch_mldata

from sklearn.dummy import DummyClassifier
from sklearn.multiclass import OneVsRestClassifier
#from sklearn.multiclass import LabelPowerSetClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA



from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import jaccard_similarity_score

from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from functools import partial

%matplotlib inline



In [2]:
dir_python_notebook = os.getcwd()
dir_movie_project = os.path.abspath(os.path.join(dir_python_notebook, os.pardir))
dir_data = os.path.join(dir_movie_project, 'data')

In [3]:
filename = dir_data + '//imdb_cluster_result_whole.csv'
data_df= pd.read_csv(filename)

In [4]:
data_df.columns

Index([u'certificates_R', u'certificates_PG', u'art.direction_1',
       u'assistant.director_1', u'cinematographer_1', u'costume.department_1',
       u'costume.designer_1', u'countries_1', u'director_1', u'distributors_1',
       u'editor_1', u'languages_1', u'make.up_1', u'miscellaneous.companies_1',
       u'miscellaneous.crew_1', u'original.music_1', u'producer_1',
       u'production.companies_1', u'production.manager_1', u'sound.crew_1',
       u'writer_1', u'special.effects.companies_1', u'cast_1', u'cast_2',
       u'cast_3', u'cast_4', u'runtimes_avg', u'rating', u'imdb_id',
       u'tmdb_id', u'Sci.Fi', u'Crime', u'Romance', u'Animation', u'Music',
       u'Adult', u'Comedy', u'War', u'Horror', u'Film.Noir', u'Western',
       u'News', u'Reality.TV', u'Thriller', u'Adventure', u'Mystery', u'Short',
       u'Talk.Show', u'Drama', u'Action', u'Documentary', u'Musical',
       u'History', u'Family', u'Fantasy', u'Game.Show', u'Sport', u'Biography',
       u'cluster_response', u

In [5]:
X_var= list(data_df.columns.values)
X_var = X_var[0:28]
X_var

['certificates_R',
 'certificates_PG',
 'art.direction_1',
 'assistant.director_1',
 'cinematographer_1',
 'costume.department_1',
 'costume.designer_1',
 'countries_1',
 'director_1',
 'distributors_1',
 'editor_1',
 'languages_1',
 'make.up_1',
 'miscellaneous.companies_1',
 'miscellaneous.crew_1',
 'original.music_1',
 'producer_1',
 'production.companies_1',
 'production.manager_1',
 'sound.crew_1',
 'writer_1',
 'special.effects.companies_1',
 'cast_1',
 'cast_2',
 'cast_3',
 'cast_4',
 'runtimes_avg',
 'rating']

In [6]:
Y_var = list(data_df.columns.values)
Y_var = Y_var[59:66]
Y_var

['cluster_1',
 'cluster_2',
 'cluster_3',
 'cluster_4',
 'cluster_5',
 'cluster_6',
 'cluster_7']

In [7]:
data_df.head(1)

Unnamed: 0,certificates_R,certificates_PG,art.direction_1,assistant.director_1,cinematographer_1,costume.department_1,costume.designer_1,countries_1,director_1,distributors_1,...,Sport,Biography,cluster_response,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7
0,1,0,0.413003,0.000264,0.000147,1.5e-05,0.000469,0.085824,0.000147,0.000103,...,0,0,2,0,0,0,1,0,1,0


In [8]:
#Input: dataframe from csv file
#Output: y: response variable that is good for multi-label classification
#        m: processor, may need to transform back in later
def process_multilabel(dataframe):
    #convert response variable to a set format
    #for example, '"Romance, "Horror"' to ("Romance", "Horror")
    dataframe['genres_comb'] = dataframe['genres_comb'].apply(lambda x: eval(x))
    y = dataframe.ix[:,'genres_comb']
    m = MultiLabelBinarizer().fit(y)
    y = m.transform(y)
    return(y, m)

In [9]:
train_df = data_df[data_df[u'tmdb_id'] < 50000]
test_df = data_df[data_df[u'tmdb_id'] >= 50000]

In [10]:
train_df.shape

(24326, 66)

In [11]:
METRICS = {
    "hamming_loss": hamming_loss,
    "subset_accuracy": accuracy_score,
    "jaccard": jaccard_similarity_score,
    "macro-f1": partial(f1_score, average="macro"),
    "samples-f1": partial(f1_score, average="samples"),
    "weighted-f1": partial(f1_score, average="weighted"),
    "micro-f1": partial(f1_score, average="micro"),
}

In [12]:
MODEL = {
    #"Random Forest": RandomForestClassifier(random_state=0),
    #"Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=0),
    #"SVM": GridSearchCV(LinearSVC(random_state=0), scoring='f1',param_grid={"C": np.logspace(-5, -5, 20)}),
    #"KNN, k=5": KNeighborsClassifier(n_neighbors=5),
    #"KNN, k=10": KNeighborsClassifier(n_neighbors=10),
}


In [13]:
X_train = train_df[X_var]
Y_true_train = train_df[Y_var]
X_test = test_df[X_var]
Y_true_test = test_df[Y_var]

In [19]:
metric_train = {}
metric_test = {}
for model_name, model in MODEL.items():
    clf = OneVsRestClassifier(model)
    clf.fit(X_train, Y_true_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    
    metric_train[model_name] = {} 
    metric_train[model_name]["micro-f1"] = f1_score(Y_true_train, y_pred_train, average="micro")
    metric_train[model_name]["weighted-f1"] = f1_score(Y_true_train, y_pred_train, average="weighted")
    metric_train[model_name]["samples-f1"] = f1_score(Y_true_train, y_pred_train, average="samples")
    metric_train[model_name]["macro-f1"] = f1_score(Y_true_train, y_pred_train, average="macro")
    metric_train[model_name]["hamming_loss"] = hamming_loss(Y_true_train, y_pred_train)
    metric_train[model_name]["subset_accuracy"] = accuracy_score(Y_true_train, y_pred_train)
    metric_train[model_name]["jaccard"] = jaccard_similarity_score(Y_true_train, y_pred_train)
    
    metric_test[model_name] = {} 
    metric_test[model_name]["micro-f1"] = f1_score(Y_true_test, y_pred_test, average="micro")
    metric_test[model_name]["weighted-f1"] = f1_score(Y_true_test, y_pred_test, average="weighted")
    metric_test[model_name]["samples-f1"] = f1_score(Y_true_test, y_pred_test, average="samples")
    metric_test[model_name]["macro-f1"] = f1_score(Y_true_test, y_pred_test, average="macro")
    metric_test[model_name]["hamming_loss"] = hamming_loss(Y_true_test, y_pred_test)
    metric_test[model_name]["subset_accuracy"] = accuracy_score(Y_true_test, y_pred_test)
    metric_test[model_name]["jaccard"] = jaccard_similarity_score(Y_true_test, y_pred_test)

  'precision', 'predicted', average, warn_for)


KeyboardInterrupt: 

In [None]:
y_pred_train

In [90]:
for model_name, model in MODEL.items():
    #print(model_name)
    metric_test_df_new = pd.DataFrame.from_dict(metric_test[ model_name], orient='index').transpose()
    metric_test_df_new['model'] = model_name
    metric_test_df_new['strategy'] = "One vs rest"
    try:
        metric_test_df = metric_test_df.append(metric_test_df_new, ignore_index=True)
    except:
        metric_test_df = metric_test_df_new

In [91]:
metric_test_df

Unnamed: 0,micro-f1,jaccard,macro-f1,samples-f1,subset_accuracy,weighted-f1,hamming_loss,model,strategy
0,0.539654,0.460998,0.34359,0.550919,0.222731,0.500538,0.216458,"KNN, k=5",One vs rest
1,0.55656,0.486085,0.342994,0.562768,0.280324,0.492574,0.187398,Random Forest,One vs rest
2,0.58882,0.532148,0.335847,0.61236,0.316097,0.504393,0.175112,Extra Trees,One vs rest
3,0.549086,0.484711,0.303238,0.562636,0.276835,0.472725,0.191492,"KNN, k=10",One vs rest


In [22]:
from skmultilearn.ensemble.rakeld import RakelD
from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

x, y = make_multilabel_classification(sparse=True, n_labels=5,
                                      return_indicator='sparse', allow_unlabeled=False)

parameters = {
    'labelset_size': range(2, 3),
    'classifier': [LabelPowerset(), BinaryRelevance()],
    'classifier__classifier': [MultinomialNB()],
    'classifier__classifier__alpha': [0.7, 1.0],
}

clf = GridSearchCV(RakelD(), parameters, scoring='f1_macro')
clf.fit(x, y)

print clf.best_params_, clf.best_score_



ImportError: No module named builtins

In [25]:
def get_metric_data_frame(Y_true_train, y_pred_train,train, model_name, strategy):
    metric_train = {} 
    metric_train["micro-f1"] = f1_score(Y_true_train, y_pred_train, average="micro")
    metric_train["weighted-f1"] = f1_score(Y_true_train, y_pred_train, average="weighted")
    metric_train["samples-f1"] = f1_score(Y_true_train, y_pred_train, average="samples")
    metric_train["macro-f1"] = f1_score(Y_true_train, y_pred_train, average="macro")
    metric_train["hamming_loss"] = hamming_loss(Y_true_train, y_pred_train)
    metric_train["subset_accuracy"] = accuracy_score(Y_true_train, y_pred_train)
    metric_train["jaccard"] = jaccard_similarity_score(Y_true_train, y_pred_train)
    
    metric_test_df_new = pd.DataFrame.from_dict(metric_train, orient='index').transpose()
    metric_test_df_new['model'] = model_name
    metric_test_df_new['strategy'] = "One vs rest"
    metric_test_df_new['train_test'] = train
    
    return metric_test_df_new

In [None]:
MODELS = {
    #"Random Forest": RandomForestClassifier(random_state=0),
    #"Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=0),
    #"SVM": GridSearchCV(LinearSVC(random_state=0), scoring='f1',param_grid={"C": np.logspace(-5, -5, 20)}),
    #"KNN, k=5": KNeighborsClassifier(n_neighbors=5),
    #"KNN, k=10": KNeighborsClassifier(n_neighbors=10),
}


In [None]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

In [14]:
# Label Power set
clf = LabelPowerset(GaussianNB())

# train
clf.fit(X_train, Y_true_train)

# predict
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)


get_metric_data_frame(Y_true_train, y_pred_train, "train", "Naive Bayes", "Label Powerset")
get_metric_data_frame(Y_true_test, y_pred_test, "test", "Naive Bayes", "Label Powerset")

In [15]:
y_pred_test

<43860x7 sparse matrix of type '<type 'numpy.int64'>'
	with 161553 stored elements in LInked List format>

In [26]:
get_metric_data_frame(Y_true_train, y_pred_train, "train", "Naive Bayes", "Label Powerset")

Unnamed: 0,micro-f1,jaccard,macro-f1,samples-f1,subset_accuracy,weighted-f1,hamming_loss,model,strategy,train_test
0,0.396492,0.255228,0.377345,0.374896,0.009989,0.466769,0.518111,Naive Bayes,One vs rest,train
