# Project 2


# Movie Genre Classification

Classify a movie genre based on its plot.

<img src="moviegenre.png"
     style="float: left; margin-right: 10px;" />




https://www.kaggle.com/c/miia4200-20191-p2-moviegenreclassification/overview


### Data

Input:
- movie plot

Output:
Probability of the movie belong to each genre


### Evaluation

- 20% API
- 30% Create a solution using with a Machine Learning algorithm - Presentation (5 slides)
- 50% Performance in the Kaggle competition (Normalized acording to class performance in the private leaderboard)


### Acknowledgements

We thank Professor Fabio Gonzalez, Ph.D. and his student John Arevalo for providing this dataset.

See https://arxiv.org/abs/1702.01992

## Sample Submission

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import nltk
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
dataTraining = pd.read_csv('https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv('https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTesting.zip', encoding='UTF-8', index_col=0)

In [3]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [4]:
dataTesting.head()

Unnamed: 0,year,title,plot
1,1999,Message in a Bottle,"who meets by fate , shall be sealed by fate ...."
4,1978,Midnight Express,"the true story of billy hayes , an american c..."
5,1996,Primal Fear,martin vail left the chicago da ' s office to ...
6,1950,Crisis,husband and wife americans dr . eugene and mr...
7,1959,The Tingler,the coroner and scientist dr . warren chapin ...


### Create count vectorizer


In [5]:
import numpy as np  
import re  
import nltk  
from sklearn.datasets import load_files
import pickle  
from nltk.corpus import stopwords  

In [6]:
X = pd.Series(dataTraining['plot'])

In [7]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in dataTraining['plot'].index:  
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

In [8]:
X2 = pd.Series(dataTraining['title'])

In [9]:
documents2 = []
for sen in dataTraining['title'].index:  
    # Remove all the special characters
    document2 = re.sub(r'\W', ' ', str(X2[sen]))

    # remove all single characters
    document2 = re.sub(r'\s+[a-zA-Z]\s+', ' ', document2)

    # Remove single characters from the start
    document2 = re.sub(r'\^[a-zA-Z]\s+', ' ', document2) 

    # Substituting multiple spaces with single space
    document2 = re.sub(r'\s+', ' ', document2, flags=re.I)

    # Removing prefixed 'b'
    document2 = re.sub(r'^b\s+', '', document2)

    # Converting to Lowercase
    document2 = document2.lower()

    # Lemmatization
    document2 = document2.split()

    document2 = [stemmer.lemmatize(word) for word in document2]
    document2 = ' '.join(document2)

    documents2.append(document2)

In [10]:
#from sklearn.feature_extraction.text import CountVectorizer  
#vectorizer = CountVectorizer(ngram_range=(1,10), max_features=10000, min_df=29, stop_words=stopwords.words('english'))  
#X = vectorizer.fit_transform(documents).toarray() 

In [11]:
#from sklearn.feature_extraction.text import TfidfTransformer  
#tfidfconverter = TfidfTransformer()  
#X = tfidfconverter.fit_transform(X).toarray()  

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidfconverter = TfidfVectorizer(ngram_range=(1,10), max_features=10000, min_df=29, stop_words=stopwords.words('english'))  
tfidfconverter.fit(documents)
X = tfidfconverter.transform(documents).toarray()

In [13]:
vect = TfidfVectorizer(ngram_range=(1,10), max_features=10000)  
vect.fit(documents2)
X2 = vect.transform(documents2).toarray()

In [14]:
X_ = pd.DataFrame(X, columns=tfidfconverter.get_feature_names())
X2_ = pd.DataFrame(X2, columns=vect.get_feature_names())

In [15]:
X_.shape

(7895, 2971)

In [16]:
X2_.shape

(7895, 10000)

In [17]:
X_F = pd.concat([X_, X2_], axis=1)

In [33]:
z = pd.Series(dataTraining['year']).tolist()

In [34]:
X_F['year'] = z
print(X_F.shape)

(7895, 12971)


In [35]:
X3 = pd.Series(dataTesting['plot'])

documents3 = []

for sen in dataTesting['plot'].index:  
    # Remove all the special characters
    document3 = re.sub(r'\W', ' ', str(X3[sen]))

    # remove all single characters
    document3 = re.sub(r'\s+[a-zA-Z]\s+', ' ', document3)

    # Remove single characters from the start
    document3 = re.sub(r'\^[a-zA-Z]\s+', ' ', document3) 

    # Substituting multiple spaces with single space
    document3 = re.sub(r'\s+', ' ', document3, flags=re.I)

    # Removing prefixed 'b'
    document3 = re.sub(r'^b\s+', '', document3)

    # Converting to Lowercase
    document3 = document3.lower()

    # Lemmatization
    document3 = document3.split()

    document3 = [stemmer.lemmatize(word) for word in document3]
    document3 = ' '.join(document3)

    documents3.append(document3)

In [36]:
X4 = pd.Series(dataTesting['title'])

documents4 = []

for sen in dataTesting['title'].index:  
    # Remove all the special characters
    document4 = re.sub(r'\W', ' ', str(X4[sen]))

    # remove all single characters
    document4 = re.sub(r'\s+[a-zA-Z]\s+', ' ', document4)

    # Remove single characters from the start
    document4 = re.sub(r'\^[a-zA-Z]\s+', ' ', document4) 

    # Substituting multiple spaces with single space
    document4 = re.sub(r'\s+', ' ', document4, flags=re.I)

    # Removing prefixed 'b'
    document4 = re.sub(r'^b\s+', '', document4)

    # Converting to Lowercase
    document4 = document4.lower()

    # Lemmatization
    document4 = document4.split()

    document4 = [stemmer.lemmatize(word) for word in document4]
    document4 = ' '.join(document4)

    documents4.append(document4)

In [37]:
#vect_testplot = TfidfVectorizer(ngram_range=(1,5), max_features=10000, stop_words=stopwords.words('english'))  
X3 = tfidfconverter.transform(documents3).toarray()

In [38]:
#vect_testtitle = TfidfVectorizer(ngram_range=(1,10), max_features=10000)  
X4 = vect.transform(documents4).toarray()

In [39]:
X3_ = pd.DataFrame(X3, columns=tfidfconverter.get_feature_names())
X4_ = pd.DataFrame(X4, columns=vect.get_feature_names())
XT_F = pd.concat([X3_, X4_], axis=1)
z_ = pd.Series(dataTesting['year']).tolist()
XT_F['year'] = z_
print(XT_F.shape)

(3383, 12971)


### Create y

In [43]:
dataTraining['genres'] = dataTraining['genres'].map(lambda x: (x))

le = MultiLabelBinarizer()
y_genres = le.fit_transform(dataTraining['genres'])

In [44]:
y_genres

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])

In [45]:
X_train, X_test, y_train_genres, y_test_genres = train_test_split(X_F, y_genres, random_state=17)

### Train multi-class multi-label model

In [46]:
clf = OneVsRestClassifier(RandomForestClassifier(n_jobs=-1, n_estimators=600, max_features=30, max_depth=10, random_state=17))

In [None]:
clf.fit(X_train, y_train_genres)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=30, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=-1,
            oob_score=False, random_state=17, verbose=0, warm_start=False),
          n_jobs=None)

In [None]:
y_pred_genres = clf.predict_proba(X_test)

In [None]:
roc_auc_score(y_test_genres, y_pred_genres, average='macro')

0.8713739114563562

In [None]:
rend = []
for n in np.arange(10,110,10):
    clf = OneVsRestClassifier(RandomForestClassifier(n_jobs=-1,max_features=30, n_estimators=600, max_depth=10, random_state=17))
    clf.fit(X_train, y_train_genres)
    y_pred_genres = clf.predict_proba(X_test)
    print(roc_auc_score(y_test_genres, y_pred_genres, average='macro'))
    rend.append([roc_auc_score(y_test_genres, y_pred_genres, average='macro'), n])

### Predict the testing dataset

In [None]:
XT_F.shape

In [None]:
#X_test_dtm = vect.transform(dataTesting['plot'])

cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

y_pred_test_genres = clf.predict_proba(XT_F)


In [None]:
res = pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols)

In [None]:
res.head()

In [None]:
res.to_csv('pred_genres_text_RF.csv', index_label='ID')