In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
#Plots
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
%matplotlib inline

#NLP TOOLS
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources this notebook asks for.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# NOTE: this rebinds the name `stopwords` from the imported corpus object to
# the English stopword collection; remove_stopwords() below relies on that.
stopwords = stopwords.words('english')


#Strings and regular expressions
import string,re

#Text Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

#Model Building
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

#Evaluation
from sklearn import metrics

In [12]:
# Load the Wikipedia movie-plots dataset into a DataFrame.
df = pd.read_csv(
    '/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv'
)

# Preview the first five rows.
df.head()

In [13]:
# Replace every '/' in the column names with '_' so the columns are easier to reference.
df.columns = [column.replace('/', '_') for column in df.columns]

In [14]:
# Count the rows per origin; the non-English titles are removed in the next cell.
df.Origin_Ethnicity.value_counts()



In [15]:

# Keep only the titles whose origin is in the list below.
movie_origin = ['American', 'British', 'Australian', 'Canadian']

# .copy() makes the filtered frame independent of the original, so the columns
# added to `df` in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['Origin_Ethnicity'].isin(movie_origin)].copy()

# Confirm that only the selected origins remain.
df.Origin_Ethnicity.value_counts()

In [16]:
#Put the genre labels in alphabetical order so equivalent genre lists read the same
def alphabetize_genre(a_string):
    """Return the genre labels in ``a_string`` as one alphabetized string.

    '-' and '/' are treated like ',' separators, the text is tokenized with
    ``word_tokenize``, every ',' token is dropped and the remaining tokens
    are sorted.

    Args:
        a_string: Raw genre text, e.g. 'romance/comedy-drama'.

    Returns:
        The sorted tokens joined by single spaces, e.g. 'comedy drama romance'.
    """
    myGenre = a_string.replace('-', ',').replace('/', ',')
    # Filter into a new list instead of calling words.remove() inside the loop:
    # removing while iterating skips elements, which left stray ',' tokens in
    # the result whenever a genre had more than one separator.
    words = sorted(word for word in word_tokenize(myGenre) if word != ',')
    return ' '.join(words)

# Store the alphabetized genre in a new column and spot-check the row labelled 2000.
df['Sorted_genre'] = df['Genre'].map(alphabetize_genre)
print('Original Genre:', df.loc[2000, 'Genre'])
print('Sorted Genre: ', df.loc[2000, 'Sorted_genre'])

In [17]:
#Reduce the genre to a single value: its first token
def select_first(a_string):
    """Return the first token of ``a_string`` that is not a ',' separator.

    Args:
        a_string: Genre text; it is tokenized with ``word_tokenize``.

    Returns:
        The first non-comma token, or '' when there is none (empty input or
        input made only of commas).
    """
    # Scan for the first real token instead of removing commas from the list
    # while iterating over it: that pattern skips elements and could leave a
    # ',' at position 0, which was then returned as the genre.
    for token in word_tokenize(a_string):
        if token != ',':
            return token
    return ''

# Store the single-genre label in a new column and spot-check the row labelled 2000.
df['First_Genre'] = df['Sorted_genre'].map(select_first)
print("Original Genre: ", df.loc[2000, 'Sorted_genre'])
print("First Genre Only: ", df.loc[2000, 'First_Genre'])

In [18]:
# Keep only the rows whose first genre is comedy or drama.
comedy_drama = ['comedy', 'drama']
myCondition = df['First_Genre'].isin(comedy_drama)
# .copy() so that the 'Clean_Plot' column added later is written to an
# independent frame rather than to a slice (avoids SettingWithCopyWarning).
df = df[myCondition].copy()


In [19]:
#Building blocks for the text-cleaning pipeline
#Step: convert to lowercase
def make_lowercase(a_string):
    """Return a lowercased copy of ``a_string``."""
    lowered = a_string.lower()
    return lowered

# Quick demonstration of make_lowercase.
lower_test_string = make_lowercase('This IS MY Test String')
print(f'Make Lowercase: {lower_test_string}')

#Step: strip punctuation
def remove_punct(a_string):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits and underscores count as word characters, so they are kept.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', a_string)

# Quick demonstration of remove_punct.
punct_test_string = remove_punct('Hello!! This is exciting?? No. It, really isnt.')
print(f'Punctuation Removed: {punct_test_string}')

#Step: drop stopwords
def remove_stopwords(a_string):
    """Tokenize ``a_string`` and rejoin it without the tokens found in ``stopwords``.

    The kept tokens are joined by single spaces. Matching is exact, so a token
    is dropped only if it appears in ``stopwords`` with the same casing.
    """
    kept = [token for token in word_tokenize(a_string) if token not in stopwords]
    return ' '.join(kept)

# Quick demonstration of remove_stopwords.
stopwords_test_string = remove_stopwords('Hey so this is, well its my stopwords test its really neat i guess to me')
print(f'Remove Stopwords: {stopwords_test_string}')
    
#Step: reduce each word to its stem
def stem_the_words(a_string):
    """Tokenize ``a_string`` and return the Porter stem of every token, joined by spaces."""
    stemmer = PorterStemmer()
    stems = map(stemmer.stem, word_tokenize(a_string))
    return ' '.join(stems)

# Quick demonstration of stem_the_words.
stemwords_test_string = stem_the_words('You walked and I walk along the walkway. Walking is fun since we walk together')
print(f'Stemmed: {stemwords_test_string}')

# Why is it dropping "e"? The Porter stemmer strips suffixes by rule, including
# some final "e"s, so its output is a stem and not always a dictionary word.

In [20]:
#Single entry point for the cleaning steps, so they are not retyped at every call site
def clean_string_pipeline(a_string):
    """Run the text-cleaning steps on ``a_string`` and return the result.

    Steps, in order: make_lowercase, remove_punct, remove_stopwords.
    Stemming is not applied here.
    """
    # NOTE(review): punctuation is stripped before stopword removal, so a
    # contraction such as "wouldn't" reaches remove_stopwords as "wouldnt" --
    # check whether it still matches the stopword list.
    for step in (make_lowercase, remove_punct, remove_stopwords):
        a_string = step(a_string)
    return a_string

#sanity check: run one sample paragraph through the whole pipeline
pipeline_test_string = clean_string_pipeline(
    "Hello there! Its a lovely day for a walk, wouldn't you agree Mrs. Smith? I love to garden in my garden. Gardening is so rewarding and you'll agree once you have gardened as well."
)
print(f'Clean Sentance: {pipeline_test_string}')
    

In [21]:
#apply the pipeline to the dataframe
df['Clean_Plot'] = df['Plot'].map(clean_string_pipeline)

#sanity check on the row labelled 20
print("Original Text: " + df.loc[20, 'Plot'])
print("Cleaned Test: " + df.loc[20, 'Clean_Plot'])

In [22]:
#Select the features and the target
#X: the cleaned plot text, y: the first-genre label
X, y = df['Clean_Plot'].values, df['First_Genre'].values

In [23]:
vectorizer = TfidfVectorizer()

# Build the vocabulary and turn the documents into TF-IDF vectors in one step.
# NOTE(review): the vectorizer is fitted on every document before the
# train/test split in the next cell, so test-set text contributes to the
# vocabulary and weights -- consider fitting on the training split only.
X = vectorizer.fit_transform(X)
print(X.shape, type(X))

In [24]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.20, random_state=20
)

# Multinomial Naive Bayes, fitted on the training split.
# alpha is the only hyperparameter set here: it felt like the only one relevant
# to this data set. It is the additive smoothing parameter, which keeps a word
# that never occurs with a class from zeroing out that class's probability.
# There are not many other parameters for this method, and the accuracy with
# alpha alone still comes out pretty good.
model = MultinomialNB(alpha=.5)
model.fit(X_train, y_train)

# Predicted labels and per-class probabilities for the held-out rows.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# Accuracy on the held-out rows.
accuracy = model.score(X_test, y_test)
print(f'Model Accuracy: {accuracy:f}')

In [25]:
#Per-class scores on the test set, labelled with the model's class names
report = classification_report(y_test, y_pred, target_names=model.classes_)
print(report)

In [26]:
#plot a confusion matrix
# Built from metrics.confusion_matrix + metrics.ConfusionMatrixDisplay instead
# of plot_confusion_matrix, which was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell no longer depends on it.
fig, ax = plt.subplots(figsize=(10, 10))

# Passing labels=model.classes_ keeps the matrix rows/columns in the same order
# as the display labels, even if a class is missing from the test split.
cm = metrics.confusion_matrix(y_test, model.predict(X_test), labels=model.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                                      display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues, ax=ax, xticks_rotation='vertical')
ax.set_title('Confusion matrix on the test set')
plt.show()