In [None]:
# Trying the Wikipedia movie-plots dataset instead


In [None]:
# Step 1: Import Libraries
#Basics
import pandas as pd
import numpy as np

#Plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#NLP TOOLS
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the text-cleaning helpers below.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' tables (not the pickled 'punkt'
# model) inside word_tokenize; on releases that do not know this resource the
# call only reports that it was not found.
nltk.download('punkt_tab')
nltk.download('wordnet')

# NOTE: the name is kept on purpose because remove_stopwords() looks the
# collection up as `stopwords`; this rebinds the imported corpus reader.
# A set gives constant-time membership tests, which matters because every
# token of every plot is checked against it.
stopwords = set(stopwords.words('english'))


#Strings and regular expressions
import string,re

#Text Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

#Model Building
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

#Evaluation
from sklearn import metrics



In [None]:
# Read the movie-plot CSV into a dataframe and preview its first five rows
df = pd.read_csv('wiki_movie_plots_deduped.csv')
df.head()

In [None]:
#ISBE- Inspect, Select, Build, Evaluate

In [None]:
# Inspect the dataframe: number of missing values in each column
# (duplicates and the genre breakdown are checked in the following cells)
df.isna().sum()


In [None]:
# Number of rows that repeat an earlier row in every column
df.duplicated().sum()

In [None]:
# How many movies carry each raw genre label
df['Genre'].value_counts()


In [None]:
# Sorting the tokens of each genre label gives labels that contain the same
# tokens in a different order one shared spelling.
def alphabetize_genre(a_string):
    """Return the tokens of ``a_string`` in sorted order, separated by spaces.

    The string is split with ``word_tokenize`` and the tokens are sorted with
    the default string ordering.

    Parameters
    ----------
    a_string : str
        Genre label to normalize.

    Returns
    -------
    str
        The sorted tokens joined by single spaces ('' for an empty label).
    """
    # Join on a space: joining on '' fused the tokens into a single word
    # (e.g. 'comedydrama'), which made the sorted labels unreadable.
    return ' '.join(sorted(word_tokenize(a_string)))

# Store the normalized label in its own column and compare one row before/after
df['Sorted_genre'] = df['Genre'].apply(alphabetize_genre)
print('Original Genre:', df.loc[20, 'Genre'])
print('Sorted Genre: ', df.loc[20, 'Sorted_genre'])

In [None]:
# Reduce each genre label to a single value: its first token
def select_first(a_string):
    """Return the first token that ``word_tokenize`` finds in ``a_string``."""
    return word_tokenize(a_string)[0]

# Keep the reduced label in a new column and compare one row before/after
df['First_Genre'] = df['Genre'].apply(select_first)
print("Original Genre: ", df.loc[10, 'Genre'])
print("First Genre Only: ", df.loc[10, 'First_Genre'])

In [None]:
# How many movies carry each sorted genre label
df['Sorted_genre'].value_counts()
# Alternative view, counts of the single-token labels: df.First_Genre.value_counts()

In [None]:
#Select and Engineer features
#Make a pipeline to change the genres
#make lowercase
def make_lowercase(a_string):
    """Return a lowercase copy of ``a_string``."""
    lowered = a_string.lower()
    return lowered

# Quick check of make_lowercase on a mixed-case sentence
lower_test_string = make_lowercase('This IS MY Test String')
print('Make Lowercase: ' + lower_test_string)

# Strip punctuation
def remove_punct(a_string):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits and underscores count as word characters for the regex
    used here, so they are kept; everything else except whitespace is removed.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', a_string)

# Quick check of remove_punct on a sentence with mixed punctuation
punct_test_string = remove_punct('Hello!! This is exciting?? No. It, really isnt.')
print('Punctuation Removed: ' + punct_test_string)

# Drop stop words
def remove_stopwords(a_string):
    """Return ``a_string`` without the tokens found in ``stopwords``.

    The text is split with ``word_tokenize``; tokens are compared to the
    ``stopwords`` collection exactly as written (case-sensitive) and the
    remaining tokens are joined back together with single spaces.
    """
    kept = [token for token in word_tokenize(a_string) if token not in stopwords]
    return ' '.join(kept)

# Quick check of remove_stopwords on a sentence full of filler words
stopwords_test_string = remove_stopwords(
    'Hey so this is, well its my stopwords test its really neat i guess to me'
)
print('Remove Stopwords: ' + stopwords_test_string)
    
# Reduce every word to its stem
def stem_the_words(a_string):
    """Return ``a_string`` with each token replaced by its Porter stem.

    The text is split with ``word_tokenize``, each token is passed through a
    ``PorterStemmer`` and the stems are joined with single spaces.
    """
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in word_tokenize(a_string)]
    return ' '.join(stems)

# Quick check of stem_the_words on several forms of "walk"
stemwords_test_string = stem_the_words(
    'You walked and I walk along the walkway. Walking is fun since we walk together'
)
print('Stemmed: '+ stemwords_test_string)

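#Why is it dropping "e"? The Porter stemmer strips suffixes by rule and
#returns stems rather than dictionary words (e.g. "since" -> "sinc").
#A lemmatizer such as WordNetLemmatizer is the tool to use if real words are needed.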

In [None]:
# One entry point for the whole cleaning sequence, so it is written only once
def clean_string_pipeline(a_string):
    """Run ``a_string`` through every cleaning step and return the result.

    The steps are applied in this order: lowercase, punctuation removal,
    stop-word removal, stemming. Each step receives the previous step's output.
    """
    steps = (make_lowercase, remove_punct, remove_stopwords, stem_the_words)
    for step in steps:
        a_string = step(a_string)
    return a_string

# Sanity check: push one hand-written paragraph through the full pipeline
pipeline_test_string = clean_string_pipeline(
    "Hello there! Its a lovely day for a walk, wouldn't you agree Mrs. Smith? I love to garden in my garden. Gardening is so rewarding and you'll agree once you have gardened as well."
)
print('Clean Sentance: '+ pipeline_test_string)
    

In [None]:
# Clean every plot with the pipeline and keep the result in a new column
df['Clean_Plot'] = df['Plot'].apply(clean_string_pipeline)

# Sanity check: one plot before and after cleaning
print("Original Text: " + df.loc[20, 'Plot'])
print("Cleaned Test: " + df.loc[20, 'Clean_Plot'])


In [None]:
# Select and engineer features
# X: the cleaned plot text, y: the single-token genre label
X, y = df['Clean_Plot'].values, df['First_Genre'].values


In [None]:
# Initialize our vectorizer
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights and turn every cleaned plot into a
# TF-IDF row in one pass. The text is read from the dataframe column rather
# than from `X`, so re-running this cell does not feed the already-vectorized
# matrix back into the vectorizer.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split that follows, so held-out plots contribute to the vocabulary and IDF
# weights; fit on the training split only if a leakage-free score is needed.
X = vectorizer.fit_transform(df['Clean_Plot'].values)

print(X.shape, type(X))

In [None]:
#sanity check: show the first row of the vectorized matrix
X[0]

In [None]:
# Build: hold out 20% of the rows for testing, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=20
)

# Create the multinomial naive Bayes classifier and fit it on the training rows
model = MultinomialNB(alpha=0.5).fit(X_train, y_train)

# Predicted labels and per-class probabilities for the held-out rows
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# Accuracy on the held-out rows
accuracy = model.score(X_test, y_test)
print('Model Accuracy: %f' % accuracy)



In [None]:
#Check other scores
# Pin the report rows to the classes the model was trained on. Without
# `labels`, the rows are inferred from y_test and y_pred, and a trained class
# that appears in neither makes `target_names` the wrong length, which raises.
# zero_division=0 reports 0 for classes with no predictions or no test rows
# instead of emitting a warning for each of them.
print(classification_report(y_test, y_pred,
                            labels=model.classes_,
                            target_names=model.classes_,
                            zero_division=0))

In [None]:
#plot a confusion matrix
fig, ax= plt.subplots(figsize=(21,21))
# `labels` fixes the rows/columns of the matrix to model.classes_, in that
# order, so they always line up with `display_labels`. Without it the matrix
# is built from the labels seen in y_test and the predictions, which can be a
# different set from the trained classes.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator accepts the same arguments.
disp = plot_confusion_matrix(model, X_test, y_test,
                             labels=model.classes_,
                             display_labels=model.classes_,
                             cmap=plt.cm.Blues, ax=ax)
ax.set_title('Naive Bayes confusion matrix')
plt.xticks(rotation=90)
disp

In [None]:
#Random Forest Model
# Fix the seed so repeated runs build the same forest and report the same
# scores (20 is the seed already used for the train/test split).
rf_model= RandomForestClassifier(random_state=20)

#Fit data
rf_model.fit(X_train, y_train)

#Predictions
y_pred= rf_model.predict(X_test)
y_pred_proba= rf_model.predict_proba(X_test)

#accuracy
accuracy=rf_model.score(X_test,y_test)
print("Model Accuracy: %f" % accuracy)

#Print report
# Pin the report rows to the trained classes. Without `labels`, the rows are
# inferred from y_test and y_pred, and a trained class that appears in neither
# makes `target_names` the wrong length, which raises.
# zero_division=0 reports 0 for classes with no predictions or no test rows
# instead of emitting a warning for each of them.
print(classification_report(y_test, y_pred,
                            labels=rf_model.classes_,
                            target_names=rf_model.classes_,
                            zero_division=0))



In [None]:
#Display results
# Plot the confusion matrix of our results
fig, ax = plt.subplots(figsize=(21, 21))
# `labels` fixes the rows/columns of the matrix to rf_model.classes_, in that
# order, so they always line up with `display_labels`. Without it the matrix
# is built from the labels seen in y_test and the predictions, which can be a
# different set from the trained classes.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator accepts the same arguments.
disp = plot_confusion_matrix(rf_model, X_test, y_test,
                             labels=rf_model.classes_,
                             display_labels=rf_model.classes_,
                             cmap=plt.cm.Greens, ax=ax)
ax.set_title('Random forest confusion matrix')
plt.xticks(rotation=90)
disp
