## Load IMDb movies into a pandas DataFrame

In [1]:
import ast
import os

import numpy as np
import pandas as pd

In [95]:
from collections import Counter
# NOTE(review): the argument below is a prose placeholder, not valid Python, so this
# cell raises SyntaxError as written. Replace it with a real iterable (presumably the
# genre labels; confirm intent) before running the notebook top to bottom.
c = Counter([list of genres n stuff])

In order to pull movie data from the OMDb API (access tokens cost one dollar), we needed to pass it a list of IMDb movie IDs. We found them in one column of a Kaggle dataset.

In [93]:
# Load the Kaggle movie table; the path is relative to the notebook's working directory.
imdb = pd.read_csv('imdb_5000_movies.csv')

### Grab all of the movie codes from the 'movie_imdb_link' column:

In [94]:
# Extract the IMDb title id (the path segment between 'title/' and '/?') from every link.
# Iterating the column directly avoids materialising a full row Series per movie via .iloc.
imdb_ids = [
    link.split('title/')[1].split('/?')[0]
    for link in imdb['movie_imdb_link']
]

### Query the OMDb API with each movie code and save each response to its own text file

This is where we persist our API queries to files, so that we don't run into query limits and get locked out of our dataset. Our API key has been removed from this notebook.

In [4]:
# One-off download loop, left commented out (presumably so a re-run does not hit the API again).
# NOTE(review): it needs `requests` imported and `API_KEY` defined; neither appears in this notebook.
# NOTE(review): files are written to the current directory as 'movie_<id>', while the loader
# further down reads them from the 'Movies' sub-directory, so they must be moved there first.
# for id in imdb_ids:

#     request = requests.get(f'http://www.omdbapi.com/?i={id}&plot=full&apikey={API_KEY}').json()
#     text = str(request)  
    
#     f = open(f'movie_{id}', 'w+')
#     f.write(text)
#     f.close()

### Create new dataframe with relevant columns

Now that we've saved a bunch of files containing data about movies, we'll read them back into pandas in a tidy way:

In [60]:
# Empty frame declaring the columns that are kept from each saved movie record.
df = pd.DataFrame(columns=['Title', 'Year', 'ID', 'Plot', 'Genre', 'imdbRating'])

In [61]:
# Read every cached API response back from disk and keep only the fields we model on.
movie_records = []
for movie_id in imdb_ids:
    # `with` closes each handle promptly instead of leaking one open file per movie.
    with open(os.path.join('Movies', f"movie_{movie_id}"), "r") as movie_file:
        movie_text = movie_file.readline()    # the whole record sits on the first line

    # Each file holds the str() of the response dictionary. literal_eval parses Python
    # literals only, so a tampered file cannot execute code the way eval() would.
    movie = ast.literal_eval(movie_text)

    # (The former .replace("\'", "'") was a no-op: both literals are the same string.)
    movie_records.append({
        'Title': movie['Title'],
        'Year': movie['Year'],
        'ID': movie_id,
        'Plot': movie['Plot'],
        'Genre': movie['Genre'],
        'imdbRating': movie['imdbRating'],
    })

# Build the frame in one go: appending row by row copies the frame on every iteration
# (quadratic), and DataFrame.append no longer exists in pandas 2.0+.
df = pd.DataFrame(movie_records,
                  columns=['Title', 'Year', 'ID', 'Plot', 'Genre', 'imdbRating'])

We drop rows whose plot or IMDb rating is 'N/A', convert the rating to a number, and derive a binary target from it:

In [62]:
# Keep only movies that have both a plot and a rating (missing values are the string 'N/A').
has_plot_and_rating = (df['Plot'] != 'N/A') & (df['imdbRating'] != 'N/A')
# .copy() detaches the filtered frame so the column assignments below operate on an
# independent object and do not raise SettingWithCopyWarning.
df = df[has_plot_and_rating].copy()
df['imdbRating'] = df['imdbRating'].astype(float)
# Binary target column: True = rating at or above the mean; False = below the mean.
df['binary_target'] = df['imdbRating'] >= df['imdbRating'].mean()

In [63]:
# Only the last expression of a cell is rendered, so the first line's value is not shown.
df['Plot'][0]
df

Unnamed: 0,Title,Year,ID,Plot,Genre,imdbRating,binary_target
0,Avatar,2009,tt0499549,"When his brother is killed in a robbery, parap...","Action, Adventure, Fantasy, Sci-Fi",7.8,True
1,Pirates of the Caribbean: At World's End,2007,tt0449088,"After Elizabeth, Will, and Captain Barbossa re...","Action, Adventure, Fantasy",7.1,True
2,Spectre,2015,tt2379713,A cryptic message from the past sends James Bo...,"Action, Adventure, Thriller",6.8,True
3,The Dark Knight Rises,2012,tt1345836,Despite his tarnished reputation after the eve...,"Action, Thriller",8.4,True
5,John Carter,2012,tt0401729,"John Carter, a Civil War veteran, who in 1868 ...","Action, Adventure, Sci-Fi",6.6,True
6,Spider-Man 3,2007,tt0413300,Peter Parker has finally managed to piece toge...,"Action, Adventure, Sci-Fi",6.2,False
7,Tangled,2010,tt0398286,After receiving the healing powers from a magi...,"Animation, Adventure, Comedy, Family, Fantasy,...",7.8,True
8,Avengers: Age of Ultron,2015,tt2395427,Tony Stark creates the Ultron Program to prote...,"Action, Adventure, Sci-Fi",7.4,True
9,Harry Potter and the Half-Blood Prince,2009,tt0417741,In the sixth year at Hogwarts School of Witchc...,"Adventure, Family, Fantasy, Mystery",7.6,True
10,Batman v Superman: Dawn of Justice,2016,tt2975590,The general public is concerned over having Su...,"Action, Adventure, Fantasy, Sci-Fi",6.5,True


In [10]:
from __future__ import print_function
import nltk
import sklearn

from nltk.collocations import *
from nltk import FreqDist, word_tokenize
import string, re
import urllib
from nltk.stem.snowball import SnowballStemmer

Here we set up our lemmatizer, which normalizes words that share the same root: e.g. [is, was, are, been, were] becomes [be, be, be, be, be], and [ran, runs, running] becomes [run, run, run].

In [11]:
import nltk
from nltk.corpus import wordnet

# Bound `lemmatize` method of a single WordNetLemmatizer instance; called by normalize_text().
lmtzr = nltk.WordNetLemmatizer().lemmatize


def get_wordnet_pos(treebank_tag):
    """Translate a treebank-style POS tag into the matching WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; every other tag falls back to noun.
    """
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps the empty-string case on the noun fallback.
    return pos_by_prefix.get(treebank_tag[:1], wordnet.NOUN)


def normalize_text(text):
    """Tokenize ``text``, lemmatize each token by its part of speech, and lowercase it.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    list of str
        One lowercased lemma per token, in token order.
    """
    # The tag -> WordNet POS mapping is kept local on purpose: a later cell redefines
    # get_wordnet_pos() to take a *word* instead of a tag, so calling it from here
    # would look up the POS of the tag string itself once that cell has run.
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    lemmas = [
        lmtzr(token, pos_by_prefix.get(tag[:1], wordnet.NOUN))    # unknown tags -> noun
        for token, tag in tagged_tokens
    ]

    return [lemma.lower() for lemma in lemmas]

In [12]:
# import pandas and sklearn's CountVectorizer class
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# create a dataframe from a word matrix
def wm2df(wm, feat_names):
    """Turn a sparse document-term matrix into a labelled DataFrame.

    Rows are labelled 'Doc0', 'Doc1', ... in matrix order and the columns carry
    the names supplied in ``feat_names``.
    """
    # one label per matrix row
    row_labels = [f'Doc{row_idx:d}' for row_idx in range(wm.shape[0])]
    frame = pd.DataFrame(data=wm.toarray(), index=row_labels, columns=feat_names)
    return frame
  
# set of documents: the plots stored under index labels 0 and 1
corpora = [df['Plot'][0], df['Plot'][1]]

# instantiate the vectorizer object (case is preserved, so 'The' and 'the' stay distinct)
cvec = CountVectorizer(lowercase=False)

# convert the documents into a document-term matrix
wm = cvec.fit_transform(corpora)

# retrieve the terms found in the corpora
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides.
if hasattr(cvec, 'get_feature_names_out'):
    tokens = cvec.get_feature_names_out().tolist()
else:
    tokens = cvec.get_feature_names()

# create a dataframe from the matrix (not rendered: only the last expression is shown)
wm2df(wm, tokens)
tokens

['After',
 'Barbossa',
 'Beckett',
 'Calypso',
 'Captain',
 'Colonel',
 'Company',
 'Cutler',
 'Dalma',
 'Davy',
 'Dutchman',
 'East',
 'Elizabeth',
 'Feng',
 'Flying',
 'In',
 'India',
 'Jack',
 'Jake',
 'Jones',
 'Lord',
 'Lords',
 'Marine',
 'Na',
 'Neytiri',
 'Norrington',
 'Now',
 'Pandora',
 'Parker',
 'Pirate',
 'Pirates',
 'Quaritch',
 'Sao',
 'Selfridge',
 'Sparrow',
 'Sully',
 'The',
 'There',
 'Tia',
 'Trading',
 'When',
 'While',
 'Will',
 'With',
 'against',
 'alien',
 'all',
 'alliance',
 'an',
 'and',
 'attempting',
 'avatar',
 'back',
 'battle',
 'beautiful',
 'begins',
 'bond',
 'bound',
 'brother',
 'by',
 'call',
 'combine',
 'control',
 'cooperating',
 'corners',
 'corporate',
 'crew',
 'damned',
 'dark',
 'dead',
 'decides',
 'distant',
 'driving',
 'entire',
 'epic',
 'exchange',
 'extermination',
 'face',
 'falls',
 'fate',
 'fear',
 'fight',
 'figurehead',
 'final',
 'fix',
 'foes',
 'for',
 'forcing',
 'forms',
 'forward',
 'four',
 'freedom',
 'from',
 'gather

In [25]:
import enchant
# en_US enchant dictionary; lemmatize() calls d.check(word) to keep only words it accepts.
d = enchant.Dict("en_US")

### Lemmatization Functions 

In [26]:
import nltk
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet
from nltk.corpus import stopwords

def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word, tagged on its own.

    The first letter of the tag assigned by ``nltk.pos_tag`` selects adjective,
    noun, verb or adverb; any other tag defaults to noun.
    """
    tagged_word = nltk.pos_tag([word])[0]    # (word, tag) pair
    tag_initial = tagged_word[1][0].upper()

    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_pos_by_initial.get(tag_initial, wordnet.NOUN)


def lemmatize(plot_list):
    """Normalize raw plot summaries into space-joined strings of lemmas.

    Each plot is lowercased and tokenized, reduced to the tokens accepted by the
    enchant dictionary ``d``, lemmatized token by token using its part of speech,
    filtered down to lemmas that equal one of the word-like regex matches, and
    finally stripped of English stop words.

    Parameters
    ----------
    plot_list : iterable of str
        Raw plot summaries.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input plot, in input order.
    """
    # Loop-invariant helpers are built once per call instead of once per plot.
    lemmatizer = WordNetLemmatizer()
    reg = re.compile((r"([a-zA-Z]+(?:'[a-z]+)?)"))    #define regular expression pattern
    stop_words = set(stopwords.words('english'))

    lemmatized_plots = []
    for plot in plot_list:
        tokenized_lower = word_tokenize(plot.lower())   #make plot summary all lowercase and tokenize

        tokenized_lower = [word for word in tokenized_lower if d.check(word)] # Make sure it's an english word

        dirty_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokenized_lower]    #lemmatize each word based on part of speech

        # Run the regex once per plot and keep the matches in a set. The original re-ran
        # findall() for every single lemma, which is quadratic in the plot length.
        word_like = set(reg.findall(' '.join(dirty_lemma)))

        #keep lemmas that match the regex pattern and are not stop words
        lemmatized = [
            word_lem for word_lem in dirty_lemma
            if word_lem in word_like and word_lem not in stop_words
        ]

        lemmatized_plots.append(' '.join(lemmatized))

    return lemmatized_plots

In [27]:
# Plain Python list of every remaining plot summary, in row order.
all_plots = df['Plot'].tolist()

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

plots = lemmatize(all_plots)

#term frequency = number of times a word appears in a document / number of words in document
#inverse document frequency = log base e(number of documents / number of documents with word in it)
#tf-idf = tf * idf

tfidf = TfidfVectorizer()
response = tfidf.fit_transform(plots)
print(response.shape)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

tfidf_df = pd.DataFrame(response.toarray(), columns=feature_names)

(5037, 13807)


In [29]:
# Render the tf-idf frame: one row per plot, one column per vocabulary term.
tfidf_df

Unnamed: 0,aardvark,aback,abandon,abandonment,abate,abatement,abbey,abdicate,abduct,abduction,...,zeta,zigzag,zing,zip,zit,zombie,zone,zoo,zoologist,zoom
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
5,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
6,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
8,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


### Now we have all plots lemmatized as "plots" and vectorized / weighted as "tfidf_df"

### Gensim Topic Modeling

In [30]:
from gensim import corpora, models, similarities

In [31]:
# Token lists per plot. split() with no argument returns [] for a plot that was emptied
# by the cleaning step, whereas split(' ') returned [''] and leaked an empty-string
# "word" into the gensim dictionary. Lemmas never contain whitespace, so non-empty
# plots tokenize exactly as before.
all_words = [plot.split() for plot in plots]

In [32]:
import pickle

# Map every distinct token to an integer id, then encode each plot as a bag of words.
dictionary = corpora.Dictionary(all_words)
corpus = [dictionary.doc2bow(text) for text in all_words]

# Persist both artefacts. The context manager guarantees the pickle file is flushed and
# closed; the original passed an anonymous open() handle that was never closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [33]:
import gensim
NUM_TOPICS = 30
# LDA training is stochastic; pin the seed so the learned topics, and the topic features
# derived from them, are the same on every Restart & Run All.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=10,
    random_state=42,
)
ldamodel.save('model5.gensim')

In [35]:
# Summarize topics by their 4 highest-weighted words; the result is displayed below.
topics = ldamodel.print_topics(num_words=4)
topics

[(1, '0.017*"life" + 0.012*"relationship" + 0.008*"two" + 0.008*"want"'),
 (25, '0.025*"crew" + 0.020*"ship" + 0.018*"earth" + 0.017*"alien"'),
 (18, '0.040*"war" + 0.015*"world" + 0.014*"force" + 0.013*"island"'),
 (14, '0.019*"police" + 0.014*"murder" + 0.014*"killer" + 0.011*"crime"'),
 (9, '0.016*"life" + 0.009*"get" + 0.008*"work" + 0.007*"find"'),
 (20, '0.018*"father" + 0.015*"mother" + 0.014*"find" + 0.013*"young"'),
 (24, '0.012*"black" + 0.009*"war" + 0.009*"mob" + 0.007*"make"'),
 (26, '0.010*"one" + 0.009*"fight" + 0.007*"save" + 0.007*"new"'),
 (10, '0.014*"champion" + 0.011*"smith" + 0.011*"van" + 0.010*"rocky"'),
 (5, '0.019*"family" + 0.016*"father" + 0.013*"young" + 0.012*"new"'),
 (27, '0.009*"woman" + 0.008*"sex" + 0.007*"sexual" + 0.007*"laura"'),
 (0, '0.017*"friend" + 0.017*"life" + 0.016*"get" + 0.012*"go"'),
 (19, '0.023*"jack" + 0.013*"find" + 0.013*"year" + 0.011*"human"'),
 (3, '0.017*"get" + 0.015*"one" + 0.011*"find" + 0.008*"take"'),
 (21, '0.012*"band" + 

In [36]:
# Document-topic matrix: one row per plot, one column per LDA topic, holding the topic's
# probability for that plot (0 where get_document_topics() does not report the topic).
# The array is filled in place and wrapped once; growing a DataFrame with .append() in a
# loop is quadratic and the method no longer exists in pandas 2.0+.
topic_matrix = np.zeros((len(corpus), ldamodel.num_topics))
for doc_idx, bow in enumerate(corpus):
    for topic_id, probability in ldamodel.get_document_topics(bow):
        topic_matrix[doc_idx, topic_id] = probability
tm = pd.DataFrame(topic_matrix)

In [38]:
# Append the topic columns to the tf-idf features. They are given string names
# ('topic_0', 'topic_1', ...) so that every feature name is a string: newer scikit-learn
# versions reject frames whose column names mix str and int.
tfidf_df = tfidf_df.join(tm.add_prefix('topic_'))

## Model Building

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Feature matrix and label vector, then a seeded 90/10 train/test split.
X, y = tfidf_df, df['binary_target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [77]:
# Fit a Gaussian naive Bayes classifier on the training split.
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [78]:
from sklearn import metrics

# Predictions on the data the model was fitted on: an optimistic, in-sample view.
x_train_preds = clf.predict(X_train)

print(metrics.classification_report(y_train, x_train_preds))

              precision    recall  f1-score   support

       False       0.81      1.00      0.90      2092
        True       1.00      0.80      0.89      2441

   micro avg       0.89      0.89      0.89      4533
   macro avg       0.91      0.90      0.89      4533
weighted avg       0.91      0.89      0.89      4533



In [79]:
# Overall accuracy on the training split.
metrics.accuracy_score(y_train, x_train_preds)

0.8934480476505625

In [80]:
# Predictions on the held-out split; compare with the training report to gauge overfitting.
binary_test_predictions = clf.predict(X_test)

print(metrics.classification_report(y_test, binary_test_predictions))

              precision    recall  f1-score   support

       False       0.54      0.60      0.57       241
        True       0.59      0.52      0.55       263

   micro avg       0.56      0.56      0.56       504
   macro avg       0.56      0.56      0.56       504
weighted avg       0.56      0.56      0.56       504



In [81]:
# Overall accuracy on the held-out split.
metrics.accuracy_score(y_test, binary_test_predictions)

0.5595238095238095

In [83]:
# np.argsort(clf.coef_)

### Dimensionality Reduction

In [65]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

# Reduce the training features to 100 components. Despite the `_pca` suffix on the
# result, the transform used here is TruncatedSVD, not PCA.
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_train_pca = svd.fit_transform(X_train)

# Sum of the per-component explained-variance ratios, i.e. the share of variance retained.
print(svd.explained_variance_ratio_.sum())

0.3190262791673567


In [74]:
# Shape of the reduced training matrix: (rows, components).
X_train_pca.shape

(4533, 100)