In [78]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk import FreqDist, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import enchant
english_d = enchant.Dict("en_US")
import re
import gensim
from gensim import corpora, models, similarities
import pickle

In [2]:
imdb = pd.read_csv('imdb_5000_movies.csv')

### Grab all of the movie codes from the 'imdb_movie_link' column:

In [3]:
imdb_ids = [imdb.iloc[i]['movie_imdb_link'].split('title/')[1].split('/?')[0] for i in range(len(imdb))]

### Insert code into omdb api and turn each one into its own text file

This is where we persist our API queries to files, so that we don't run into query limits and get locked out of our dataset. Our API key has been removed from this notebook.

In [4]:
# for id in imdb_ids:

#     request = requests.get(f'http://www.omdbapi.com/?i={id}&plot=full&apikey={API_KEY}').json()
#     text = str(request)  
    
#     f = open(f'movie_{id}', 'w+')
#     f.write(text)
#     f.close()

### Create new dataframe with relevant columns

Now that we've saved a bunch of files containing data about movies, we'll read them back into pandas in a tidy way:

In [5]:
#create empty dataframe with relevant columns

df = pd.DataFrame(columns=['Title', 'Year', 'ID', 'Plot', 'Genre', 'Production', 
                           'Director', 'Actor_1_name', 'Actor_1_fb_likes', 'Actor_2_name', 
                           'Actor_2_fb_likes', 'Actor_3_name', 'Actor_3_fb_likes', 'Budget', 
                           'Rated', 'Language', 'imdbRating'])

### Drawing from both a kaggle dataset and the OMdB API data we downloaded

In [6]:
for i in range(len(imdb_ids)):
    id = imdb.iloc[i]['movie_imdb_link'].split('title/')[1].split('/?')[0]
    x_file = open(os.path.join('Movies', f"movie_{id}"), "r")    #open up the movie's text file
    movie_text = x_file.readlines()[0]
    dict = eval(movie_text)    #turn string back to dictionary
    dict['Plot'] = dict['Plot'].replace("\'", "'")    #clean up
    df = df.append({'Title': dict['Title'], 'Year': dict['Year'], 'ID': id, 
                    'Plot': dict['Plot'], 'Genre': dict['Genre'], 
                    'imdbRating': dict['imdbRating'], 
                    'Director': imdb.iloc[i,:].loc['director_name'], 
                    'Actor_1_name':imdb.iloc[i,:].loc['actor_1_name'], 
                    'Actor_1_fb_likes':imdb.iloc[i,:].loc['actor_1_facebook_likes'], 
                    'Actor_2_name':imdb.iloc[i,:].loc['actor_2_name'], 
                    'Actor_2_fb_likes':imdb.iloc[i,:].loc['actor_2_facebook_likes'], 
                    'Actor_3_name':imdb.iloc[i,:].loc['actor_3_name'], 
                    'Actor_3_fb_likes':imdb.iloc[i,:].loc['actor_3_facebook_likes'], 
                    'Budget':imdb.iloc[i,:].loc['budget'], 'Language':dict['Language'], 
                    'Rated':dict['Rated']}, ignore_index=True)    #add to dataframe

### Parsing out Genres as one-hot Columns:

In [7]:
li = []
for i in range(len(df)):
    movie_genres = df.iloc[i]['Genre'].split(', ')
    li.append(movie_genres)
    
final_genres = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Sport', 'Thriller', 'War', 'Western']    

for genre in final_genres:
    list = []
    for movie in li:
        if genre in movie:
            list.append(1)
        else:
            list.append(0)
    df[genre] = list

### Remove movies with null plots and ratings

In [8]:
df = df[~((df['Plot'] == 'N/A')|(df['imdbRating'] == 'N/A'))] # Drops movies with null plots
df.imdbRating = df.imdbRating.astype(float)

In [9]:
df = df.reset_index()

In [10]:
df = df.drop(['index'], axis=1)

In [11]:
df[:3] # Our source data.

Unnamed: 0,Title,Year,ID,Plot,Genre,Production,Director,Actor_1_name,Actor_1_fb_likes,Actor_2_name,...,History,Horror,Musical,Mystery,Romance,Sci_Fi,Sport,Thriller,War,Western
0,Avatar,2009,tt0499549,"When his brother is killed in a robbery, parap...","Action, Adventure, Fantasy, Sci-Fi",,James Cameron,CCH Pounder,1000.0,Joel David Moore,...,0,0,0,0,0,0,0,0,0,0
1,Pirates of the Caribbean: At World's End,2007,tt0449088,"After Elizabeth, Will, and Captain Barbossa re...","Action, Adventure, Fantasy",,Gore Verbinski,Johnny Depp,40000.0,Orlando Bloom,...,0,0,0,0,0,0,0,0,0,0
2,Spectre,2015,tt2379713,A cryptic message from the past sends James Bo...,"Action, Adventure, Thriller",,Sam Mendes,Christoph Waltz,11000.0,Rory Kinnear,...,0,0,0,0,0,0,0,1,0,0


### Creating a Pristine and Beautiful NEW DataFrame

In [12]:
main_df = pd.DataFrame()

In [13]:
genres = df.iloc[:,-20:]
main_df['Year'] = [int(year.split('–')[0]) for year in df['Year'].values]
main_df = main_df.join(genres)

In [14]:
main_df[:3]

Unnamed: 0,Year,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,History,Horror,Musical,Mystery,Romance,Sci_Fi,Sport,Thriller,War,Western
0,2009,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2007,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2015,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
df.plot.scatter(x='Actor_1_fb_likes',
                    y='imdbRating')

<matplotlib.axes._subplots.AxesSubplot at 0x11b43e240>

### Log Transforming Actor Facebook Likes for use as a Feature

In [16]:
def log_transform_col(feature, dataframe):
    logged = pd.Series(np.log(dataframe[feature].values+1), name=feature+'_logged')
    return logged

actor_features = ['Actor_1_fb_likes', 'Actor_2_fb_likes','Actor_3_fb_likes']

actor_likes = [log_transform_col(actor_features[i], df) for i in range(len(actor_features))]

In [17]:
main_df = main_df.join(actor_likes)

In [18]:
main_df['ratings'] = df['imdbRating']

In [26]:
main_df[:3]

Unnamed: 0,Year,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,Romance,Sci_Fi,Sport,Thriller,War,Western,Actor_1_fb_likes_logged,Actor_2_fb_likes_logged,Actor_3_fb_likes_logged,ratings
0,2009,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6.908755,6.842683,6.75227,7.8
1,2007,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,10.59666,8.517393,6.908755,7.1
2,2015,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,9.305741,5.976351,5.087596,6.8


### Incorporating Natural Language Processing with Plot Synopses

#### Setting up Lemmatization / Normalization Functions

In [63]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(plot_list):
    lemmatized_plots = []
    for plot in plot_list:
        tokenized_lower = word_tokenize(plot.lower())   #make plot summary all lowercase and lemmatize
        
        tokenized_lower =[word for word in tokenized_lower if english_d.check(word)] # Make sure it's an english word
        
        lemmatizer = WordNetLemmatizer()
        
        dirty_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokenized_lower]    #lemmatize each word based on part of speech
        dirty_lemma_string = ' '.join(dirty_lemma)
        
        #filter for words that match regex pattern
        reg = re.compile((r"([a-zA-Z]+(?:'[a-z]+)?)"))    #define regular expression pattern
        lemmatized_regex = [word_lem for word_lem in dirty_lemma if word_lem in reg.findall(dirty_lemma_string)]
        
        #filter out stop words
        stop_words = set(stopwords.words('english'))
        lemmatized = [word_lem for word_lem in lemmatized_regex if not word_lem in stop_words]
        lemmatized_string = ' '.join(lemmatized)
        
        lemmatized_plots.append(lemmatized_string)
        
        
    return lemmatized_plots

In [64]:
all_plots = [plot for plot in df.loc[:,'Plot'].values] # Get all the plots.

### Running the Lemmatizer

In [75]:
plots = lemmatize(all_plots) # Lemmatize.

### Vectorizing Plots

In [76]:
#term frequency = number of times a word appears in a document / number of words in document
#inverse document frequency = log base e(number of ducuments / number of documents with word in it)
# tf:idf = tf * idf

tfidf = TfidfVectorizer()
response = tfidf.fit_transform(plots)
print(response.shape)

tfidf_df = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names())

(5037, 13807)


Now we have all plots lemmatized as "plots" and vectorized / weighted as "tfidf_df".

### Incorporating some LDA - Clustering Documents by Topic

We tried running a topic modeling algorithm over our corpus of text. "Gensim" clusters words that appear together frequently across several documents. The clusters can be interpreted as general themes, and each movie has weights of how much it belongs to each theme. These weights are then re-incorporated as features in our dataset.

In [81]:
all_words = [plot.split(' ') for plot in plots] # Just formatting our corpus how Gensim wants it

In [82]:
dictionary = corpora.Dictionary(all_words)
corpus = [dictionary.doc2bow(text) for text in all_words]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [83]:
NUM_TOPICS = 30 # This value was arbitrarily chosen.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=10) # Also arb
ldamodel.save('model5.gensim')

In [87]:
topics = ldamodel.print_topics(num_words=4)
topics # These are examples of some of the clusters created by Gensim.

[(3, '0.024*"murder" + 0.018*"police" + 0.015*"killer" + 0.014*"case"'),
 (17, '0.011*"bond" + 0.008*"world" + 0.008*"face" + 0.008*"one"'),
 (5, '0.012*"kill" + 0.009*"gay" + 0.008*"local" + 0.007*"seek"'),
 (16, '0.013*"team" + 0.010*"find" + 0.007*"father" + 0.007*"game"'),
 (11, '0.009*"dancer" + 0.008*"town" + 0.008*"faith" + 0.007*"box"'),
 (18, '0.011*"prince" + 0.008*"new" + 0.008*"neighborhood" + 0.008*"school"'),
 (19, '0.024*"peter" + 0.016*"one" + 0.016*"story" + 0.013*"world"'),
 (29, '0.009*"murder" + 0.009*"show" + 0.009*"death" + 0.008*"town"'),
 (13, '0.010*"terry" + 0.008*"game" + 0.007*"mob" + 0.007*"bos"'),
 (28, '0.015*"new" + 0.014*"film" + 0.013*"show" + 0.013*"record"'),
 (20, '0.025*"frank" + 0.023*"life" + 0.015*"year" + 0.011*"old"'),
 (25, '0.009*"new" + 0.008*"victor" + 0.007*"night" + 0.007*"food"'),
 (7, '0.019*"friend" + 0.017*"school" + 0.013*"get" + 0.011*"family"'),
 (15, '0.011*"group" + 0.009*"undercover" + 0.009*"molly" + 0.009*"lesbian"'),
 (23, '

In [90]:
tm = pd.DataFrame()
for i in range(len(corpus)):
    new_row = np.zeros(30)
    for toop in ldamodel.get_document_topics(corpus[i]): # These two lines are where you do what you need to do
        new_row[toop[0]] = toop[1]                       # to flip zeroes to ones if the genre appears
    tm = tm.append(pd.Series(new_row), ignore_index=1)

In [92]:
tm.head() # This is a DataFrame with the weights from the GenSim clustering.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.452292,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.137938,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.112485,0.0,0.11327,0.0,0.0,0.021481,0.0,0.143831,0.164722,...,0.0,0.020978,0.0,0.0,0.0,0.216339,0.028442,0.0,0.0,0.0
3,0.254436,0.0,0.0,0.1285,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.099967,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.105058,0.0,0.065696,0.0,0.0,0.48191,0.215647,0.0,...,0.0,0.0,0.0,0.115349,0.0,0.0,0.0,0.0,0.0,0.0


### Joining GenSim Results with Vectorized Plot Synopses

In [93]:
plots_and_topics = tfidf_df.join(tm)

In [95]:
plots_and_topics.head()

Unnamed: 0,aardvark,aback,abandon,abandonment,abate,abatement,abbey,abdicate,abduct,abduction,...,20,21,22,23,24,25,26,27,28,29
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.137938,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.020978,0.0,0.0,0.0,0.216339,0.028442,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.099967,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.115349,0.0,0.0,0.0,0.0,0.0,0.0
