In [None]:
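# Word2Vec token embeddings on the IMDB 5000 dataset (originally titled "Implementation of LDA"; the visible cells train Word2Vec and plot t-SNE, no LDA step appears)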

In [None]:
from __future__ import absolute_import, division, print_function
import multiprocessing, os, pprint, re
import nltk
import gensim.models.word2vec as word2vec
import sklearn.manifold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re, string

In [None]:
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Load the movie metadata from a CSV file given by a relative path.
df = pd.read_csv('movie_metadata.csv')

In [None]:
# Drop the IMDB link column. `axis` is passed by keyword because the
# positional form `drop(label, 1)` is rejected by pandas 2.x, while the
# keyword form works on old and new pandas alike.
df = df.drop('movie_imdb_link', axis=1)
def trim_and_remove_spaces(raw):
    """Strip surrounding whitespace from ``raw`` and turn inner spaces into underscores.

    Values that ``pd.isnull`` reports as null are not transformed; ``None``
    is returned for them.
    """
    if pd.isnull(raw):
        return None
    stripped = raw.strip()
    return stripped.replace(' ', '_')

# Rewrite each free-text column so that every value is a single token
# without spaces (see trim_and_remove_spaces).
for text_column in ('director_name', 'actor_2_name', 'actor_1_name',
                    'movie_title', 'actor_3_name', 'language', 'country'):
    df[text_column] = df[text_column].apply(trim_and_remove_spaces)

In [None]:
def descretize3(df, col):
    """Replace column ``col`` of ``df``, in place, with 'low'/'medium'/'high' labels.

    The labels come from ``pd.qcut`` with three quantile bins. Only columns
    whose dtype equals ``np.float64`` or ``np.int64`` are touched; any other
    column is left as it is. Returns ``None``.
    """
    dtype = df[col].dtype
    if not (dtype == np.float64 or dtype == np.int64):
        return
    terciles = pd.qcut(df[col], 3, labels=['low', 'medium', 'high'], retbins=False)
    # Labels are converted with str() and stripped of spaces, as tokens must not contain blanks.
    df[col] = terciles.apply(lambda label: str(label).replace(" ", ""))
        
def descretize2(df, col):
    """Replace column ``col`` of ``df``, in place, with 'low'/'high' labels.

    The labels come from ``pd.qcut`` with two quantile bins. Only columns
    whose dtype equals ``np.float64`` or ``np.int64`` are touched; any other
    column is left as it is. Returns ``None``.
    """
    dtype = df[col].dtype
    if not (dtype == np.float64 or dtype == np.int64):
        return
    halves = pd.qcut(df[col], 2, labels=['low', 'high'], retbins=False)
    # Labels are converted with str() and stripped of spaces, as tokens must not contain blanks.
    df[col] = halves.apply(lambda label: str(label).replace(" ", ""))
        
def descretize10(df, col):
    """Replace column ``col`` of ``df``, in place, with decile labels '0'..'9'.

    The labels come from ``pd.qcut`` with ten quantile bins ('0' is the
    lowest bin, '9' the highest). Only columns whose dtype equals
    ``np.float64`` or ``np.int64`` are touched; any other column is left as
    it is. Returns ``None``.
    """
    if(df[col].dtype == np.float64 or df[col].dtype == np.int64):
        decile_labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        # The number of quantiles must equal the number of labels; asking for
        # 3 bins with 10 labels makes pd.qcut raise ValueError.
        df[col] = pd.qcut(df[col], len(decile_labels), labels=decile_labels, retbins=False)
        df[col] = df[col].apply(lambda x : (str(x)).replace(" ", ""))

# Which binning function to apply to which column, in execution order.
# descretize3 yields low/medium/high labels, descretize2 yields low/high.
binning_plan = [
    (descretize3, 'director_facebook_likes'),
    (descretize3, 'num_critic_for_reviews'),
    (descretize3, 'duration'),
    (descretize3, 'actor_3_facebook_likes'),
    (descretize3, 'actor_1_facebook_likes'),
    (descretize3, 'gross'),
    (descretize3, 'num_voted_users'),
    (descretize3, 'cast_total_facebook_likes'),
    (descretize3, 'num_user_for_reviews'),
    (descretize3, 'budget'),
    (descretize3, 'actor_2_facebook_likes'),
    (descretize3, 'imdb_score'),
    (descretize2, 'facenumber_in_poster'),
    (descretize3, 'title_year'),
    (descretize3, 'aspect_ratio'),
    (descretize2, 'movie_facebook_likes'),
]

for discretize, column_name in binning_plan:
    discretize(df, column_name)

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed random_state so that a fresh
# "Restart & Run All" yields the same row order (and therefore the same
# training input) every time. 123 is the same value used for the Word2Vec seed.
df = shuffle(df, random_state=123)
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Build one space-separated "sentence" of tokens per movie (one per row of df).
raw_sentences = []
for _, row in df.iterrows():
    words = []
    # Series.items() is used instead of Series.iteritems(): the latter was
    # removed in pandas 2.0, while items() exists in old and new pandas.
    for column, value in row.items():
        if column == 'genres':
            # Pipe-separated list: emit a 'genre' marker before every genre.
            for g in str(value).split("|"):
                words.append('genre')
                words.append(trim_and_remove_spaces(g.lower()))
        elif column == 'plot_keywords':
            # Pipe-separated list: emit a 'plot' marker before every keyword.
            for p in str(value).split("|"):
                words.append('plot')
                words.append(trim_and_remove_spaces(p.lower()))
        elif column in ['director_name', 'actor_2_name', 'actor_1_name', 'actor_3_name']:
            # People: the column name and the person are two separate tokens.
            words.append(str(column))
            words.append(str(value))
        else:
            # Everything else becomes a single '<column>_<value>' token.
            words.append(str(column) + "_" + str(value))
    raw_sentences.append(' '.join(words))

In [None]:
def sentence_to_wordlist(raw):
    """Split ``raw`` on runs of whitespace and return the resulting list of tokens."""
    return raw.split()

# Tokenize every non-empty raw sentence; empty strings are skipped.
tokenized_sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [None]:
# Total number of tokens over all tokenized sentences.
token_count = sum(map(len, tokenized_sentences))
print('Total token count = {0:,}'.format(token_count))

In [132]:
# Spot-check the token list stored at position 51.
tokenized_sentences[51]

['color_Color',
 'director_name',
 'Peter_Berg',
 'num_critic_for_reviews_medium',
 'duration_medium',
 'director_facebook_likes_high',
 'actor_3_facebook_likes_high',
 'actor_2_name',
 'Rosario_Dawson',
 'actor_1_facebook_likes_high',
 'gross_high',
 'genre',
 'action',
 'genre',
 'adventure',
 'genre',
 'comedy',
 'genre',
 'thriller',
 'actor_1_name',
 'Dwayne_Johnson',
 'movie_title_The_Rundown',
 'num_voted_users_high',
 'cast_total_facebook_likes_high',
 'actor_3_name',
 'Ewen_Bremner',
 'facenumber_in_poster_low',
 'plot',
 'amazon',
 'plot',
 'bounty_hunter',
 'plot',
 'fight',
 'plot',
 'hunter',
 'plot',
 'jungle',
 'num_user_for_reviews_medium',
 'language_English',
 'country_USA',
 'content_rating_PG-13',
 'budget_high',
 'title_year_medium',
 'actor_2_facebook_likes_high',
 'imdb_score_medium',
 'aspect_ratio_medium',
 'movie_facebook_likes_low']

In [133]:
# Hyperparameters and training

seed = 123                                    # passed as `seed`
num_features = 128                            # passed as `size`
context_size = 10                             # passed as `window`
min_word_count = 1                            # passed as `min_count`
downsampling = 1e-3                           # passed as `sample`
num_workers = multiprocessing.cpu_count()     # passed as `workers`: one per CPU

# NOTE(review): `size` and `iter` are the keyword names of gensim releases
# before 4.0 -- confirm against the installed gensim version.
model = word2vec.Word2Vec(
    tokenized_sentences,
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    sg=0,  # for CBOW
    iter=10,
    workers=num_workers,
    seed=seed
)

In [None]:
# Compress the model to 2D
# Take the embedding matrix held by the trained model ...
all_word_vectors_matrix = model.wv.syn0
# ... and project it to two components with t-SNE (fixed random_state).
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
colors = {'genre': 1, 'plot': 1, 'director':3}
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in [
            (word, all_word_vectors_matrix_2d[model.wv.vocab[word].index])
            for word in model.wv.vocab
        ]
    ],
    columns=["word", "x", "y"]
)

%pylab inline



In [None]:
# Add a 'color' column to points, set to 0 for every word.
points['color'] = 0

In [None]:
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")

# Scatter of every word in the 2-D t-SNE space (fit_reg=False: no regression line).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.lmplot(x='x', y='y',
           data=points,
           fit_reg=False,
           scatter_kws={"marker": "D",
                        "s": 10})
plt.title('t-SNE')
plt.xlabel('X')
plt.ylabel('Y');

In [None]:
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words of ``points`` that lie inside a rectangle.

    x_bounds: pair whose items 0 and 1 are the inclusive lower/upper x limits.
    y_bounds: pair whose items 0 and 1 are the inclusive lower/upper y limits.

    Reads the notebook-level ``points`` frame (columns ``word``, ``x``, ``y``)
    and returns ``None``.
    """
    x_low, x_high = x_bounds[0], x_bounds[1]
    y_low, y_high = y_bounds[0], y_bounds[1]

    inside_x = (points.x >= x_low) & (points.x <= x_high)
    inside_y = (points.y >= y_low) & (points.y <= y_high)
    region = points[inside_x & inside_y]

    ax = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Write each word next to its marker, nudged slightly up and to the right.
    for _, row in region.iterrows():
        ax.text(row.x + 0.005, row.y + 0.005, row.word, fontsize=11)

In [None]:
# Plot and label the words whose coordinates fall in [0.0, 0.2] x [0.0, 0.2].
plot_region(x_bounds=(0.0, 0.2), y_bounds=(0.0, 0.2))

In [179]:
# Show the embedding vector of one token. The original cell was cut off after
# `model.wv.` (a SyntaxError on re-run); 'Brad_Pitt' is used here as an example
# token because later cells query it -- substitute any vocabulary token.
model.wv['Brad_Pitt']

array([-0.13049535,  0.05881098,  0.12270202,  0.09838531,  0.13205636,
        0.06448915, -0.20486678,  0.00444005,  0.21556512, -0.0716982 ,
        0.00519379, -0.2883549 ,  0.11012977, -0.06077716,  0.07668479,
        0.26939711,  0.23944943, -0.16256697,  0.14142168,  0.11269884,
        0.11095925, -0.09057152, -0.11743756, -0.16584449,  0.07440011,
       -0.06001869,  0.2445102 ,  0.18840589,  0.34108984, -0.08446051,
        0.06635012,  0.02845452,  0.21246713,  0.14055717,  0.00928075,
        0.03632695, -0.17384207, -0.23710252, -0.12127564,  0.01456438,
        0.22097614,  0.00709927,  0.2507517 , -0.0772662 , -0.20985299,
        0.06874697,  0.12411286,  0.08478492,  0.03403108, -0.14745893,
       -0.23782755,  0.23149908, -0.19273543,  0.01799721,  0.20531186,
        0.1835835 , -0.00243864, -0.16327122,  0.18075036,  0.18875886,
       -0.40238693, -0.28551352,  0.18977487,  0.04106176, -0.0267708 ,
        0.25772169,  0.05636012, -0.09839606,  0.08687349,  0.06

In [289]:
def likeBradPitt(v):
    """Return whether token ``v`` is similar to, but not equal to, 'Brad_Pitt'.

    "Similar" means ``model.wv.similarity(v, 'Brad_Pitt')`` is above 0.8.
    Returns False for missing values (``None``, NaN, empty string), for
    'Brad_Pitt' itself, and for tokens the model does not know.
    """
    # NaN is truthy, so `not v` alone would let it through to the lookup.
    if pd.isnull(v) or not v:
        return False
    if str(v) == 'Brad_Pitt':
        return False
    try:
        return model.wv.similarity(v, 'Brad_Pitt') > 0.8
    except KeyError:
        # Token is not in the model's vocabulary.
        return False

def likeToken(v, token):
    """Return whether ``model.wv.similarity(v, token)`` is above 0.5.

    Best-effort: if the similarity lookup fails (for example because a token
    is not in the vocabulary) the result is False rather than an error.
    """
    try:
        return model.wv.similarity(v, token) > 0.5
    except Exception:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt
        # and SystemExit still propagate and a long-running cell can be stopped.
        return False

In [291]:
# Rows whose title token is similar to 'comedy' (per likeToken) and whose
# first-billed actor is similar to Brad Pitt (per likeBradPitt).
# str(y) mirrors how the 'movie_title_<value>' tokens were built and avoids a
# TypeError from string concatenation when a title is missing.
title_like_comedy = df.movie_title.apply(lambda y: likeToken('movie_title_' + str(y), 'comedy'))
lead_like_brad_pitt = df.actor_1_name.apply(likeBradPitt)
df[title_like_comedy & lead_like_brad_pitt]

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
2393,Color,Penny_Marshall,medium,high,high,high,Jon_Lovitz,high,high,Comedy|Drama|Family|Fantasy|Romance,Tom_Hanks,Big,high,high,John_Heard,low,friend|job|new york city|toy|wish,medium,English,USA,PG,medium,low,high,high,low,low
3279,Color,Floria_Sigismondi,high,medium,medium,high,Scout_Taylor-Compton,high,low,Biography|Drama|Music,Kristen_Stewart,The_Runaways,medium,high,Johnny_Lewis,high,band|box office flop|critically bashed|histori...,medium,English,USA,R,low,high,high,medium,medium,low
1501,Color,Taylor_Hackford,low,high,high,high,Jesse_Borrego,medium,low,Crime|Drama,Delroy_Lindo,"Blood_In,_Blood_Out",medium,medium,Raymond_Cruz,high,1970s|1980s|barrio|gang war|mexican,medium,English,USA,R,high,low,medium,high,low,high
1574,Color,Ariel_Vromen,medium,high,medium,low,Jordi_Mollà,high,medium,Action|Crime|Drama|Mystery|Sci-Fi|Thriller,Gary_Oldman,Criminal,medium,high,Doug_Cockle,high,cia|husband wife relationship|memory|tied up w...,medium,English,UK,R,medium,high,high,medium,medium,low
2929,Color,Mike_Judge,low,low,high,high,Demi_Moore,high,high,Adventure|Animation|Comedy|Crime,Bruce_Willis,Beavis_and_Butt-Head_Do_America,medium,high,John_Doman,low,beavis and butt head|fbi|score|television|tele...,medium,English,USA,PG-13,medium,low,high,medium,low,low
2133,Color,Trey_Parker,high,low,high,medium,Eric_Idle,medium,high,Animation|Comedy|Fantasy|Musical,Minnie_Driver,South_Park:_Bigger_Longer_&_Uncut,high,medium,Trey_Parker,low,boy|canada|hell|misunderstanding|refrence to l...,high,English,USA,R,medium,low,medium,high,low,low
1989,Color,John_Hillcoat,high,medium,high,high,Charlize_Theron,high,low,Adventure|Drama,Viggo_Mortensen,The_Road,high,high,Robert_Duvall,high,apocalypse|boy|food|pistol|survival,high,English,USA,R,medium,medium,high,high,medium,high
3405,Color,Anand_Tucker,medium,low,low,medium,Jim_Broadbent,high,low,Biography|Drama,Colin_Firth,When_Did_You_Last_See_Your_Father?,low,high,Gina_McKee,high,cancer|children|death|terminal cancer|time,low,English,UK,PG-13,low,medium,high,medium,medium,high
1266,Color,Steven_Brill,medium,medium,medium,medium,Shaun_Weiss,medium,medium,Comedy|Drama,Lisa_Ann_Walter,Drillbit_Taylor,medium,medium,Matt_Walsh,low,bodyguard|bully|generation y|high school|nerd,medium,English,USA,PG-13,high,medium,medium,low,medium,high
2629,Color,Michael_Cimino,medium,high,high,high,Meryl_Streep,high,,Drama|War,Robert_De_Niro,The_Deer_Hunter,high,high,John_Savage,high,escape|friend|party|pittsburgh steelers|vietnam,high,English,UK,R,medium,low,high,high,medium,high
