# Milestone 2

In [None]:
#pip install nltk
#pip install vaderSentiment

In [54]:
#imports

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from nltk import tokenize
from itertools import groupby
import seaborn as sns
import matplotlib.pyplot as plt

In [55]:
df_plots = pd.read_csv('MovieSummaries/plot_summaries.txt', sep = '\t', header=None, names = ['id', 'plot'])

In [56]:
df_plots.shape[0]

42303

In [57]:
df_plots.head(5)

Unnamed: 0,id,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [58]:
#tranform plots into arrays of sentences using the ntlk library
df_plots['plot']= df_plots['plot'].apply(lambda t: tokenize.sent_tokenize(t))
df_plots.head(2)

Unnamed: 0,id,plot
0,23890098,"[Shlykov, a hard-working taxi driver and Lyosh..."
1,31186339,[The nation of Panem consists of a wealthy Cap...


In [59]:
analyzer = SentimentIntensityAnalyzer()

def classify(compound):
    if compound >= 0.05:
        #positive
        return 1
    elif compound <= -0.05:
        #negative
        return -1
    #neutral
    return 0

def analyse_text(text): 
    res = []
    for sentence in text:
        vs = analyzer.polarity_scores(sentence)
        res.append(classify(vs['compound']))
    return res

In [60]:
df_plots['plot'] = df_plots['plot'].apply(lambda t: analyse_text(t))

In [61]:
df_plots.head(5)

Unnamed: 0,id,plot
0,23890098,[0]
1,31186339,"[0, -1, 1, 0, 0, -1, -1, 1, 1, 1, 0, -1, -1, 0..."
2,20663735,"[-1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 0, -1, -1..."
3,2231378,"[-1, 1, 1, -1, -1, 0, 0, 1, -1, -1, 1, 0, -1, ..."
4,595909,"[1, 1, 1, -1, 1, 0, -1, 1, -1, -1, -1, 1, -1, ..."


In [62]:
#group consecutives duplicates
#Example : l = [1,1,2,2,2,1] becomes [(1,2), (2,3), (1,1)]

df_plots['plot'] = df_plots['plot'].apply(lambda t: [(k, sum(1 for i in g)) for k,g in groupby(t)])

In [12]:
df_plots.head(5)

Unnamed: 0,id,plot
0,23890098,"[(0, 1)]"
1,31186339,"[(0, 1), (-1, 1), (1, 1), (0, 2), (-1, 2), (1,..."
2,20663735,"[(-1, 1), (1, 1), (-1, 5), (1, 3), (0, 1), (-1..."
3,2231378,"[(-1, 1), (1, 2), (-1, 2), (0, 2), (1, 1), (-1..."
4,595909,"[(1, 3), (-1, 1), (1, 1), (0, 1), (-1, 1), (1,..."


In [26]:
df_plots.head(5)

Unnamed: 0,id,plot
0,23890098,[0]
1,31186339,"[0, -1, 1, 0, 0, -1, -1, 1, 1, 1, 0, -1, -1, 0..."
2,20663735,"[-1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 0, -1, -1..."
3,2231378,"[-1, 1, 1, -1, -1, 0, 0, 1, -1, -1, 1, 0, -1, ..."
4,595909,"[1, 1, 1, -1, 1, 0, -1, 1, -1, -1, -1, 1, -1, ..."


In [63]:
df_metadatas = pd.read_csv('MovieSummaries/movie.metadata.tsv',sep='\t', names=["id", "FreebaseId", "Title", "release date", "boxOffice", "Runtime", "language", "country", "genres"])
df_metadatas.head(5)

Unnamed: 0,id,FreebaseId,Title,release date,boxOffice,Runtime,language,country,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [64]:
df = df_metadatas.merge(df_plots, left_on="id", right_on='id')
df.head(5)

Unnamed: 0,id,FreebaseId,Title,release date,boxOffice,Runtime,language,country,genres,plot
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","[(1, 2), (-1, 2), (1, 1), (0, 1), (-1, 3), (0,..."
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...","[(-1, 1), (1, 2), (-1, 3), (0, 2), (-1, 2), (1..."
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}","[(-1, 1), (0, 2), (1, 1), (0, 3), (1, 1), (-1,..."
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","[(-1, 2), (0, 1), (1, 2), (-1, 2)]"
4,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","[(0, 1), (-1, 1), (1, 1), (0, 1), (-1, 2), (0,..."


In [65]:
df = df[['id', 'plot', 'release date', 'Title', 'boxOffice']]

In [66]:
df.head(5)

Unnamed: 0,id,plot,release date,Title,boxOffice
0,975900,"[(1, 2), (-1, 2), (1, 1), (0, 1), (-1, 3), (0,...",2001-08-24,Ghosts of Mars,14010832.0
1,9363483,"[(-1, 1), (1, 2), (-1, 3), (0, 2), (-1, 2), (1...",1987,White Of The Eye,
2,261236,"[(-1, 1), (0, 2), (1, 1), (0, 3), (1, 1), (-1,...",1983,A Woman in Flames,
3,18998739,"[(-1, 2), (0, 1), (1, 2), (-1, 2)]",2002,The Sorcerer's Apprentice,
4,6631279,"[(0, 1), (-1, 1), (1, 1), (0, 1), (-1, 2), (0,...",1997-04-04,Little city,


In [32]:
df_titles = pd.read_csv('MovieSummaries/titles.tsv', sep = '\t', low_memory=False)

In [33]:
df_title = df_titles[df_titles['titleType'] == 'movie']

In [34]:
df_title.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama


In [35]:
df_ratings = pd.read_csv('MovieSummaries/ratings.tsv', sep = '\t')

In [36]:
df_ratings.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1922
1,tt0000002,5.8,259
2,tt0000003,6.5,1734
3,tt0000004,5.6,174
4,tt0000005,6.2,2545


In [37]:
df_imdb = df_title.merge(df_ratings, left_on='tconst', right_on='tconst')

In [40]:
df_imdb= df_imdb[['originalTitle', 'averageRating', 'numVotes']]

In [41]:
df_imdb.head(5)

Unnamed: 0,originalTitle,averageRating,numVotes
0,Miss Jerry,5.2,200
1,Bohemios,4.2,14
2,The Story of the Kelly Gang,6.0,794
3,L'enfant prodigue,5.1,20
4,Robbery Under Arms,4.3,23


In [42]:
#Need to verify if Title and originalTitle can be used to merge
df = df.merge(df_imdb, left_on='Title', right_on='originalTitle')

In [50]:
df = df[['Title', 'plot', 'release date', 'averageRating', 'numVotes']]

In [51]:
df.shape[0]

10773

In [67]:
#Remove rows where box office is Nan
df = df.loc[pd.notnull(df.boxOffice)]

In [68]:
# Throws an error if we try to convert 1010-12-02 into DateTime 
df = df[df['release date'] != '1010-12-02']

In [69]:
df['release date'] = pd.to_datetime(df['release date'])
df['release date'] = df['release date'].dt.year

In [70]:
df.head(5)

Unnamed: 0,id,plot,release date,Title,boxOffice
0,975900,"[(1, 2), (-1, 2), (1, 1), (0, 1), (-1, 3), (0,...",2001.0,Ghosts of Mars,14010832.0
5,171005,"[(0, 1), (1, 1), (-1, 1)]",1989.0,Henry V,10161099.0
8,77856,"[(0, 1), (1, 1), (-1, 2), (1, 1), (-1, 2), (1,...",1964.0,Mary Poppins,102272727.0
12,156558,"[(0, 1), (1, 1), (-1, 1), (0, 1), (-1, 1), (1,...",2001.0,Baby Boy,29381649.0
22,261237,"[(1, 2), (-1, 1), (0, 1), (-1, 1), (0, 1), (-1...",1980.0,The Gods Must Be Crazy,34331783.0
