In [3]:
# lets start by getting the data
import csv
import math
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
import urllib2
import json
import re
import os
import numpy as np
from datetime import datetime
import sys

# Purpose

In this notebook we gather additional data to support our analysis of the Kaggle movie dataset. The additional data is the following:

- **IMDb rating for each movie in the dataset**
- **IMDb storyline for each movie in the dataset**
- **All manuscripts downloaded from the www.imsdb.com website, 1116 movie manuscripts in total**



**Note:** We also downloaded the movie reviews from the website http://ai.stanford.edu/~amaas/data/sentiment/. The code as well as the analysis of the movie reviews can be found in the notebook 'Sentiment_reviews'.

In [4]:
# Titles excluded from the analysis: either no IMDb rating could be found for
# them, or they are missing crucial data in the Kaggle dataset.
# NOTE(review): this list is not referenced by any cell visible in this part
# of the notebook -- presumably it is applied in a later step; confirm.
movies_to_delete =[
    'National Lampoon’s Van Wilder',
    'Bran Nue Dae',
    'Pokémon: Spell of the Unknown',
    'Alien³',
    'Guten Tag, Ramón'
]

In [5]:
# Read tmdb_5000_movies.csv row by row and index the rows by movie title.
# Each value is the csv.DictReader row (column name -> value) of that movie;
# when two rows share a title, the later row replaces the earlier one.

filepath = "/Users/GretarAtli/Documents/GitHub/Dtu/Dtu-SocialGraphs-FinalProject/Data/tmdb-5000-movie-dataset/tmdb_5000_movies.csv"
tmdb_5000_movies = defaultdict(dict)

with open(filepath) as csvfile:
    reader = csv.DictReader(csvfile, delimiter=",")
    for row in reader:
        tmdb_5000_movies[row["title"]] = row

# NOTE: this cell used to end with a loop that built a URL-style slug for every
# title (spaces -> '-', parentheses removed) and immediately discarded it. The
# loop had no effect on tmdb_5000_movies, so it has been removed.

In [6]:
# Read tmdb_5000_credits.csv and build a title -> row lookup for the credits
# table (a later row with the same title overwrites the earlier one).
filepath = "/Users/GretarAtli/Documents/GitHub/Dtu/Dtu-SocialGraphs-FinalProject/Data/tmdb-5000-movie-dataset/tmdb_5000_credits.csv"
tmdb_5000_credits = defaultdict(dict)

with open(filepath) as csvfile:
    tmdb_5000_credits.update(
        (record["title"], record)
        for record in csv.DictReader(csvfile, delimiter=",")
    )

# Scrape the IMDb website for IMDb ratings

The following code uses the BeautifulSoup module in python to get the IMDb rating for each movie in our Kaggle dataset. The code scrapes the [IMDb](http://www.imdb.com/) website and collects the rating for each movie and saves it to a dictionary. We then save the dictionary as a json file.

In [None]:
import os
import sys
import re
import urllib
import urlparse
 
from mechanize import Browser
from BeautifulSoup import BeautifulSoup

# title of the movie
title = "Avatar"

def getImdbRatings(title):
    """Look up a movie on IMDb by title and return its rating.

    The title is sent to the IMDb title search. If the search redirects
    straight to a title page that page is parsed; otherwise the first link
    pointing at a title page is followed and the resulting page is parsed.

    Parameters
    ----------
    title : str
        Movie title to search for.

    Returns
    -------
    The first child of the first <span itemprop="ratingValue"> on the page
    (the rating as shown on IMDb), None when the page has no such span, or 0
    when the lookup fails for any reason (network error, no search hit,
    unexpected page layout).
    """
    # constant
    BASE_URL = 'http://www.imdb.com'

    # IMDB rating of the movie; stays None if the page carries no rating
    rating = None
    br = Browser()

    try:
        # URL-encode the query so characters such as '&' in a title do not cut
        # the search string short; runs of whitespace collapse to a single '+'.
        movie = urllib.quote_plus(' '.join(title.split()))
        url = "%s/find?s=tt&q=%s" % (BASE_URL, movie)
        res = br.open(url)

        if re.search(r'/title/tt.*', br.geturl()):
            # The search redirected directly to a title page: parse the
            # response we already hold. (The previous code referenced an
            # undefined name here, so this branch always failed.)
            url = "%s://%s%s" % urlparse.urlparse(br.geturl())[:3]
            print(url)
            soup = BeautifulSoup(res.read())
        else:
            link = br.find_link(url_regex = re.compile(r'/title/tt.*'))
            res = br.follow_link(link)
            url = urlparse.urljoin(BASE_URL, link.url)
            print(url)
            soup = BeautifulSoup(res.read())

        # Touch the <h1> heading: a page without one raises here, which makes
        # the lookup fail (return 0) exactly as before.
        soup.find('h1').contents[0].strip()

        for span in soup.findAll('span'):
            if span.has_key('itemprop') and span['itemprop'] == 'ratingValue':
                rating = span.contents[0]
                break
        return rating
    except Exception:
        # Best effort: any failure is reported as 0 so the calling loop can
        # carry on; the affected movies are patched manually afterwards.
        return 0

# Scrape the IMDb rating of every movie in the Kaggle dataset.
# Keys are the values of the dataset's 'id' column, values are whatever
# getImdbRatings returned for the movie.
Imdb_5000_movies = defaultdict(float)

for title in sorted(tmdb_5000_movies.keys()):
    # Search with the parentheses stripped from the title. title_to_use was
    # previously computed and printed but the raw title was searched instead.
    title_to_use = title.replace("(","").replace(")","")
    print(title_to_use)
    imdb_rating = getImdbRatings(title_to_use)
    print(imdb_rating)
    # The dataset lookup still uses the original, unmodified title.
    Imdb_5000_movies[tmdb_5000_movies[title]['id']] = imdb_rating

**Save IMDb ratings to file**

Some movies failed when we scraped the IMDb website, so we add their IMDb values manually here. If IMDb did not have any information about a movie, we simply assign the value 'None' to it and ignore it in the analysis. There are only 5 such movies, so we did not consider this to be a major problem.

In [53]:
# Manual corrections for the movies whose rating could not be scraped.
# Keys are movie ids; the string 'None' marks a movie without any IMDb rating.
Imdb_5000_movies_withid.update({
    "161795": '7.2',
    "8859": '5.5',
    "11452": 'None',
    "10681": '8.4',
    "10991": 'None',
    "37137": '6.8',
    "18480": 'None',
    "8077": 'None',
    "10664": '7.5',
    "11661": '7.8',
    "82695": '7.6',
    "333355": '8.0',
    "41009": '6.4',
    "1391": '7.7',
    "242575": 'None',
    "36593": '6.5',
    "335244": '6.9',
    "64499": '7.7',
    "38570": '2.6',
    "301325": '3.1',
    "194": '8.3',
    "304410": '7.4',
    "11011": '5.3',
})

Saving the result as a json file. 

In [56]:
import json

# Destination file for the IMDb ratings dictionary (keyed by movie id).
result_file_uri = "/Users/GretarAtli/Dropbox/SocialGraph/results/imdb-score-mod.json"

# The dump below is deliberately left commented out so that the write is not
# repeated every time the notebook is run.
#with open(result_file_uri, 'w') as fp:
#    json.dump(Imdb_5000_movies_withid, fp)


In [55]:
# Number of entries in the ratings dictionary
len(Imdb_5000_movies_withid)

4797

**Get IMDb rating from file**

In [61]:
def findMovieNameFromId(id):
    """Return the title under which the movie with the given id is stored.

    Scans tmdb_5000_movies and returns the key of the first entry whose 'id'
    field equals `id`, stopping at the first match.

    Parameters
    ----------
    id : str
        Movie id, compared for equality with each entry's 'id' field.

    Raises
    ------
    IndexError
        If no entry has that id. This is the same exception type the earlier
        list-indexing implementation raised, now with a descriptive message.
    """
    for name, info in tmdb_5000_movies.items():
        if info['id'] == id:
            return name
    raise IndexError("no movie with id {!r} in tmdb_5000_movies".format(id))

In [62]:
result_file_uri = "/Users/GretarAtli/Dropbox/SocialGraph/results/imdb-score-mod.json"

# Getting Imdb scores from data and convert the id to name.
# The context manager makes sure the file handle is closed again.
with open(result_file_uri) as json1_file:
    json1_str = json1_file.read()
Imdb_5000_movies_withid = json.loads(json1_str)

Imdb_5000_movies = defaultdict(float)

# change the dictionary key from id to name of the movie
for key,value in Imdb_5000_movies_withid.items():
    try:
        name = findMovieNameFromId(key)
    except LookupError:
        # Only a failed lookup means the movie is gone from the dataset;
        # anything else is a real error and should not be hidden.
        print("The movie {} has been removed from kaggle database".format(key))
    else:
        Imdb_5000_movies[name] = value

# Scrape the IMDb website for IMDb story line

The following code uses the BeautifulSoup module in python to get the IMDb storyline for each movie in our Kaggle database. The code scrapes the [IMDb](http://www.imdb.com/) website and collects the storyline for each movie and saves it to a dictionary. We then save the dictionary as a json file.  

In [4]:
import os
import sys
import re
import urllib
import urlparse
 
from mechanize import Browser
from BeautifulSoup import BeautifulSoup

# title of the movie
title = "Avatar"

def getImdbStoryLine(title):
    """Look up a movie on IMDb by title and return its storyline text.

    The title is sent to the IMDb title search. If the search redirects
    straight to a title page that page is parsed; otherwise the first link
    pointing at a title page is followed and the resulting page is parsed.

    Parameters
    ----------
    title : str
        Movie title to search for.

    Returns
    -------
    The text of the <p> inside the first
    <div class="inline canwrap" itemprop="description"> of the page, None when
    the page has no such div, or 0 when the lookup fails for any reason
    (the exception is printed).
    """
    # constant
    BASE_URL = 'http://www.imdb.com'

    br = Browser()

    try:
        # URL-encode the query so characters such as '&' in a title do not cut
        # the search string short; runs of whitespace collapse to a single '+'.
        movie = urllib.quote_plus(' '.join(title.split()))
        url = "%s/find?s=tt&q=%s" % (BASE_URL, movie)
        res = br.open(url)

        if re.search(r'/title/tt.*', br.geturl()):
            # The search redirected directly to a title page: parse the
            # response we already hold. (The previous code referenced an
            # undefined name here, so this branch always failed.)
            url = "%s://%s%s" % urlparse.urlparse(br.geturl())[:3]
            print(url)
            soup = BeautifulSoup(res.read())
        else:
            link = br.find_link(url_regex = re.compile(r'/title/tt.*'))
            res = br.follow_link(link)
            url = urlparse.urljoin(BASE_URL, link.url)
            print(url)
            soup = BeautifulSoup(res.read())

        # Touch the <h1> heading: a page without one raises here, which makes
        # the lookup fail (return 0) exactly as before.
        soup.find('h1').contents[0].strip()

        story_blocks = soup.findAll('div',{"class":"inline canwrap","itemprop":"description"})
        if story_blocks:
            # Only the first storyline block is used.
            return story_blocks[0].find("p").text

        # Page parsed fine but carries no storyline.
        return None
    except Exception as e:
        # Best effort: report the problem and signal failure with 0 so the
        # calling loop can carry on with the next movie.
        print(e)
        return 0

# Scrape the IMDb storyline of every movie in the Kaggle dataset.
# Keys are the values of the dataset's 'id' column, values are whatever
# getImdbStoryLine returned for the movie.
Imdb_5000_storyline = defaultdict(str)

for title in sorted(tmdb_5000_movies.keys()):
    # Search with the parentheses stripped from the title. title_to_use was
    # previously computed and printed but the raw title was searched instead.
    title_to_use = title.replace("(","").replace(")","")
    print(title_to_use)
    imdb_storyline = getImdbStoryLine(title_to_use)
    # The dataset lookup still uses the original, unmodified title.
    Imdb_5000_storyline[tmdb_5000_movies[title]['id']] = imdb_storyline

#Horror

500 Days of Summer
http://www.imdb.com/title/tt1022603/?ref_=fn_tt_tt_1
10 Cloverfield Lane
http://www.imdb.com/title/tt1179933/?ref_=fn_tt_tt_1
10 Days in a Madhouse
http://www.imdb.com/title/tt3453052/?ref_=fn_tt_tt_1
10 Things I Hate About You
http://www.imdb.com/title/tt0147800/?ref_=fn_tt_tt_1
102 Dalmatians
http://www.imdb.com/title/tt0211181/?ref_=fn_tt_tt_1
10th & Wolf
http://www.imdb.com/title/tt0360323/?ref_=fn_tt_tt_1
11:14
http://www.imdb.com/title/tt0331811/?ref_=fn_tt_tt_1
12 Angry Men
http://www.imdb.com/title/tt0050083/?ref_=fn_tt_tt_1
12 Rounds
http://www.imdb.com/title/tt1160368/?ref_=fn_tt_tt_1
12 Years a Slave
http://www.imdb.com/title/tt2024544/?ref_=fn_tt_tt_1
127 Hours
http://www.imdb.com/title/tt1542344/?ref_=fn_tt_tt_1
13 Going on 30
http://www.imdb.com/title/tt0337563/?ref_=fn_tt_tt_1
13 Hours: The Secret Soldiers of Benghazi
http://www.imdb.com/title/tt4172430/?ref_=fn_tt_tt_1
1408
http://www.imdb.com/title/tt0450385/?ref_=fn_tt_tt_1
15 Minutes
http:

**Save IMDb storyline to file**

In [5]:
import json

# Target file for the scraped storylines (keyed by movie id)
result_file_uri_story = "/Users/GretarAtli/Dropbox/SocialGraph/results/imdb-storyline.json"

# Serialise the whole dictionary to a JSON string and write it out in one go
with open(result_file_uri_story, 'w') as fp:
    fp.write(json.dumps(Imdb_5000_storyline))

# Download all manuscripts from imsdb.com

This is Python code that downloads (and cleans) all 
scripts on the website [IMSDb](http://www.imsdb.com).

This code was taken from [this GitHub repository](https://github.com/j2kun/imsdb_download_all_scripts) and adapted to our needs. 

In [None]:
import os
from urllib.parse import quote

from bs4 import BeautifulSoup
import requests

# Root of the IMSDb site; relative links found on its pages are appended to it
BASE_URL = 'http://www.imsdb.com'
# Directory (relative to the working directory) the script files are written to
SCRIPTS_DIR = 'scripts'


def clean_script(text):
    """Remove IMSDb boilerplate and carriage returns from a script's text.

    Parameters
    ----------
    text : str
        Raw text of a script as extracted from an IMSDb page.

    Returns
    -------
    str
        The text without carriage returns, without the 'Back to IMSDb' link
        text, without the frame-busting JavaScript snippet and without the
        'Scanned by / Formatting by' credit lines.
    """
    # Drop carriage returns first. The previous code replaced r'\r', i.e. a
    # literal backslash followed by 'r', which never matches a real carriage
    # return. Doing this before the other replacements also lets the
    # multi-line patterns below match text that uses CRLF line endings.
    text = text.replace('\r', '')
    text = text.replace('Back to IMSDb', '')
    text = text.replace('''<b><!--
</b>if (window!= top)
top.location.href=location.href
<b>// -->
</b>
''', '')
    text = text.replace('''          Scanned by http://freemoviescripts.com
          Formatting by http://simplyscripts.home.att.net
''', '')
    return text


def get_script(relative_link):
    """Fetch one movie script from IMSDb.

    Loads the movie's front page, follows the link found in its first centred
    paragraph to the script page and extracts the text of the first
    <td class="scrtext"> element.

    Parameters
    ----------
    relative_link : str
        Link to the movie's front page, relative to BASE_URL.

    Returns
    -------
    (title, script_text)
        title is the last path component of the script link, cut at
        ' Script'; script_text is the cleaned script. (None, None) is
        returned when the front page has no script link, the link does not
        end in '.html', or the script page contains no script text.
    """
    tail = relative_link.split('/')[-1]
    print('fetching %s' % tail)
    script_front_url = BASE_URL + quote(relative_link)
    front_page_response = requests.get(script_front_url)
    front_soup = BeautifulSoup(front_page_response.text, "html.parser")

    # The script link is the anchor of the first centred paragraph. A missing
    # paragraph, a paragraph without an anchor, or an anchor without an href
    # all mean "no script" instead of an uncaught exception.
    centred_paragraphs = front_soup.find_all('p', align="center")
    anchor = centred_paragraphs[0].a if centred_paragraphs else None
    if anchor is None or not anchor.get('href'):
        print('%s has no script :(' % tail)
        return None, None
    script_link = anchor['href']

    if not script_link.endswith('.html'):
        print('%s is a pdf :(' % tail)
        return None, None

    title = script_link.split('/')[-1].split(' Script')[0]
    script_url = BASE_URL + script_link
    script_soup = BeautifulSoup(requests.get(script_url).text, "html.parser")

    # Guard against script pages that lack the expected text cell.
    script_cells = script_soup.find_all('td', {'class': "scrtext"})
    if not script_cells:
        print('%s has no script :(' % tail)
        return None, None

    script_text = clean_script(script_cells[0].get_text())
    return title, script_text


# Download every script listed on IMSDb's "all scripts" page and store each
# one as a text file in SCRIPTS_DIR.
if __name__ == "__main__":
    response = requests.get('http://www.imsdb.com/all%20scripts/')
    html = response.text

    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all('p')

    # Make sure the output directory exists before the first write.
    os.makedirs(SCRIPTS_DIR, exist_ok=True)

    for p in paragraphs:
        # Paragraphs without a link cannot point to a script; skip them
        # instead of failing on p.a being None.
        if p.a is None or not p.a.get('href'):
            continue
        relative_link = p.a['href']
        title, script = get_script(relative_link)
        if not script:
            continue

        # Remove a trailing '.html' only. str.strip('.html') would strip any
        # of the characters '.', 'h', 't', 'm', 'l' from BOTH ends of the
        # title and so mangle names such as 'hamlet.html'.
        base_name = title[:-len('.html')] if title.endswith('.html') else title

        # Explicit encoding so scripts with non-ASCII characters can be
        # written regardless of the platform's default encoding.
        with open(os.path.join(SCRIPTS_DIR, base_name + '.txt'), 'w', encoding='utf-8') as outfile:
            outfile.write(script)
