In [11]:
#################################################
# Prepare data for baseline
#
# Source: IMSDb
# This script extracts and prepares screenplay text data ready for machine learning.
# - Scrapes screenplay text data from IMSDb
# - Data cleaning and text preprocessing
#   - Removing HTML Markup: The Beautiful Soup package
#   - Dealing with punctuation, numbers and stopwords
# - Put it back all together
#################################################

In [12]:
import csv
from BeautifulSoup import BeautifulSoup  # I'm using version 3.2.1 BeautifulSoup version 4 has different syntax
from collections import Counter
import urllib
import importlib
import numpy as np
import json
import os
import re
import sys
import math
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [13]:
class preprocessing:
    def __init__(self):
        self.y = []             # y labels: global box-office for each movie
        self.x = []             # x input: screenplay text
        self.title_list =[]     # look-up movie titles for each x value
        self.tmp_y = []
        self.tmp_title_list = []
        self.fail_screenplay = 0
        self.success_screenplay = 0
        self.y = []

    def create_label_data(self):
        with open("movie_data.csv", 'r') as f:
            reader = csv.reader(f, delimiter=',')
            for row in reader:
                self.tmp_title_list.append(row[0]) 
                self.tmp_y.append(row[1])
            print("Started with %f total movie lists" %len(self.tmp_title_list))
            #print("Created label data of %f y value" %len(self.tmp_y))

    def create_input_data(self):  # Go through the title list and scrape screenplay text
        for m in self.tmp_title_list:
            self.x.append(preprocessing.scrape_screenplay(self,m))
        # Filter out none values
        self.x = filter(lambda a: a != 0, self.x)
        #print("Scraped and saved: %f screenplay" %len(self.x))
        #print("Scraped and saved: %f titles" % len(self.title_list))
        #print("Titles:", self.title_list) # Check movie titles which screenplays aren't available
        print("Success movies: ", self.success_screenplay)
        print("Fail movies: ", self.fail_screenplay)

    def scrape_screenplay(self, movie_title):
        try:
            url = 'http://www.imsdb.com/scripts/%s.html' % movie_title
            page = urllib.urlopen(url)
            soup = BeautifulSoup(page.read())
            rawtext = str(soup.find("td", {"class": "scrtext"}))
            clean = rawtext.lower()
            clean = re.sub('<[^<]+?>','',clean)
            words = clean.split()
            if len(words) > 200:
                self.success_screenplay += 1
                #print(self.tmp_title_list.index(movie_title))
                #print(self.tmp_y[self.tmp_title_list.index(movie_title)])
                self.y.append(self.tmp_y[self.tmp_title_list.index(movie_title)]) # Find the value from the temp list
                self.title_list.append(movie_title)
                results = " ".join(words)
                return results
            else: 
                self.fail_screenplay += 1
                #print("%s movie screenplay is not available" %movie_title)
                return 0
        except:  # If movie URL is dead, skip it
            pass
        
    def hit_or_flop(self):
        # Defines hit or flop scale:
        #  5: blockbuster, 400M ~ above, ex. Finding Dory, Star Wars
        #  4: hit, 200M ~ 400M, ex. Ghostbusters, King Kong
        #  3: average, 100M ~ 200M, ex. Bridesmaid, Sleepless in Seattle
        #  2: flop, 50M ~ 100M, ex. Sin City, Splash
        #  1: disaster, ~50M, ex. who cares...
        #self.y = self.y[:-1] #BUG: I don't know why there is extra here.
        for idx,b in enumerate(self.y):
            if self.y[idx] > '400000000':
                self.y[idx] = 5
            elif self.y[idx] > '200000000' and self.y[idx] < '400000000':
                self.y[idx] = 4
            elif self.y[idx] > '100000000' and self.y[idx] < '200000000' :
                self.y[idx] = 3
            elif self.y[idx] > '50000000' and self.y[idx] < '100000000':
                self.y[idx] = 2
            elif self.y[idx] < '50000000' :
                self.y[idx] = 1
        #print(self.y) # Check y values are changed.

    def split_train_test(self,train_portion):
        self.train_set_x = self.x[:train_portion]
        self.test_set_x = self.x[train_portion:]
        self.train_set_y = self.y[:train_portion]
        self.test_set_y = self.y[train_portion:]
        print("Train set %f" %len(self.train_set_x))
        print("Test set %f" % len(self.test_set_x))

    def feature_extraction(self):
        vectorizer = CountVectorizer(analyzer= 'word',
                                     tokenizer= None,
                                     preprocessor= None,
                                     stop_words= 'english',
                                     max_features= 500)

        self.train_data_features = vectorizer.fit_transform(self.train_set_x)
        self.train_data_features = self.train_data_features.toarray()
        
        self.test_data_features = vectorizer.fit_transform(self.test_set_x)
        self.test_data_features = self.test_data_features.toarray()
        print("Training data array looks like this:", self.train_data_features.shape)
        print("Test data array looks like this:", self.test_data_features.shape)

    def train_randomForest(self):
        # Initialize RF with 100 trees
        forest = RandomForestClassifier(n_estimators=100)

        # Fit the forest to the training set, using the bag of words as features
        # and box office labels as the response variable

        self.forest = forest.fit(self.train_data_features, self.train_set_y)
        print("Finished training the data")

    def make_prediction(self): 
        self.result = self.forest.predict(self.test_data_features)
        self.output = pd.DataFrame(data={"box-office": self.result})
        print(self.output)
    
    def score(self):
        self.pscore = metrics.accuracy_score(self.test_set_y, self.result)
        #self.pscore_train = metrics.accuracy_score(y_train, pred_train)
        print(self.pscore)

In [14]:
job = preprocessing()
job.create_label_data()
job.create_input_data()

Started with 1513.000000 total movie lists
('Success movies: ', 195)
('Fail movies: ', 1318)


In [15]:
job.hit_or_flop()

In [16]:
# Split train and test set. Argument is the count of movie titles for train dataset.
job.split_train_test(150)

Train set 150.000000
Test set 45.000000


In [17]:
job.feature_extraction()

('Training data array looks like this:', (150, 500))
('Test data array looks like this:', (45, 500))


In [18]:
job.train_randomForest()

Finished training the data


In [19]:
job.make_prediction()

    box-office
0            5
1            5
2            5
3            5
4            5
5            5
6            5
7            5
8            5
9            4
10           5
11           5
12           4
13           5
14           5
15           5
16           5
17           5
18           5
19           3
20           5
21           5
22           5
23           5
24           5
25           5
26           5
27           5
28           5
29           5
30           5
31           5
32           5
33           5
34           5
35           5
36           5
37           5
38           5
39           5
40           5
41           4
42           5
43           5
44           3


In [20]:
job.score()

0.333333333333


In [21]:
job.title_list[:45]

['Bean',
 'American-Sniper',
 'Gladiator',
 'Extract',
 'Colombiana',
 'Argo',
 'Django-Unchained',
 'Limitless',
 'Life-of-Pi',
 'Funny-People',
 'Dragonslayer',
 'Despicable-Me-2',
 'Deception',
 'Buried',
 'Big-Fish',
 'Crank',
 'Cliffhanger',
 'Collateral-Damage',
 'In-the-Bedroom',
 'Lake-Placid',
 'Gamer',
 'Insidious',
 'Bad-Boys',
 'Dallas-Buyers-Club',
 'E.T.',
 'Larry-Crowne',
 'Field-of-Dreams',
 "Malibu's-Most-Wanted",
 'Beasts-of-the-Southern-Wild',
 'Collateral',
 'Lincoln',
 'Dumb-and-Dumber',
 'Hostage',
 'Burn-After-Reading',
 'Disturbia',
 'Go',
 'King-Kong',
 'Finding-Nemo',
 'Dogma',
 'Backdraft',
 'Enough',
 'Inception',
 '2001-A-Space-Odyssey',
 'Easy-A',
 'Liar-Liar']