# Project Description

The goal of this project is to find out whether textual information can be a sole factor to distinguish one screenplay from another. The first part of this project will cover computing similarities between different movies based on text. The second part of this experiment will cover a simple Random Forest model to predict a movie's box-office revenue based on text.

#### Table of Contents
1. Prepare Dataset
2. Compute Similarities using LSA, tf-idf, and cosine similarity
3. Predict movie revenue using BOW and Random Forest classifier

### 1. Prepare Dataset
- Scrape and join dataset
- Standard preprocessing
- Additional preprocessing

In [1]:
# Import
import csv
from BeautifulSoup import BeautifulSoup
from collections import Counter
import urllib
import importlib
import numpy as np
import json
import os
import re
import sys
import math
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from string import digits
import copy
import nltk
from gensim import corpora, models, similarities
import gensim
import itertools
import math
from textblob import TextBlob as tb
from nltk.corpus import stopwords
import copy
import csv
from nltk.stem import SnowballStemmer
import matplotlib.pyplot as plt

#### 1-1. Scraping and saving into csv file

For the purpose of this study, movie meta data is already scraped and saved using publically available scraping python codes
(For example, https://github.com/skozilla/BoxOfficeMojo/tree/master/boxofficemojoAPI). 

From a set of movie titles and revenue ("movie_data_625.csv"), the following scrapes corresponding movie scrips. Run them if you want to gather it yourself, but saved files are provided in the folder so you may want to just read in those csv files.

In [3]:
class upload_dataset:
    def __init__(self):
        self.upload_revenue = []
        self.upload_title_list = []

    def upload_movieList(self):
        #with open("movie_data_romance.csv", 'r') as f:
        with open("movie_data_625.csv", 'r') as f:
            reader = csv.reader(f, delimiter=',')
            for row in reader:
                self.upload_title_list.append(row[0]) 
                self.upload_revenue.append(row[1])
            print("Uploaded %s total movie lists" %len(self.upload_title_list))
            #print("Uploaded %s Box Office revenue" %len(self.upload_revenue))
            return self.upload_title_list, self.upload_revenue

In [7]:
class scrape_data:
    def __init__(self,upload_title_list,upload_revenue):
        self.upload_title_list = upload_title_list
        self.upload_revenue = upload_revenue
        self.y = []             # y labels: global box-office for each movie
        self.x = []             # x input: screenplay text
        self.title_list =[]     # look-up movie titles for each x value
        self.fail_screenplay = 0
        self.success_screenplay = 0
        self.y = []
        
    def input_data_save2file(self):  # Go through the title list and scrape screenplay text
        for m in self.upload_title_list:
            self.x.append(scrape_data.scrape_screenplay(self,m))
            self.y.append(self.upload_revenue[self.upload_title_list.index(m)])
            self.title_list.append(m)
        
        # Filter out none values
        for index, item in enumerate(self.x):
            if item == 0:
                self.y[index] = 0
                self.title_list[index] = 0

        self.x = filter(lambda a: a != 0, self.x)
        #print(self.x)
        #print(type(self.x))
        self.y = filter(lambda a: a != 0, self.y)
        #print(self.y)
        #print(type(self.y))
        self.title_list = filter(lambda a: a!= 0, self.title_list)
        #print(self.title_list)
        #print(type(self.title_list))

        # Test:
        with open('625_x.csv', 'wb') as f:
            writer = csv.writer(f, delimiter = ',')
            writer.writerow(self.x)

        with open('625_y.csv', 'wb') as csv_file:
            writer = csv.writer(csv_file, delimiter = ',')
            writer.writerow(self.y)
        
        with open('625_title.csv', 'wb') as f:
            writer = csv.writer(f, delimiter = ',')
            writer.writerow(self.title_list)
        
        # Entire set
        print('Number of success movies: ', self.success_screenplay)
        print('Number of fail movies: ', self.fail_screenplay)
        #return self.x, self.y, self.title_list

    def scrape_screenplay(self, movie_title):
        try:
            url = 'http://www.imsdb.com/scripts/%s.html' % movie_title
            page = urllib.urlopen(url)
            soup = BeautifulSoup(page.read())
            rawtext = str(soup.find("td", {"class": "scrtext"}))
            clean = re.sub('<[^<]+?>','',rawtext)
            clean = re.sub('[^a-zA-Z0-9 \n\.]', '', clean)
            clean = re.sub('&nbsp','',clean)
            words = clean.split()
            if len(words) > 200:
                self.success_screenplay += 1
                results = " ".join(words) # Back into string separated by space
                return results
            else: 
                self.fail_screenplay += 1
                #print("%s movie screenplay is not available" %movie_title)
                return 0
        except:  # If movie URL is dead, skip it
            pass

Uncomment the following if you want to gather dataset yourself. However, the data is already downloaded and provided as csv files.

In [8]:
#upload = upload_dataset()
#upload_title_list, upload_revenue = upload.upload_movieList()
#step_one = scrape_data(upload_title_list,upload_revenue)
#step_one.input_data_save2file()

('Number of success movies: ', 436)
('Number of fail movies: ', 188)


Otherwise, please read in the folloiwng csv files.

In [2]:
# Load y-value (movie revenue)

with open('625_y.csv', 'rb') as f:
    reader = csv.reader(f, delimiter = ',')
    temp_y = list(reader)

y_boxoffice = []
for sublist in temp_y:
    for val in sublist:
        y_boxoffice.append(val)
print(y_boxoffice[1])
print(len(y_boxoffice)) 

187733202
437


In [3]:
# Load x-value (movie text)
csv.field_size_limit(sys.maxsize)

with open('625_x.csv', 'rb') as f:
    reader = csv.reader(f, delimiter = ',')
    temp_x = list(reader)

x_scrtext = []
for sublist in temp_x:
    for val in sublist:
        x_scrtext.append(val)
#print(len(x_scrtext))

131072

In [5]:
# Load titles for reference.

with open('625_title.csv', 'rb') as f:
    reader = csv.reader(f, delimiter = ',')
    temp_title = list(reader)

movie_title = []
for sublist in temp_title:
    for val in sublist:
        movie_title.append(val)
print(movie_title[0])
print(len(movie_title))

500-Days-of-Summer
437


#### 1-2. Preprocessing (Standard + Additional)

In [8]:
def preprocessing_removeName(movie_text):
    movie_text_1 = []
    progress = 0
    for movie in movie_text:
        movie_split = movie.split()
        #print('Originally this many words: ',len(movie_split))
        for single_word in movie_split:
            try: 
                for w in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(single_word))):
                    if w.label() == 'PERSON':
                        movie_split.remove(single_word)
            except:
                continue
        #print('After moving names:', len(movie_split))
        movie_text_1.append(movie_split)
        progress += 1
        if progress%10 == 0:
            print('This many done: ', progress)
    #print(len(movie_text_1))
    #print(type(movie_text_1))
    return movie_text_1

In [9]:
x_scrtext = preprocessing_removeName(x_scrtext)

('This many done: ', 10)
('This many done: ', 20)
('This many done: ', 30)
('This many done: ', 40)
('This many done: ', 50)
('This many done: ', 60)
('This many done: ', 70)
('This many done: ', 80)
('This many done: ', 90)
('This many done: ', 100)
('This many done: ', 110)
('This many done: ', 120)
('This many done: ', 130)
('This many done: ', 140)
('This many done: ', 150)
('This many done: ', 160)
('This many done: ', 170)
('This many done: ', 180)
('This many done: ', 190)
('This many done: ', 200)
('This many done: ', 210)
('This many done: ', 220)
('This many done: ', 230)
('This many done: ', 240)
('This many done: ', 250)
('This many done: ', 260)
('This many done: ', 270)
('This many done: ', 280)
('This many done: ', 290)
('This many done: ', 300)
('This many done: ', 310)
('This many done: ', 320)
('This many done: ', 330)
('This many done: ', 340)
('This many done: ', 350)
('This many done: ', 360)
('This many done: ', 370)
('This many done: ', 380)
('This many done: ', 

In [10]:
# The above process took a while, so better save it and call it later. 
with open('625_x_removeName.csv', 'wb') as f:
            writer = csv.writer(f, delimiter = ',')
            writer.writerow(x_scrtext)

In [15]:
#print(x_scrtext[3])

In [6]:
# THEN OPEN IT BACK ... 
with open('625_x_removeName.csv', 'rb') as f:
    reader = csv.reader(f, delimiter = ',')
    temp_x = list(reader)

x_scrtext_sample = []
for sublist in temp_x:
    for val in sublist:
        x_scrtext_sample.append(sublist)
print(len(x_scrtext_sample))

437


In [7]:
# Make it flat. for some reason...
flat_x_scrtext = x_scrtext_sample[0]
#print(flat_x_scrtext[1])

In [8]:
MovieTransform1 = []
for movie in flat_x_scrtext:
    new = movie.replace(',', '')
    new1 = new.replace('.', '')
    new2 = new1.replace("'", "")
    new3 = new2.replace("[", "")
    new4 = new3.replace("]", "")
    new5 = new4.split()
    MovieTransform1.append(new5)

In [9]:
print(MovieTransform1[0][3])

SUMMER


In [10]:
def preprocessing_removeScene(movie_text):
    movie_text2 = []
    progress = 0
    for movie in movie_text:
        #print(type(movie))
        #print('Originally this many words:', len(movie))
        for w in movie:
            #print(w)
            if w.isupper():
                movie.remove(w)
        #print('After removing scene direcitions:', len(movie))
        movie_text2.append(movie)
        progress += 1
        if progress%50 == 0:
            print('This many done: ', progress)
    #print(len(movie_text2))
    #print(type(movie_text2))
    return(movie_text2)

In [11]:
x_scrtext_2 = preprocessing_removeScene(MovieTransform1)

('This many done: ', 50)
('This many done: ', 100)
('This many done: ', 150)
('This many done: ', 200)
('This many done: ', 250)
('This many done: ', 300)
('This many done: ', 350)
('This many done: ', 400)


In [43]:
print(x_scrtext_2[436])

['DARK', 'Written', 'by', 'Boal', 'October', '3rd', '2011', 'EMERGE', 'We', 'hear', 'the', 'actual', 'recorded', 'emergency', 'calls', 'made', 'by', 'World', 'Trade', 'office', 'workers', 'to', 'police', 'and', 'fire', 'departments', 'after', 'the', 'planes', 'struck', 'on', '911', 'just', 'before', 'the', 'buildings', 'collapsed', '11', '2001', 'We', 'listen', 'to', 'fragments', 'from', 'a', 'number', 'of', 'these', 'callsstarting', 'with', 'pleas', 'for', 'help', 'building', 'to', 'a', 'panic', 'ending', 'with', 'the', 'callers', 'grim', 'acceptance', 'that', 'help', 'will', 'not', 'arrive', 'that', 'the', 'situation', 'is', 'hopeless', 'that', 'they', 'are', 'about', 'to', 'die', 'YEARS', 'own', 'you', 'Ammar', 'You', 'belong', 'to', 'me', 'Look', 'at', 'me', 'This', 'is', 'STANTON', 'the', 'CIAs', 'man', 'in', 'Islamabad', 'a', 'big', 'American', 'late', '30s', 'with', 'a', 'long', 'anarchical', 'beard', 'snaking', 'down', 'to', 'his', 'tattooed', 'neck', 'He', 'looks', 'like', 'a'

In [12]:
# do it one more time
x_scrtext_3 = preprocessing_removeScene(x_scrtext_2)

('This many done: ', 50)
('This many done: ', 100)
('This many done: ', 150)
('This many done: ', 200)
('This many done: ', 250)
('This many done: ', 300)
('This many done: ', 350)
('This many done: ', 400)


In [13]:
print(len(x_scrtext_2[436]))
print(len(x_scrtext_3[436]))

18019
18019


In [14]:
stop_words = set(stopwords.words('english'))

In [15]:
def preprocessing_stemmer(movie_text):
    movie_text4 = []
    stemmer = PorterStemmer()
    for movie in movie_text:
        tmp_movie = []
        for w in movie:
            w = w.lower()
            if w not in stop_words:
                tmp_movie.append(stemmer.stem(w))
        movie_text4.append(tmp_movie)
    print(len(movie_text4))
    print(type(movie_text4))
    return movie_text4

In [16]:
x_scrtext_4 = preprocessing_stemmer(x_scrtext_3)

437
<type 'list'>


In [17]:
print(len(x_scrtext_3[436]))
print(len(x_scrtext_4[436]))

18019
10035


In [5]:
#print(x_scrtext_4[436])

### 2. Compute Similarities
- Create the dictionary and corpus
- Apply LSA methodology

In [51]:
# Create the dictionary
dictionary = corpora.Dictionary(x_scrtext_4)

In [52]:
list(itertools.islice(dictionary.token2id.items(), 0, 10))

[(u'fawn', 5331),
 (u'circuitri', 20598),
 (u'mmmpossibl', 79999),
 (u'schlegel', 9287),
 (u'sonji', 12699),
 (u'auggi', 83467),
 (u'mustachio', 43063),
 (u'woodi', 17913),
 (u'grandkid', 54710),
 (u'alrahman', 85306)]

In [54]:
# Creating corpus
corpus = [dictionary.doc2bow(text) for text in x_scrtext_4]

In [55]:
print(corpus[0][:10])

[(0, 1), (1, 8), (2, 1), (3, 11), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 11)]


In [56]:
dictionary[24]

u'enjoy'

In [21]:
# Apply tf-idf transformation

In [57]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [58]:
print(corpus_tfidf[0][:10])

[(0, 0.00023454463167326504), (1, 0.0018155088528265197), (2, 0.014476316160116187), (3, 0.007628791419793525), (4, 0.008366429014692997), (5, 0.0016735061875926688), (6, 0.011650254164639015), (7, 0.013497337656869264), (8, 0.010200649417867285), (9, 0.0007060137287942953)]


In [65]:
# Creating LSA Model:
lsi = models.LsiModel(corpus_tfidf, id2word = dictionary, num_topics = 30)
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

# Make it into dataframe
movie = pd.DataFrame(
    {'title': movie_title,
     'boxoffice': y_boxoffice,
     'text': x_scrtext_4
    })

In [85]:
# Add similarity score

movie['similarity'] = 'unknown'
movie['size_similar'] = 0
total_sims = [] # storage of all similarity vectors to analysis
threshold = 0.2
for i, doc in enumerate(corpus_tfidf):
    vec_lsi = lsi[doc] # convert the vector to LSI space
    sims = index[vec_lsi] # perform a similarity vector against the corpus
    total_sims = np.concatenate([total_sims, sims])
    similarity = [] # Create a list with movie_id and similarity value
    for j, x in enumerate(movie.title):
        if sims[j] > threshold:
            similarity.append((x, sims[j]))
    similarity = sorted(similarity, key=lambda item: -item[1])
    movie = movie.set_value(i, 'similarity', similarity)
    movie = movie.set_value(i, 'size_similar', len(similarity))

In [86]:
db_similarity = movie[['title', 'similarity']]
db_similarity.to_csv('625_similarity.csv', sep = '|') #Store

### 3. Predict Movie Revenue
- Transform movie revenue into 3 scale (3 being hit, 1 being flop)
- Create BOW
- Train RF and predict movie revenue

In [18]:
def hit_or_flop(boxoffice):
    boxoffice_range = copy.copy(boxoffice)
    for idx,b in enumerate(boxoffice_range):
        if boxoffice_range[idx] > '300000000':
            boxoffice_range[idx] = 3
        elif boxoffice_range[idx] > '100000000' and boxoffice_range[idx] < '300000000':
            boxoffice_range[idx] = 2
        elif boxoffice_range[idx] < '100000000':
            boxoffice_range[idx] = 1
    return boxoffice_range

In [19]:
boxoffice_range = hit_or_flop(y_boxoffice)

In [20]:
#print(boxoffice_range)

In [21]:
class train_and_test:
    def __init__(self,scrtext, boxoffice_range, movieList):
        self.x = scrtext
        self.y = boxoffice_range
        self.movieList = movieList
        
    def split_train_test(self,train_portion):
        self.train_set_x = self.x[:train_portion]
        self.test_set_x = self.x[train_portion:]
        self.train_set_y = self.y[:train_portion]
        self.test_set_y = self.y[train_portion:]
        self.train_movieList = self.movieList[:train_portion]
        self.test_movieList = self.movieList[train_portion:]
        print("Train set %f" %len(self.train_set_x))
        print("Test set %f" % len(self.test_set_x))

    def feature_extraction(self):
        vectorizer = CountVectorizer(analyzer= 'word',
                                     tokenizer= None,
                                     preprocessor= None,
                                     stop_words= 'english',
                                     max_features= 700)

        self.train_data_features = vectorizer.fit_transform(self.train_set_x)
        self.train_data_features = self.train_data_features.toarray()
        
        self.test_data_features = vectorizer.fit_transform(self.test_set_x)
        self.test_data_features = self.test_data_features.toarray()
        print("Training data array looks like this:", self.train_data_features.shape)
        print("Test data array looks like this:", self.test_data_features.shape)

    def train_randomForest(self):
        # Initialize RF with 100 trees
        forest = RandomForestClassifier(n_estimators=300)

        # Fit the forest to the training set, using the bag of words as features
        # and box office labels as the response variable
        self.forest = forest.fit(self.train_data_features, self.train_set_y)
        print("Finished training the data")

    def make_prediction(self): 
        self.result = self.forest.predict(self.test_data_features)
        self.output = pd.DataFrame(data={"box-office": self.result})
        print(self.output)
    
    def score(self):
        self.pscore = metrics.accuracy_score(self.test_set_y, self.result)
        #self.pscore_train = metrics.accuracy_score(y_train, pred_train)
        print(self.pscore)

In [22]:
movie_text = []
for movie in x_scrtext_4:
    results = " ".join([w for w in movie])
    #print(len(results))
    movie_text.append(results)

In [23]:
print(len(movie_text))

437


In [24]:
job = train_and_test(movie_text, boxoffice_range, movie_title)
job.split_train_test(300)
job.feature_extraction()

Train set 300.000000
Test set 137.000000
('Training data array looks like this:', (300, 700))
('Test data array looks like this:', (137, 700))


In [25]:
job.train_randomForest()

Finished training the data


In [26]:
job.make_prediction()

     box-office
0             2
1             3
2             2
3             2
4             3
5             3
6             2
7             3
8             3
9             2
10            3
11            2
12            2
13            3
14            3
15            2
16            3
17            3
18            2
19            3
20            2
21            2
22            2
23            3
24            3
25            3
26            2
27            3
28            2
29            2
..          ...
107           2
108           2
109           2
110           2
111           3
112           3
113           2
114           3
115           2
116           3
117           2
118           2
119           3
120           3
121           3
122           2
123           3
124           2
125           3
126           3
127           2
128           3
129           3
130           3
131           3
132           2
133           3
134           3
135           3
136           2

[137 ro

In [27]:
job.score()

0.583941605839
