# IMDBY movie analysis
Predicting IMDB Movie Ratings: Using social media data and open source tools 
W205 - Greg, Talieh, Apekshit

#### First, connect to Hive and get the sentiment analysis data

In [5]:
import pyhs2
import pandas as pd
import statsmodels as sm
import imdb
import statsmodels.formula.api as smf
import re

# Pull title, production year and sentiment ratio for every movie out of Hive.
# NOTE(review): the host address and user are hard-coded here; consider reading
# them from configuration or environment variables before sharing the notebook.
with pyhs2.connect(host='52.23.237.143',
                   port=10000,
                   authMechanism="PLAIN",
                   user='root',
                   database='default') as conn:
    with conn.cursor() as cur:
        # Every column is aliased explicitly -- presumably so the reported
        # schema carries plain column names; confirm against the Hive setup.
        cur.execute("select title as title"
                    ", production_year as production_year"
                    ", se_ratio as se_ratio"
                    " from cp_finalresults_temp")

        # Column metadata and rows of the query that just ran
        schema = cur.getSchema()
        records = cur.fetchall()

        # Look the column name up by key: taking the second entry of
        # dict.values() depends on dict ordering and breaks on Python 3.
        cols = [col['columnName'] for col in schema]

## Create Dataframe
movies_temp = pd.DataFrame(records, columns=cols)
movies_temp

Unnamed: 0,title,production_year,se_ratio
0,45 Years,2015,0.638695
1,Amy,2015,0.333125
2,Anomalisa,2015,0.648936
3,Beasts of No Nation,2015,0.439338
4,Breathe,2015,0.2976
5,Bridge of Spies,2015,0.472897
6,Brooklyn,2015,0.160243
7,Carol,2015,0.577378
8,Cartel Land,2015,0.622642
9,Cinderella,2015,0.548825


## Gather current `movie_attributes` using IMDbPY

In [6]:
# Import the imdb package.
import imdb

# Create the object that will be used to access the IMDb's database.
ia = imdb.IMDb()  # by default access the web.

# One small frame per movie. They are concatenated once after the loop instead
# of appending to (and re-copying) a growing DataFrame on every iteration.
attribute_frames = []

for index, row in movies_temp.iterrows():
    s_result = ia.search_movie(row['title'])
    if not s_result:
        # No search hit for this title: skip it instead of failing on
        # s_result[0]. The title then simply has no attribute row.
        continue

    # NOTE: the first hit is taken without checking that it is the intended
    # film, so join_title and imdb_title can differ for some rows.
    movie = s_result[0]
    # Fetch the full record; the search hit alone lacks most fields used below
    # (presumably -- confirm against the IMDbPY version in use).
    ia.update(movie)

    join_title = pd.Series(row['title'])
    imdb_title = pd.Series(movie['title'])
    ## get first element in list
    genre = pd.Series((movie['genre'][0]))
    rating = pd.Series(movie['rating'])
    votes = pd.Series(movie['votes'])
    runtime = pd.Series(movie['runtime'])
    language = pd.Series(movie['language'])
    # The person's name is parsed out of the "Name: ..." line of summary()
    director = re.search(r"Name:\s(.*)", movie['director'][0].summary()).group(1)
    producer = re.search(r"Name:\s(.*)", movie['produced by'][0].summary()).group(1)

    temp = pd.DataFrame({'join_title': join_title,
                         'imdb_title': imdb_title,
                         'imdb_rating': rating,
                         'genre': genre,
                         'imdb_votes': votes,
                         'language': language,
                         'runtime': runtime,
                         'director': director,
                         'producer': producer})

    attribute_frames.append(temp)

# Every per-movie frame keeps its own index starting at 0, so label 0 occurs
# once per movie in the combined frame.
movie_attributes = pd.concat(attribute_frames) if attribute_frames else pd.DataFrame()

# Selecting label 0 therefore returns the first row collected for each movie.
movie_attributes.loc[0]

Unnamed: 0,director,genre,imdb_rating,imdb_title,imdb_votes,join_title,language,producer,runtime
0,"Haigh, Andrew",Drama,7.5,45 Years,2110,45 Years,English,"Dargavel, Rachel",95
0,"Kapadia, Asif",Documentary,7.9,Amy,13450,Amy,English,"Bell, Paul",128
0,"Johnson, Duke",Animation,8.1,Anomalisa,1090,Anomalisa,English,"Calder, Keith",90
0,"Fukunaga, Cary Joji",Drama,7.9,Beasts of No Nation,20128,Beasts of No Nation,English,"Asomugha, Nnamdi",137
0,"Passon, Stacie",Drama,5.7,Concussion,2603,Breathe,English,"Chenfeld, Cliff",96
0,"Spielberg, Steven",Biography,7.9,Bridge of Spies,34777,Bridge of Spies,English,"Fisser, Christoph",141
0,"Crowley, John",Drama,8.1,Brooklyn,5057,Brooklyn,English,"Dwyer, Finola",111
0,"Haynes, Todd",Drama,7.8,Carol,4743,Carol,English,"Berwin, Dorothy",118
0,"Heineman, Matthew",Documentary,7.4,Cartel Land,2755,Cartel Land,English,"Bigelow, Kathryn",100
0,"Branagh, Kenneth",Drama,7.1,Cinderella,82026,Cinderella,English,"Barron, David",105


## Create Datasets for Analysis


In [7]:
## Merge Everything Together
# movie_attributes carries index label 0 once per movie, so .loc[0] picks one
# attribute row per movie. The default inner join drops titles without a match.
cp_results = pd.merge(left=movies_temp,
                      right=movie_attributes.loc[0],
                      left_on='title',
                      right_on='join_title')

#cleanup runtime
# Keep only the digits of each runtime string. findall/join always applies the
# pattern as a regular expression, whereas str.replace depends on the pandas
# version's default for its `regex` argument.
cp_results['runtime'] = (cp_results['runtime']
                         .str.findall(r'[0-9]+')
                         .str.join('')
                         .astype(float))

## Create a training Set
# Fixed random_state keeps the 50/50 split reproducible between runs
train = cp_results.sample(frac=0.5, random_state=1)

##Create a test set
# Everything that was not sampled into the training set
test = cp_results.loc[~cp_results.index.isin(train.index)]

# The two parts must partition the merged data
assert len(train) + len(test) == len(cp_results)


## Fit Model

In [8]:
# a utility function to only show the coeff section of summary
from IPython.core.display import HTML
def short_summary(est):
    """Render only the coefficient table of a fitted model's summary as HTML.

    `est` must provide `summary()`; the table at position 1 of its `tables`
    is converted with `as_html()` and wrapped in an IPython `HTML` object.
    """
    coefficient_table = est.summary().tables[1]
    return HTML(coefficient_table.as_html())

# Ordinary least squares on the training half: IMDb rating explained by the
# sentiment ratio, vote count and runtime, with language and genre entered as
# categorical terms.
ols_model = smf.ols(
    data=train,
    formula="imdb_rating ~ se_ratio + imdb_votes + runtime + C(language) + C(genre)",
)
est = ols_model.fit()
short_summary(est)

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,4.6203,1.219,3.789,0.005,1.808 7.432
C(language)[T.French],0.7670,0.818,0.937,0.376,-1.120 2.654
C(language)[T.German],0.5805,0.740,0.784,0.456,-1.127 2.288
C(language)[T.Russian Sign Language],0.1701,0.460,0.370,0.721,-0.890 1.231
C(genre)[T.Adventure],0.5811,0.963,0.604,0.563,-1.639 2.802
C(genre)[T.Animation],1.3111,0.694,1.890,0.095,-0.288 2.910
C(genre)[T.Biography],0.5677,0.659,0.862,0.414,-0.952 2.087
C(genre)[T.Comedy],0.1687,0.708,0.238,0.818,-1.463 1.800
C(genre)[T.Crime],0.1701,0.460,0.370,0.721,-0.890 1.231


In [9]:
# Show the fitted coefficient of every model term (intercept, one entry per
# categorical level, and the numeric predictors).
est.params

Intercept                               4.620270
C(language)[T.French]                   0.767006
C(language)[T.German]                   0.580477
C(language)[T.Russian Sign Language]    0.170130
C(genre)[T.Adventure]                   0.581142
C(genre)[T.Animation]                   1.311068
C(genre)[T.Biography]                   0.567684
C(genre)[T.Comedy]                      0.168744
C(genre)[T.Crime]                       0.170130
C(genre)[T.Documentary]                 1.154071
C(genre)[T.Drama]                       0.234517
C(genre)[T.Horror]                     -0.215739
C(genre)[T.Mystery]                     0.373197
se_ratio                                1.053653
imdb_votes                              0.000002
runtime                                 0.016193
dtype: float64

In [24]:
# Score the held-out rows with the fitted model, passing the same predictor
# columns that appear in the formula.
predictor_columns = ['se_ratio', 'imdb_votes', 'runtime', 'language', 'genre']
predicted_values = est.predict(test[predictor_columns])

## wrap the predictions in a frame (default 0..n-1 index) to join back to test
predicted_imdb = pd.DataFrame({'predicted_imdb': predicted_values.tolist()})

# Move test's index into a column so its rows are numbered 0..n-1 as well.
# NOTE: this rebinds `test`; running the cell again adds a further
# index-derived column to it.
test = test.reset_index()

# join test and its predictions row by row, stored as cp_finalresults
cp_finalresults = test.merge(predicted_imdb, left_index=True, right_index=True)


In [84]:
# Join the predictions onto the (already re-indexed) test rows.
cp_finalresults = pd.merge(left=test, right=predicted_imdb, left_index=True, right_index=True)

# Remove the helper column(s) that reset_index() added to `test`, by name.
# A positional column slice keeps the wrong columns whenever the number of
# leading helper columns changes, which depends on how often `test` was reset.
index_helper_columns = ('index', 'level_0')
cp_finalresults = cp_finalresults[[col for col in cp_finalresults.columns
                                   if col not in index_helper_columns]]
cp_finalresults

Unnamed: 0,title,production_year,se_ratio,director,genre,imdb_rating,imdb_title,imdb_votes,join_title,language,producer,runtime,predicted_imdb
0,45 Years,2015,0.638695,"Haigh, Andrew",Drama,7.5,45 Years,2110,45 Years,English,"Dargavel, Rachel",95,7.070011
1,Amy,2015,0.333125,"Kapadia, Asif",Documentary,7.9,Amy,13450,Amy,English,"Bell, Paul",128,8.223123
2,Bridge of Spies,2015,0.472897,"Spielberg, Steven",Biography,7.9,Bridge of Spies,34777,Bridge of Spies,English,"Fisser, Christoph",141,8.034305
3,Brooklyn,2015,0.160243,"Crowley, John",Drama,8.1,Brooklyn,5057,Brooklyn,English,"Dwyer, Finola",111,6.830473
4,Carol,2015,0.577378,"Haynes, Todd",Drama,7.8,Carol,4743,Carol,English,"Berwin, Dorothy",118,7.382753
5,Cartel Land,2015,0.622642,"Heineman, Matthew",Documentary,7.4,Cartel Land,2755,Cartel Land,English,"Bigelow, Kathryn",100,8.054818
6,Cinderella,2015,0.548825,"Branagh, Kenneth",Drama,7.1,Cinderella,82026,Cinderella,English,"Barron, David",105,7.286349
7,Creed,2015,0.189662,"Coogler, Ryan",Drama,8.6,Creed,18205,Creed,English,"Chartoff, Robert",133,7.242245
8,Dope,2015,0.201,"Famuyiwa, Rick",Comedy,7.3,Dope,29227,Dope,English,"Bongiovi, Nina Yang",103,6.723195
9,Ex Machina,2015,0.356215,"Garland, Alex",Drama,7.7,Ex Machina,198351,Ex Machina,English,"Bush, Eli",108,7.349012


## Write Data Back to Hive (currently a local CSV export only)

In [85]:
import pydoop.hdfs as hdfs

# Only the local CSV export is active in this cell. The HDFS upload and the
# SQLAlchemy/PyHive attempts below are disabled and kept for reference.
final_csv_path = "/root/persist/data/imdb_csvs/cp_finalresults.csv"
cp_finalresults.to_csv(final_csv_path, index=False, header=True, na_rep='', sep=',')

# --- disabled: copy the CSV into HDFS ---
#hdfs.mkdir("/root/persist/data/cp_imdb_db/final_results/")
#hdfs.put("/root/persist/data/imdb_csvs/cp_finalresults.csv","/root/persist/data/cp_imdb_db/final_results/")

# --- disabled: export of the full merged data set ---
#cp_results.to_csv("/root/persist/data/imdb_csvs/cp_results.csv", sep=',', na_rep='', header=True, index=False)

# --- disabled: write the frame straight to a table through SQLAlchemy ---
#from pyhive import presto
#from pyhive import hive
#from sqlalchemy import *
#from sqlalchemy.engine import create_engine
#from sqlalchemy.schema import *
#conn = hive.Connection(host="52.23.237.143", port=10000, username="root", database="default")
# two engine URLs were tried:
#engine = create_engine('presto://52.23.237.143:10000/hive/default')
#engine = create_engine('presto://localhost:10000/hive/default')
#cp_finalresults.to_sql('cp_finalresults', engine, if_exists='replace', index=False)