In [1]:
from pandas import *
import numpy as np
import matplotlib.pyplot as mplt
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
import datetime
from decimal import Decimal

Get the data:

In [2]:
def get_data(name='movie_wiki_twitter_by_days_to_release',
             data_dir='/Users/kafka/PycharmProjects/PortentIO'):
    """Load one of the project's CSV files into a DataFrame.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension.
    data_dir : str
        Directory holding the CSV. The default is the absolute path that
        was previously hard-coded; pass another directory to run the
        notebook on a different machine.

    Returns
    -------
    DataFrame
        The parsed contents of ``<data_dir>/<name>.csv``.
    """
    import os

    csv_path = os.path.join(data_dir, '%s.csv' % name)
    return read_csv(csv_path)

In [3]:
# Load the raw table and list the columns it provides.
data = get_data()
print(list(data.columns))

['imdb_movie_id', 'number_of_views', 'days_to_release', 'imdb_id', 'title', 'year', 'day_of_year', 'weekend_num', 'lead_actor', 'imdb_rating', 'genre', 'budget', 'theatres', 'opening_we_bor', 'twitter_volume', 'date']


The data has redundant rows due to IMDb view and time stamps, so we take out these redundancies.

In [4]:
def group_data(data):
    """Build a one-row-per-movie table and a per-movie grouping of all rows.

    Returns a tuple ``(tiny_data, full_data)``:

    * ``tiny_data`` -- the input without the 'title', 'imdb_movie_id',
      'number_of_views' and 'days_to_release' columns, restricted to rows
      whose 'theatres' value is finite, then reduced to the first entry
      per 'imdb_id'.
    * ``full_data`` -- the untouched input grouped by 'imdb_id'.
    """
    per_day_columns = ['title', 'imdb_movie_id', 'number_of_views',
                       'days_to_release']
    movie_columns = data.drop(per_day_columns, axis=1)

    # The mask is computed on the original frame; it lines up with the
    # reduced frame because dropping columns leaves the row index alone.
    has_theatre_count = np.isfinite(data['theatres'])
    one_row_per_movie = (movie_columns[has_theatre_count]
                         .groupby('imdb_id')
                         .first())

    raw_rows_by_movie = data.groupby('imdb_id')
    return (one_row_per_movie, raw_rows_by_movie)

In [5]:
# grouped_df: one row per movie; full_df: the raw rows grouped by 'imdb_id'.
grouped_df, full_df = group_data(data)

We will attempt some principal component analysis and momentum analysis on the IMDb views data. The following teases out the data and makes a sanity-check plot:

In [6]:
def get_views(grouped, show_sample_plots=True, plot_every=414,
              series_length=100):
    """Collect the per-movie IMDb view series from a grouped frame.

    Parameters
    ----------
    grouped : iterable of (key, DataFrame)
        Typically ``DataFrame.groupby('imdb_id')``. Every group needs the
        'number_of_views', 'days_to_release' and 'opening_we_bor' columns;
        groups that get plotted also need 'day_of_year', 'year' and
        'title'.
    show_sample_plots : bool
        When true, every ``plot_every``-th group is passed to
        ``get_mom_plot`` as a visual sanity check.
    plot_every : int
        Sampling interval for the sanity plots (previously the hard-coded
        value 414).
    series_length : int
        Only groups with exactly this many rows are kept (previously the
        hard-coded value 100).

    Returns
    -------
    dict
        Maps the opening-weekend box office, formatted as ``'%.2E'``, to a
        Series of view counts indexed by days to release.

    Notes
    -----
    The keys are rounded to three significant digits, so two movies whose
    box office formats to the same string overwrite one another and only
    the last one seen is kept.
    """
    imdbviews_list = {}
    for count, (_, group) in enumerate(grouped, start=1):
        # View counts indexed by days to release.
        num = np.array(group['number_of_views'])
        dtr = np.array(group['days_to_release'])
        imdbviews = Series(num, index=dtr)

        # The box office is constant within a group, so read the first row.
        bor = '%.2E' % Decimal(str(group['opening_we_bor'].iloc[0]))

        if show_sample_plots and count % plot_every == 0:
            # Title and release date only matter for the plot, so they are
            # parsed here rather than for every movie; a bad 'year' or
            # 'day_of_year' on a movie that is never plotted no longer
            # aborts the whole run.
            doy = int(group['day_of_year'].iloc[0])
            yr = int(group['year'].iloc[0])
            title = group['title'].iloc[0]
            date = str(datetime.datetime.strptime(str(yr) + ' ' + str(doy),
                                                  '%Y %j'))
            get_mom_plot(imdbviews, dtr, title, bor, date)

        if len(num) == series_length:
            imdbviews_list[bor] = imdbviews
    return imdbviews_list

The momentum band is plotted thusly:

In [9]:
def get_mom_plot(data, index, title, bor, date):
    """Plot a view series together with a band of rolling means.

    Momentum curves of this kind could be used to predict what a movie
    will do towards opening day; the same analysis might be better suited
    to the cumulative sum of Twitter sentiment.

    Parameters
    ----------
    data : Series
        View counts to plot.
    index : array-like
        Index (days to release) applied to each rolling-mean curve.
    title, bor, date : str
        Movie title, formatted opening-weekend box office and opening
        date; all three are only used in the figure title.

    Uses ``Series.rolling(...).mean()``; the module-level
    ``rolling_mean`` helper this replaces has been removed from pandas.
    """
    ax = data.plot(title='Momentum Plot \n' + title +
                   '  BOR($):' + bor + '  OpDay:' + date)
    ax.set_xlabel("days to release")
    ax.set_ylabel("number of views")

    # np.linspace yields floats, but a rolling window is a whole number of
    # observations, so truncate the ten evenly spaced widths to integers.
    for window in np.linspace(1, 80, 10).astype(int):
        smoothed = Series(data.rolling(window=int(window)).mean(),
                          index=index)
        # Draw on the same axes as the raw series.
        smoothed.plot(ax=ax, alpha=0.7)

    mplt.show()

In [None]:
# Build the per-movie view series; the default arguments also show sample plots.
views = get_views(full_df)

We can use a random forest on the existing data:

In [None]:
# Target: opening-weekend box office; features: four movie-level columns.
y = grouped_df['opening_we_bor']
X = grouped_df[['weekend_num', 'imdb_rating', 'budget', 'theatres']]

# Hold out 10% of the movies for testing; the split is seeded.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,
                                                random_state=12, test_size=0.1)

# Seed the forest as well: without random_state the fitted trees, and so
# the reported importances and r_squared, change on every run.
rf = RandomForestRegressor(n_estimators=100, random_state=12)
rf.fit(Xtrain, ytrain)

When plotting the true vs. predicted values, we get an r_squared value of roughly 0.8, not bad.. on a first run.

In [None]:
# Importance of each feature column in determining the BOR, largest first.
# Iterating over the DataFrame X yields its column names.
importances = [(round(score, 4), column)
               for score, column in zip(rf.feature_importances_, X)]
print(sorted(importances, reverse=True))

# How well did we predict the BOR on the held-out test set?
y_predict = rf.predict(Xtest)
r2 = r2_score(ytest, y_predict)
# Show the score; it was previously computed but never displayed.
print('r_squared on the test set: %.3f' % r2)

In addition to using the momentum charts, and time permitting, we can play around more with this data set.
The data for the views and BOR are stored in a dictionary with the following format: {"BOR": [view1, view2, ...],...}.
Assuming we can extract some significance out of the "shape" of the view pattern, we might attempt a PCA fit.

In [None]:
# One row of view counts per movie, in the dictionary's iteration order.
X1 = np.array([series for series in views.values()])
# Matching targets: the dictionary keys (formatted BOR strings).
y1 = np.array([bor for bor in views])
# Standardise with the mean and standard deviation of the whole matrix.
X2 = (X1 - X1.mean()) / X1.std()

For PCA to work properly, we have to classify the goal functions so that they represent brackets of BOR (i.e. they are not all unique). Once we change the values of X2, we might do something like the following:

In [None]:
# Fit PCA on the standardised view matrix. The original call had empty
# arguments and did not parse; PCA is unsupervised, so only the feature
# matrix is passed.
pca_ = PCA().fit(X2)

For future work (an afternoon project), we can factorize the 'genre' and 'lead_actor' columns to gain a better model. However, there don't seem to be enough movies for the latter column to make much of a difference.

In [None]:
# Encode the categorical columns of the one-row-per-movie table as integer
# codes. The table is named grouped_df in this notebook; the previous name
# `gdf` is not defined in any cell.
# factorize returns (codes, uniques); `actors` keeps the whole tuple.
actors = factorize(grouped_df['lead_actor'])
# For genre only the integer codes are kept.
genre = factorize(grouped_df['genre'])[0]
gen = DataFrame({'genre': genre})