- # Import Useful Packages
- # Bring in key features / labels
- # Determine features
- # Create dictionary to hold model results

## Import Useful Packages

In [None]:
import pandas as pd
import numpy as np
import pickle
import sys

## Bring in key features / labels

In [125]:
##########################################################################################
#################### Pulling in the Lookup Table for Movies #############################
##########################################################################################
lookup_table_movies = pd.read_pickle('lookup_table_movies.pickle')

##########################################################################################
#################### Pulling in the User Ratings #########################################
##########################################################################################

user_ratings = pd.read_pickle('user_ratings.pickle')

# This contains all the movieIDs for which we have already compiled metadata info available (i.e. from hetrec)
movies_with_metadata = lookup_table_movies["movieID"].unique()  #10197 in length

# This takes all the user ratings from the ml-latest download, and pairs it down to only those movies 
# for which we have metadata information.
user_ratings_for_movies_with_metadata = user_ratings[user_ratings["movieID"].isin(movies_with_metadata)] # 10196 in length

# This creates special lists that will be used to bias our training set towards users with a lot of ratings.
users_ordered_by_frequency_table = user_ratings_for_movies_with_metadata[["userID","original_9"]].groupby("userID", as_index = False).count().sort('original_9', ascending=False)
#users_ordered_by_frequency = list(users_ordered_by_frequency_table["userID"])
# These are the ID's of the 25000 most frequently rating users. 
#users_high_frequency_25000 = users_ordered_by_frequency[0:25000]  
users_over20_reviews = users_ordered_by_frequency_table[users_ordered_by_frequency_table["original_9"] > 20]["userID"]
users_over30_reviews = users_ordered_by_frequency_table[users_ordered_by_frequency_table["original_9"] > 30]["userID"]

#user_ratings_high_frequency = user_ratings_for_movies_with_metadata[user_ratings_for_movies_with_metadata["userID"].isin(users_w_most_ratings_25000)]
# These are the ID's of the other are the less frequently rating users
#users_low_frequency_remainder = users_ordered_by_frequency[25000:]
#user_ratings_low_frequency = user_ratings_for_movies_with_metadata[user_ratings_for_movies_with_metadata["userID"].isin(users_low_frequency_remainder)]

##########################################################################################
#################### Pulling in the Movie - X Feature Candidates #########################
##########################################################################################
imdb_and_rt_ratings_feature = pd.read_pickle('imdb_and_rt_ratings_feature.pickle') #<- ??Maybe I should consider dim. reducing these too?
imdb_and_rt_ratings_feature_reduced = imdb_and_rt_ratings_feature[["movieID","rtAllCriticsScore","rtAudienceScore"]]

misc_movie_features = pd.read_pickle('misc_movie_features.pickle')

#This is the full genre dummy feature... seems like perhaps I could test it whole since it's relatively small
genre_feature_dummied = pd.read_pickle('genre_feature_dummied.pickle')

#This is the svd shrunk features.
directors_feature_dim_reduced = pd.read_pickle('directors_feature_dim_reduced.pickle')
genre_feature_dim_reduced = pd.read_pickle('genre_feature_dim_reduced.pickle')
tags_feature_dim_reduced= pd.read_pickle('tags_feature_dim_reduced.pickle')
actors_feature_dim_reduced = pd.read_pickle('actors_feature_dim_reduced.pickle')



  if __name__ == '__main__':


## Create dictionary to hold model results

In [126]:
with open('model_results_dict.pickle', 'rb') as handle:
  model_results_dict = pickle.load(handle)

In [63]:
# model_results_dict = {}

# Sample Key: (label_type, feature_set, classifier_model, n_train_users, parameter_metadata_string ) = 

# Sample Stored Ouput:

# Model_Details
#  n_train_ratings
#  n_test_ratings, 
#  n_test_users, 
# feature weights

# Model_Performance
#  confusion matrix
#  accuracy, 
#  precision, 
#  recall, 
#  f_stat
# 

##################### Explanation of Possible Values ##################### 


# label_type = how the "y" is coded in.
# _______________________Example Values:_______________________
# # binned_5 = ratings of 1,2,3,4,5
# # original_9 = ratings of 0.5,1,1.5,2,2.5,3,3.5,4,4.5,5
# # binary = ratings of <3 or <= 3
# # binned_3 = ratings of <3, 3, >3


# # n_train_users = number of unique users in the training set (note: these are biased towards selecting from the top 25000 most frequent users)
# # n_test_users = number of unique users in the test set (note: these contain ALL of the users who not in the top 25k most frequent users and are biased towards selecting from the top 25000 most frequent users)

## feature_set - what features grouping is this?
# standard_features = year + genreDR + agg_ratingDR + director_DR + actorDR + tag_rating

# classifier_model = the type of model that was used for classifcation. i.e.
# _______________________Example Values:_______________________
# multiclass_mle == multinomial maximum liklihood estimation
# dtc = decision tree classifer
# rfc = random forrest classifier
# ada = adaboost classifier
# svm = support vector machines
# nn = neural networks

# # n_train_ratings = number of observations in the training set
# # n_test_ratings =  number of observations in the test set


# parameter_metadata_string = a string composed of what values I gave the paramters if the classifier involved paramters

At the end of this, you should have:
X_train, X_test, 
y_train_binary, y_test_binary
y_train_original_9, y_test_original_9
y_train_binned_3, y_train_binned_3
y_test_binned_5, y_test_binned_5
n_train_users -- len(x_train["users"].unique())

## (1) Create your "everything" matrix using the features you are interested in.
##### [Delete some old variables to free up cache memory] i.e. list of features etc...
## Choose # to have in your test set (from 25000 users) & split your data
##### [Delete some old variables to free up cache memory] i.e. All_data]
## Break out different "y" inputs depending on model
##### [Delete some old variables to free up cache memory] i.e. list of frequency users, y train matrix.


## (1) Create your "everything_matrix"
#### Bumps Memory usage from 2/3gb -> 11/12gb

In [127]:
# These steps may take a long time...
All_data = pd.merge(user_ratings_for_movies_with_metadata,imdb_and_rt_ratings_feature_reduced, on = "movieID")
print "Merge 1 Complete"
# All_data = pd.merge(All_data,misc_movie_features, on = "movieID")
print "Merge 2 Complete"
All_data = pd.merge(All_data,directors_feature_dim_reduced, on = "movieID")
print "Merge 3 Complete"
All_data = pd.merge(All_data,genre_feature_dim_reduced, on = "movieID")
print "Merge 4 Complete"
#All_data = pd.merge(All_data,tags_feature_dim_reduced, on = "movieID")
print "Merge 5 Complete"
#All_data = pd.merge(All_data,actors_feature_dim_reduced, on = "movieID")
#print "Merge 6 Complete"

Merge 1 Complete
Merge 2 Complete
Merge 3 Complete
Merge 4 Complete
Merge 5 Complete


In [128]:
All_data = All_data[All_data["userID"].isin(users_over30_reviews)]

##### [Delete some old variables to free up cache memory] i.e. list of features etc...

In [129]:
#### Reduces Memory usage from 8.6gb -> 6
this = sys.modules[__name__]
list_of_vars_to_be_removed = ["user_ratings", "movies_with_metadata","user_ratings_for_movies_with_metadata", 
                              "imdb_and_rt_ratings_feature","misc_movie_features",  "genre_feature_dummied",
                              "directors_feature_dim_reduced",  "genre_feature_dim_reduced","tags_feature_dim_reduced",
                              "actors_feature_dim_reduced", "lookup_table_movies","user_ratings_totals",
                              "users_ordered_by_frequency","users_w_most_ratings_25000","user_ratings_totals_top75000",
                              "user_ratings_for_movies_with_metadata_ratings_count","top25000","y_train",
                              "Y_not_top25000", "user_ratings_user_ratings_for_movies_with_metadata_small","X_train",
                             "imdb_and_rt_ratings_feature_reduced"]

# [v for v in globals().keys() if not v.startswith('_')] # <-- Seems to be the same as that
#for i in vars().keys():  # <-- Seems to be the same as that
for i in dir():    
    if i[0] != "_":
        if i in list_of_vars_to_be_removed:
            print i
            delattr(this,str(i))

%reset out        

X_train
actors_feature_dim_reduced
directors_feature_dim_reduced
genre_feature_dim_reduced
genre_feature_dummied
imdb_and_rt_ratings_feature
imdb_and_rt_ratings_feature_reduced
lookup_table_movies
misc_movie_features
movies_with_metadata
tags_feature_dim_reduced
user_ratings
user_ratings_for_movies_with_metadata
Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (6 entries)


# Some minor data cleanup...dropping NA's before splitting

In [130]:
All_data = All_data.dropna()    #Drops all rows with NA's

In [131]:
print All_data.shape
All_data.head()

(19295303, 29)


Unnamed: 0,userID,movieID,rating,binned_5,binned_3,binary,original_9,rtAllCriticsScore,rtAudienceScore,director_nmf_vector_0,...,genre_nmf_vector_0,genre_nmf_vector_1,genre_nmf_vector_2,genre_nmf_vector_3,genre_nmf_vector_4,genre_nmf_vector_5,genre_nmf_vector_6,genre_nmf_vector_7,genre_nmf_vector_8,genre_nmf_vector_9
1,13,169,1.0,1.0,1.0,0.0,2.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643
2,14,169,3.0,3.0,3.0,1.0,6.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643
3,17,169,1.0,1.0,1.0,0.0,2.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643
4,68,169,1.0,1.0,1.0,0.0,2.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643
5,178,169,2.5,3.0,1.0,0.0,5.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643


In [132]:
Y = All_data[["binned_5","binned_3","binary","original_9"]]
print Y.shape

(19295303, 4)


In [133]:
## I'm not sure why, but this takes a lot of time to load, so I'm going to try it my old way...
X = All_data

In [135]:
X.head()

Unnamed: 0,userID,movieID,rating,binned_5,binned_3,binary,original_9,rtAllCriticsScore,rtAudienceScore,director_nmf_vector_0,...,genre_nmf_vector_0,genre_nmf_vector_1,genre_nmf_vector_2,genre_nmf_vector_3,genre_nmf_vector_4,genre_nmf_vector_5,genre_nmf_vector_6,genre_nmf_vector_7,genre_nmf_vector_8,genre_nmf_vector_9
1,13,169,1.0,1.0,1.0,0.0,2.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643
2,14,169,3.0,3.0,3.0,1.0,6.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643
3,17,169,1.0,1.0,1.0,0.0,2.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643
4,68,169,1.0,1.0,1.0,0.0,2.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643
5,178,169,2.5,3.0,1.0,0.0,5.0,25.0,53.0,39.236357,...,0.073128,4.100759,1.114735,0.566696,0.733227,4.401808,3.831007,1.563178,0.231911,4.135643


In [136]:
this = sys.modules[__name__]
list_of_vars_to_be_removed = ["All_data"]

for i in dir():    
    if i[0] != "_":
        if i in list_of_vars_to_be_removed:
            print i
            delattr(this,str(i))

%reset out

All_data
Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (3 entries)


In [137]:
X = X.drop(["binned_5","binned_3","binary","original_9","rating"],axis = 1)

## split your data

In [7]:
# Splitting your data old / wrong way which chose only those users with a lot of ratings...

# n_train_users = 10
# n_test_users = len(All_data["userID"].unique()) - n_train_users
# lucky_training_set_users = np.random.choice(users_high_frequency_25000, n_train_users, replace = False )

# X_train = All_data[All_data["userID"].isin(lucky_training_set_users)].drop(['movieID', "original_9","binned_5","binned_3","binary"], axis=1)
# Y_train = All_data[["original_9","binned_5","binned_3","binary"]][All_data["userID"].isin(lucky_training_set_users)]

# X_test = All_data[-All_data["userID"].isin(lucky_training_set_users)].drop(['movieID', "original_9","binned_5","binned_3","binary"], axis=1)
# Y_test = All_data[["original_9","binned_5","binned_3","binary"]][-All_data["userID"].isin(lucky_training_set_users)]

# n_train_ratings = X_train.shape[0]
# n_test_ratings = X_test.shape[0]

In [138]:
from sklearn.cross_validation import train_test_split

In [139]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [140]:
n_train_ratings = X_train.shape[0]
n_test_ratings = X_test.shape[0]

In [141]:
print n_train_ratings
print n_test_ratings

15436242
3859061


##### [Delete some old variables to free up cache memory] i.e. All_data]

In [142]:
# Bumps data demands from 17gb to 12gb

this = sys.modules[__name__]
list_of_vars_to_be_removed = ["X","Y"]

for i in dir():    
    if i[0] != "_":
        if i in list_of_vars_to_be_removed:
            print i
            delattr(this,str(i))

%reset out

X
Y
Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (0 entries)


# Subdivide Y matrix into individual series
??? Could i have just run the models with a matrix set of y's??

In [143]:
y_train_original_9 = y_train["original_9"]
y_train_binned_5 = y_train["binned_5"]
y_train_binned_3 = y_train["binned_3"]
y_train_binary = y_train["binary"]

y_test_original_9 = y_test["original_9"]
y_test_binned_5 = y_test["binned_5"]
y_test_binned_3 = y_test["binned_3"]
y_test_binary = y_test["binary"]

In [144]:
y_train_binary.shape

(15436242,)

In [145]:
y_test_binary.shape

(3859061,)

In [146]:
x = 5
x

5

In [147]:
this = sys.modules[__name__]
list_of_vars_to_be_removed = ["y_train","y_test"]

for i in dir():    
    if i[0] != "_":
        if i in list_of_vars_to_be_removed:
            print i
            delattr(this,str(i))

%reset out

y_test
y_train
Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (3 entries)


# Some prepwork before running models...
### Importing Packages...
### Defining Functions

In [148]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [149]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier 

In [150]:
def create_model_object(model_type,model_params):
    if model_type == "dtc":
        from sklearn.tree import DecisionTreeClassifier
        return DecisionTreeClassifier()
    elif model_type == "MNB":
        from sklearn.naive_bayes import MultinomialNB
        return MultinomialNB()
    elif model_type == "RFC":
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(n_jobs= -1)
    elif model_type == "Ada":
        from sklearn.ensemble import AdaBoostClassifier
        return AdaBoostClassifier()

In [151]:
def model_score(y_true,y_pred, label_type, feature_weights):
    model_confusion_matrix = confusion_matrix(y_true, y_pred)
    model_accuracy_score = accuracy_score(y_true, y_pred)
    model_f1_score = f1_score(y_true, y_pred, average= "micro")
    model_precision = precision_score(y_true, y_pred, average= "micro")
    model_recall = recall_score(y_true, y_pred, average= "micro")
    
    # Generate Key
    # Sample Key: (label_type, feature_set, classifier_model, n_train_users )
    model_key = (label_type, feature_set, model_type, n_train_users,model_params)
    print model_key
    
    # Generate Value
    model_values = (n_train_ratings,n_test_ratings, n_test_users, feature_weights, 
                model_confusion_matrix, model_accuracy_score, model_precision,
               model_recall, model_f1_score)
    print model_values
    
    model_results_dict[model_key] = model_values

In [152]:
def run_model_save_results(feature_set,model_type, n_train_users, model_params):
    model_object = create_model_object(model_type,model_params)
    
    # Train model -> Predict Y-test -> Generate scores & save to dictionary
    model_object.fit(X_train,y_train_binary)
    print "y_train_binary - (1) Finished fitting model"
    try:
        feature_weights = zip(X_train.columns, model_object.feature_importances_)
    except:
        feature_weights = ""
    print "feature weights saved"
    y_pred = model_object.predict(X_test)
    print "y_train_binary - (2) Finished predicting y_pred (test set)"
    model_score(y_test_binary,y_pred,"binary", feature_weights)
    print "y_train_binary - (3) Finished generating scores"
    
    # Train model -> Predict Y-test -> Generate scores & save to dictionary
    model_object.fit(X_train,y_train_binned_5)
    print "y_train_binned_5 - (1) Finished fitting model"
    try:
        feature_weights = zip(X_train.columns, model_object.feature_importances_)
    except:
        feature_weights = ""
    print "feature weights saved"
    y_pred = model_object.predict(X_test)
    print "y_train_binned_5 - (2) Finished predicting y_pred (test set)"
    model_score(y_test_binned_5,y_pred, "binned_5",feature_weights)
    print "y_train_binned_5 - (3) Finished generating scores"

    # Train model -> Predict Y-test -> Generate scores & save to dictionary
    model_object.fit(X_train,y_train_binned_3)
    print "y_train_binned_3 - (1) Finished fitting model"
    try:
        feature_weights = zip(X_train.columns, model_object.feature_importances_)
    except:
        feature_weights = ""
    print "feature weights saved"
    y_pred = model_object.predict(X_test)
    print "y_train_binned_3 - (2) Finished predicting y_pred (test set)"
    model_score(y_test_binned_3,y_pred, "binned_3",feature_weights)
    print "y_train_binned_3 - (3) Finished generating scores"
    
    # Train model -> Predict Y-test -> Generate scores & save to dictionary
    model_object.fit(X_train,y_train_original_9)
    print "y_train_original_9 - (1) Finished fitting model"
    try:
        feature_weights = zip(X_train.columns, model_object.feature_importances_)
    except:
        feature_weights = ""
    print "feature weights saved"
    y_pred = model_object.predict(X_test)
    print "y_train_original_9 - (2) Finished predicting y_pred (test set)"
    model_score(y_test_original_9,y_pred, "original_9",feature_weights)
    print "y_train_original_9 - (3) Finished generating scores"
    
    print
    print "whooray, you're done with this set"    

# Finally, let's run some models

# ! I'm not sure what happened here...may need to investigate...

In [153]:
n_train_users = len(X_train["userID"].unique())
n_test_users = len(X_test["userID"].unique())

In [154]:
# Some prelimary variables
feature_set = "UsersOver30_UserRatings_Genre_Director"
model_type = "MNB"
model_params = ""

run_model_save_results(feature_set,model_type,n_train_users,model_params)

y_train_binary - (1) Finished fitting model
feature weights saved
y_train_binary - (2) Finished predicting y_pred (test set)




('binary', 'UsersOver30_UserRatings_Genre_Director', 'MNB', 116709, '')
(15436242, 3859061, 116701, '', array([[ 178820,  503083],
       [ 738944, 2438214]]), 0.67815305329457087, 0.82895878926881572, 0.76741981355664401, 0.79700316501469726)
y_train_binary - (3) Finished generating scores
y_train_binned_5 - (1) Finished fitting model
feature weights saved
y_train_binned_5 - (2) Finished predicting y_pred (test set)
('binned_5', 'UsersOver30_UserRatings_Genre_Director', 'MNB', 116709, '')
(15436242, 3859061, 116701, '', array([[   1578,    2278,  124326,   45448,    4642],
       [   2435,    3373,  235366,   81588,    9279],
       [   5763,    7485,  719756,  232059,   29650],
       [   8321,   10048, 1021646,  407815,   50229],
       [   3490,    4052,  606178,  211798,   30458]]), 0.30136346639765477, 0.30136346639765477, 0.30136346639765477, 0.30136346639765477)
y_train_binned_5 - (3) Finished generating scores
y_train_binned_3 - (1) Finished fitting model
feature weights save



In [156]:
x = 5
x

5

In [155]:
# Some prelimary variables
feature_set = "UsersOver30_UserRatings_Genre_Director"
model_type = "RFC"
model_params = ""

run_model_save_results(feature_set,model_type,n_train_users,model_params)

KeyboardInterrupt: 

In [None]:
# Some prelimary variables
feature_set = "UsersOver30_UserRatings_Genre_Director"
model_type = "Ada"
model_params = ""

run_model_save_results(feature_set,model_type,n_train_users,model_params)

In [117]:
model_results_dict[model_key] = model_values

In [100]:
model_results_dict.keys()

[('binned_3', 'All_main', 'MNB', 6664, ''),
 ('original_9', 'All_main', 'RFC', 6664, ''),
 ('binned_5', 'All_main', 'dtc', 10, ''),
 ('binary', 'All_main', 'MNB', 6664, ''),
 ('binned_3', 'All_main', 'RFC', 6664, ''),
 ('binary', 'All_main', 'Ada', 6664, ''),
 ('binned_3', 'All_main', 'dtc', 10, ''),
 ('binned_3', 'All_main', 'Ada', 6664, ''),
 ('binned_5', 'All_main', 'Ada', 6664, ''),
 ('original_9', 'All_main', 'MNB', 6664, ''),
 ('binned_5', 'All_main', 'MNB', 6664, ''),
 ('binary', 'All_main', 'dtc', 10, ''),
 ('binned_5', 'All_main', 'RFC', 6664, ''),
 ('binary', 'All_main', 'RFC', 6664, ''),
 ('original_9', 'All_main', 'Ada', 6664, '')]

In [121]:
for i in [y_label_type, feature_set,model_type, user_count, n_train_ratings,n_test_users,model_accuracy_score, model_precision,model_recall,model_f1_score]:
    i = list()

In [122]:
type(y_label_type)

str

In [123]:
for k,v in model_results_dict.iteritems():
    y_label_type = k[0]
    feature_set = k[1]
    model_type = k[2]
    user_count = k[3]
    
    n_train_ratings = v[0]
    n_test_users = v[2]
    model_accuracy_score = v[5]
    model_precision= v[6]
    model_recall= v[7]
    model_f1_score= v[8]

In [113]:
model_results_df = pd.DataFrame([y_label_type, feature_set,model_type, user_count, n_train_ratings,n_test_users,model_accuracy_score, model_precision,model_recall,model_f1_score])

In [114]:
model_results_df

Unnamed: 0,0
0,original_9
1,All_main
2,Ada
3,6664
4,7000
5,2942
6,0.277667
7,0.277667
8,0.277667
9,0.277667


In [101]:
with open('model_results_dict.pickle', 'wb') as f:
    pickle.dump(model_results_dict, f)

In [102]:
#sorted([v for v in globals().keys() if not v.startswith('_')])

In [103]:

# this = sys.modules[__name__]
# list_of_vars_to_be_removed = ["y_pred"]

# # [v for v in globals().keys() if not v.startswith('_')] # <-- Seems to be the same as that
# #for i in vars().keys():  # <-- Seems to be the same as that
# for i in dir():    
#     if i[0] != "_":
#         if i in list_of_vars_to_be_removed:
#             print i
#             delattr(this,str(i))

# %reset out        