In [1]:
# Given a dataset of user <> movie <> rating <> date where we subset to only be the movies
# outside the top 500 movies (check success for imdb top 500 vs. facebook like top 500)
# Can we then give you a dataset of movie <> rating <> date and have you predict user
import pandas as pd


In [2]:
ratings = pd.read_csv("./movie_metadata.csv")

In [3]:
titles=pd.read_csv("./movie_titles.csv")

In [4]:
titles.head()

Unnamed: 0,movieID,year,title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [5]:
ratings.columns.values 

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'], dtype=object)

In [57]:
ratings=ratings.sort_values('imdb_score', ascending=False)
ratings.head()
top500_imdb_ratings=ratings.head(n=500)
top100_imdb_ratings=ratings.head(n=100)
top2000_imdb_ratings=ratings.head(n=2000)

ratings_facebook = ratings.sort_values('movie_facebook_likes', ascending=False)
ratings_facebook.head()
top500_fb_ratings = ratings_facebook.head(n=500)
top100_fb_ratings = ratings_facebook.head(n=100)

#create a boolean column in the movie titles for whether that movie is in the top 100 or 500 #for facebook and IMDB
titles['top100fb']=titles.title.apply(lambda x: any((i for i in top100_fb_ratings.movie_title if i.find(x) >= 0)))
titles['top500fb']=titles.title.apply(lambda x: any((i for i in top500_fb_ratings.movie_title if i.find(x) >= 0)))
titles['top100imdb']=titles.title.apply(lambda x: any((i for i in top100_imdb_ratings.movie_title if i.find(x) >= 0)))
titles['top500imdb']=titles.title.apply(lambda x: any((i for i in top500_imdb_ratings.movie_title if i.find(x) >= 0)))
titles['top2000imdb']=titles.title.apply(lambda x: any((i for i in top2000_imdb_ratings.movie_title if i.find(x) >= 0)))

In [76]:
#want data in the form movie <> userID <> rating <> date
import os
import re
#list the files'
filelist = os.listdir('/Users/Elissa/Documents/University of Maryland/Graduate/cmsc818o-attack/training_set2') 

fb_100_data = []
fb_500_data = []
imdb_100_data= []
imdb_500_data= []
imdb_2000_data= []
all_imdb_data = []
for file in filelist:
    frame = pd.read_csv(('/Users/Elissa/Documents/University of Maryland/Graduate/cmsc818o-attack/training_set2/'+file), parse_dates=True, header=1)
    filename_raw= os.path.basename(file)
    try:
        filename = re.search('mv_(.+?).txt', filename_raw).group(1)
    except AttributeError:
        filename = ''
    frame['movieID'] = int(filename)
    frame.columns=['uid','rating','date','movieID']
    
    #want to check if this movieID has a true in any of the four columns in titles
    row = titles.loc[titles['movieID'] == int(filename)]
    cnt=row.index.tolist()[0]
    if(not (row.get_value(cnt,'top100fb'))):
        fb_100_data.append(frame)
    if(not (row.get_value(cnt,'top500fb'))):
        fb_500_data.append(frame)
    if(not (row.get_value(cnt,'top100imdb'))):
        imdb_100_data.append(frame)
    if(not (row.get_value(cnt,'top500imdb'))):
        imdb_500_data.append(frame)
    if(not (row.get_value(cnt,'top2000imdb'))):
        imdb_2000_data.append(frame)
    all_imdb_data.append(frame)
       
#concatenate them together
imdb_100_df = pd.concat(imdb_100_data)
imdb_500_df = pd.concat(imdb_500_data)
imdb_2000_df = pd.concat(imdb_2000_data)
fb_100_df = pd.concat(fb_100_data)
fb_500_df = pd.concat(fb_500_data)
all_imdb_df = pd.concat(all_imdb_data)

imdb_100_df['date']=pd.to_datetime(imdb_100_df['date'], yearfirst=True)
imdb_100_df['date']=imdb_100_df.date.apply(lambda x: x.toordinal())
imdb_500_df['date']=pd.to_datetime(imdb_500_df['date'], yearfirst=True)
imdb_500_df['date']=imdb_500_df.date.apply(lambda x: x.toordinal())
imdb_2000_df['date']=pd.to_datetime(imdb_2000_df['date'], yearfirst=True)
imdb_2000_df['date']=imdb_2000_df.date.apply(lambda x: x.toordinal())
all_imdb_df['date']=pd.to_datetime(all_imdb_df['date'], yearfirst=True)
all_imdb_df['date']=all_imdb_df.date.apply(lambda x: x.toordinal())
fb_100_df['date']=pd.to_datetime(fb_100_df['date'], yearfirst=True)
fb_100_df['date']=fb_100_df.date.apply(lambda x: x.toordinal())
fb_500_df['date']=pd.to_datetime(fb_500_df['date'], yearfirst=True)
fb_500_df['date']=fb_500_df.date.apply(lambda x: x.toordinal())

In [59]:
len(fb_100_df.index)
len(imdb_100_df.index)

352594

In [60]:
len(fb_500_df.index)
len(imdb_500_df.index)

349317

In [61]:
#split into training and test set where you have X number of movies 
#watched by a given user, their ratings, and dates and then you try to predict
#in the test set the userID

#show that given n movie ratings we can detect the user in the full dataset

In [77]:
training_imdb_2000 = imdb_2000_df[['uid', 'rating', 'date','movieID']].groupby(['uid']).head(2)
training_imdb_500 = imdb_500_df[['uid', 'rating', 'date','movieID']].groupby(['uid']).head(2)
training_imdb_100 = imdb_100_df[['uid', 'rating', 'date','movieID']].groupby(['uid']).head(2)
training_fb_500 = fb_500_df[['uid', 'rating', 'date','movieID']].groupby(['uid']).head(2)
training_fb_100 = fb_100_df[['uid', 'rating', 'date','movieID']].groupby(['uid']).head(2)
training_all_imdb = all_imdb_df[['uid', 'rating', 'date','movieID']].groupby(['uid']).head(2)



training_imdb_500_4p=training_imdb_500.sample(n=int(training_imdb_500.size*0.04))
training_labels_imdb_500_4p = training_imdb_500_4p[['uid']].copy().as_matrix()
training_imdb_500_4p = training_imdb_500_4p[['rating', 'date','movieID']].copy()
training_imdb_500_3p=training_imdb_500.sample(n=int(training_imdb_500.size*0.03))
training_labels_imdb_500_3p = training_imdb_500_3p[['uid']].copy().as_matrix()
training_imdb_500_3p = training_imdb_500_3p[['rating', 'date','movieID']].copy()
training_imdb_500_2p=training_imdb_500.sample(n=int(training_imdb_500.size*0.02))
training_labels_imdb_500_2p = training_imdb_500_2p[['uid']].copy().as_matrix()
training_imdb_500_2p = training_imdb_500_2p[['rating', 'date','movieID']].copy()
training_imdb_500_1p=training_imdb_500.sample(n=int(training_imdb_500.size*0.01))
training_labels_imdb_500_1p = training_imdb_500_1p[['uid']].copy().as_matrix()
training_imdb_500_1p = training_imdb_500_1p[['rating', 'date','movieID']].copy()

training_imdb_100_4p=training_imdb_100.sample(n=int(training_imdb_100.size*0.04))
training_labels_imdb_100_4p = training_imdb_100_4p[['uid']].copy().as_matrix()
training_imdb_100_4p = training_imdb_100_4p[['rating', 'date','movieID']].copy()

training_fb_100_4p=training_fb_100.sample(n=int(training_imdb_100.size*0.04))
training_labels_fb_100_4p = training_fb_100_4p[['uid']].copy().as_matrix()
training_fb_100_4p = training_fb_100_4p[['rating', 'date','movieID']].copy()

training_fb_500_4p=training_fb_500.sample(n=int(training_fb_500.size*0.04))
training_labels_fb_500_4p = training_fb_500_4p[['uid']].copy().as_matrix()
training_fb_500_4p = training_fb_500_4p[['rating', 'date','movieID']].copy()

training_imdb_2000_4p=training_imdb_2000.sample(n=int(training_imdb_2000.size*0.04))
training_labels_imdb_2000_4p = training_imdb_2000_4p[['uid']].copy().as_matrix()
training_imdb_2000_4p = training_imdb_2000_4p[['rating', 'date','movieID']].copy()

training_all_imdb_4p=training_all_imdb.sample(n=int(training_all_imdb.size*0.04))
training_labels_all_imdb_4p = training_all_imdb_4p[['uid']].copy().as_matrix()
training_all_imdb_4p = training_all_imdb_4p[['rating', 'date','movieID']].copy()



In [63]:
training_imdb_500_4p
#imdb_500_df[['uid', 'rating', 'date','movieID']].groupby(['uid']).agg(['mean', 'count'])

Unnamed: 0,rating,date,movieID
15316,2,732042,58
27510,3,731701,30
60173,2,731998,30
5243,4,732174,30
441,4,732084,57
24918,2,731680,30
5812,4,731722,30
40272,4,731690,30
3062,5,731659,79
27174,4,731811,28


In [78]:
test_imdb_500_4p = imdb_500_df.sample(n=int(training_imdb_500.size*0.04))
test_imdb_500_nouid_4p= test_imdb_500_4p[['rating','date','movieID']].copy()
test_imdb_500_3p = imdb_500_df.sample(n=int(training_imdb_500.size*0.03))
test_imdb_500_nouid_3p= test_imdb_500_3p[['rating','date','movieID']].copy()
test_imdb_500_2p = imdb_500_df.sample(n=int(training_imdb_500.size*0.02))
test_imdb_500_nouid_2p= test_imdb_500_2p[['rating','date','movieID']].copy()
test_imdb_500_1p = imdb_500_df.sample(n=int(training_imdb_500.size*0.01))
test_imdb_500_nouid_1p= test_imdb_500_1p[['rating','date','movieID']].copy()


test_imdb_100_4p = imdb_100_df.sample(n=int(training_imdb_100.size*0.04))
test_imdb_100_nouid_4p= test_imdb_100_4p[['rating','date','movieID']].copy()
test_fb_500_4p = fb_500_df.sample(n=int(training_fb_500.size*0.04))
test_fb_500_nouid_4p= test_fb_500_4p[['rating','date','movieID']].copy()
test_fb_100_4p = fb_100_df.sample(n=int(training_fb_100.size*0.04))
test_fb_100_nouid_4p= test_fb_100_4p[['rating','date','movieID']].copy()

test_imdb_2000_4p = imdb_2000_df.sample(n=int(training_imdb_2000.size*0.04))
test_imdb_2000_nouid_4p= test_imdb_2000_4p[['rating','date','movieID']].copy()

test_all_imdb_4p = all_imdb_df.sample(n=int(training_all_imdb.size*0.04))
test_all_imdb_nouid_4p= test_all_imdb_4p[['rating','date','movieID']].copy()

In [65]:
#training['date']=pd.to_datetime(training['date'], yearfirst=True)
#training['date']=training.date.apply(lambda x: x.toordinal())
#training_small = training.sample(n=int(training.size*0.04))
#training_small.size
#labels=training_small[['uid']].copy().as_matrix()
#training_small_nouid=training_small[['rating','date','movieID']].copy()

In [79]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(training_all_imdb_4p, training_labels_all_imdb_4p)

In [80]:
predict=clf.predict(test_all_imdb_nouid_4p)

In [68]:
#test_small['uid'].as_matrix()

In [69]:
#predict

In [81]:
from sklearn.metrics import accuracy_score
accuracy_score(test_all_imdb_4p['uid'].as_matrix(),predict, normalize=True)

0.058351585014409221