In [35]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

r_cols = ['user_id','movie_id', 'rating','unix_timestamp']
ratings=pd.read_csv('ratings.csv', sep=',', names=r_cols, encoding='latin-1',skiprows=1)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [36]:
ratings_df = ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating')
ratings_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [38]:
ratings_data_item_list=list(ratings_df)

In [39]:
item_feature_matrix=pd.read_csv('item_feature_matrix_ex_crew.csv', sep=',', encoding='latin-1',skiprows=0)

In [40]:
item_feature_matrix.head()

Unnamed: 0.1,Unnamed: 0,movieId,budget1,budget2,budget3,budget4,budget5,revenue1,revenue2,revenue3,...,Romanian,Arabic,Estonian,Indonesian,Lao,running_time1,running_time2,running_time3,directed_high,directed_low
0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,1,2,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,2,3,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,3,4,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,4,5,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [41]:
item_feature_item_list=list(item_feature_matrix['movieId'])

In [42]:
dropped_items_list_from_ratings=list(set(ratings_data_item_list)-set(item_feature_item_list))
print dropped_items_list_from_ratings, len(dropped_items_list_from_ratings)

[62336, 769, 94466, 108548, 27611, 106642, 150548, 90647, 26649, 126106, 73759, 2851, 55207, 77359, 162376, 108979, 99764, 108727, 52281, 31193, 79299, 26693, 150856, 96075, 27724, 72781, 7502, 5069, 720, 721, 4051, 4568, 69849, 77658, 26587, 32352, 100450, 108583, 1133, 4207, 7669, 85780] 42


In [43]:
ratings_df.drop(dropped_items_list_from_ratings,axis=1)

movie_id,1,2,3,4,5,6,7,8,9,10,...,160718,161084,161155,161594,161830,161918,161944,162542,162672,163949
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,3.0,,,,,,,,,3.0,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,4.0,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [51]:
print ratings_df.shape,sum(ratings_df.count())

(671, 9066) 100004


In [45]:
import random
ix = [(row, col) for row in range(ratings_df.shape[0]) for col in range(ratings_df.shape[1])]
pointer_list=[] #pointers_list - the list that contains all tuples of ratings in the ratings universe
for row,col in ix:
    if ratings_df.iat[row,col]>0:
        pointer_list.append((row,col))
print 'len(pointer_list)',len(pointer_list)
sample_20=random.sample(pointer_list, int(round(.2*len(pointer_list))))#20% of sample overwritten with NaN
print 'len(sample_20)',len(sample_20)
sample_80=list(set(pointer_list)-set(sample_20))
print len(sample_80)#rest of the 8-% sample from initial ratings set
print sum(ratings_df.count())

len(pointer_list) 100004
len(sample_20) 20001
80003
100004


In [47]:
test_df=ratings_df.copy()
train_df=ratings_df.copy()

In [48]:
for row, col in sample_20:
    train_df.iat[row, col] = np.nan #train_df gets updated with 80% of the data
print train_df.shape,sum(train_df.count())

(671, 9066) 80003


In [50]:
for row, col in sample_80:
    test_df.iat[row, col] = np.nan #test_df gets updated with 80% of the data
print test_df.shape,sum(test_df.count())

(671, 9066) 20001


In [52]:
train_df.to_csv('train_unn.csv')
test_df.to_csv('test_unn.csv')
ratings_df.to_csv('ratings_unn.csv')

In [54]:
train_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [55]:
test_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [56]:
ratings_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
