In [1]:
# runtime: 20 sec
import pandas as pd
# Lift pandas' display limits so frames are shown with every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Load the ratings table from the working directory.
df = pd.read_csv("rating.csv")

In [2]:
# Preview the first ten rows of the ratings frame.
df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
5,1,112,3.5,2004-09-10 03:09:00
6,1,151,4.0,2004-09-10 03:08:54
7,1,223,4.0,2005-04-02 23:46:13
8,1,253,4.0,2005-04-02 23:35:40
9,1,260,4.0,2005-04-02 23:33:46


In [3]:
# Number of rating rows loaded.
len(df)

20000263

In [4]:
column = df["userId"]
# Largest userId present. This equals the number of users only when the ids
# run 1..N without gaps, so it is kept for reference but not used as the count.
max_value = column.max()
# Count distinct users directly instead of relying on contiguous ids.
n_users = column.nunique()
n_users
# Output: total number of distinct users in the data frame

138493

In [5]:
# runtime: 3 sec
# Choose the user for whom to give recommendations by sampling one rating row
# and taking its userId. Because rows (not distinct ids) are sampled, users
# with more ratings are more likely to be chosen.
# A fixed random_state makes the choice repeatable across kernel restarts;
# change the seed to pick a different user.
TARGET_USER_SEED = 42
# .iloc[0] extracts the scalar explicitly; calling int() on a one-element
# Series is deprecated in recent pandas versions.
user_id = int(column.sample(1, random_state=TARGET_USER_SEED).iloc[0])
user_id

4655

In [6]:
# Keep only the rating rows that belong to the target user.
is_target_user = df["userId"] == user_id
user_df = df[is_target_user]
# Number of ratings the target user has given.
len(user_df)

173

In [7]:
# Collect the ids of the movies rated by the target user into a plain list.
movies_watched = user_df["movieId"].to_list()
# Show the first ten ids as a quick check.
print(movies_watched[:10])


[1, 2, 17, 19, 29, 47, 260, 293, 296, 318]


In [8]:
# Runs slow (10 seconds?)
# Every rating row, from any user, that refers to a movie the target user rated.
rated_by_target = df["movieId"].isin(movies_watched)
movies_watched_df = df[rated_by_target]
movies_watched_df.head(20)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
3,1,47,3.5,2005-04-02 23:32:07
9,1,260,4.0,2005-04-02 23:33:46
10,1,293,4.0,2005-04-02 23:31:43
11,1,296,4.0,2005-04-02 23:32:47
12,1,318,4.0,2005-04-02 23:33:18
13,1,337,3.5,2004-09-10 03:08:29
17,1,593,3.5,2005-04-02 23:31:01
22,1,1036,4.0,2005-04-02 23:44:40


In [43]:
# Number of entries in the movies_watched list.
len(movies_watched)

173

In [44]:
# For each user, count how many rows of movies_watched_df they contributed,
# i.e. how many of the target user's movies they have rated.
user_movie_count = movies_watched_df.groupby("userId")["movieId"].count()
user_movie_count.head(10)
# len(user_movie_count) can be compared with the user total printed earlier; it shows this step alone does not narrow the set of users much.

userId
1     41
2      7
3     20
4      3
5     14
6      5
7     36
8     11
9      8
10     9
Name: movieId, dtype: int64

In [45]:
# Threshold on shared movies: a user is kept only if they have rated this
# proportion of the movies the target user has rated.
MIN_SHARED_RATIO = 0.5
m_count = MIN_SHARED_RATIO * len(movies_watched)
m_count

86.5

In [46]:
# Turn the per-user counts into a two-column frame (userId, movie_count).
# The isinstance guard makes the cell safe to re-run: once user_movie_count is
# already a frame, resetting the index again would add a third column and the
# column renaming would fail.
if isinstance(user_movie_count, pd.Series):
    user_movie_count = user_movie_count.rename("movie_count").reset_index()

# Choosing 50% instead of 60% makes a huge difference in number of records that finally get through.

# Keep the users who rated at least m_count of the target user's movies
# (ratio 0.50). The comparison is inclusive (>=) to match "at least"; with a
# strict ">" a user sitting exactly on the threshold would be excluded.
users_same_movies = (
    user_movie_count[user_movie_count["movie_count"] >= m_count]
    .sort_values("movie_count", ascending=False)
)
users_same_movies.nunique()
# Output: the userId entry is the number of users that rated at least 50% of the movies rated by the target user

userId         4598
movie_count      78
dtype: int64

In [13]:
# First ten rows; the frame was sorted by movie_count in descending order.
users_same_movies.head(10)

Unnamed: 0,userId,movie_count
4579,4655,173
8263,8405,167
103817,105580,166
113321,115226,163
33985,34576,163
116254,118205,163
130932,133133,161
78855,80207,161
6266,6373,160
134019,136268,159


In [14]:
# Ids of the users that passed the shared-movie threshold, as a plain list.
similar_users = users_same_movies["userId"].to_list()
# All ratings (for any movie) given by those users.
from_similar_user = df["userId"].isin(similar_users)
similar_users_df = df[from_similar_user]

similar_users_df.head(20)

Unnamed: 0,userId,movieId,rating,timestamp
9621,91,1,4.0,2005-03-22 22:46:02
9622,91,2,3.5,2005-03-29 01:55:58
9623,91,3,3.0,2005-03-22 22:43:12
9624,91,7,2.5,2005-07-18 08:09:11
9625,91,10,4.0,2005-07-18 07:59:03
9626,91,11,4.0,2005-07-18 08:01:15
9627,91,19,2.0,2005-09-12 06:46:12
9628,91,21,4.0,2005-03-23 03:36:57
9629,91,22,4.0,2005-09-12 06:51:31
9630,91,24,3.5,2005-04-11 06:40:37


In [15]:
# Number of rating rows belonging to the selected users.
len(similar_users_df)

4295017

In [16]:
# The next goal is to line up the ratings by movie and userId: the movies rated by the target user
# become the row index and the userIds become the columns. DataFrame.corr() finds the pairwise
# correlation between columns.


In [17]:
 # First ten ratings of the target user.
 user_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
701874,4655,1,3.5,2005-12-21 15:58:50
701875,4655,2,2.0,2005-12-22 10:12:33
701876,4655,17,4.0,2005-12-22 10:12:06
701877,4655,19,1.5,2005-12-21 15:49:41
701878,4655,29,2.5,2005-12-21 16:01:08
701879,4655,47,3.5,2005-12-21 15:55:05
701880,4655,260,4.0,2005-12-21 15:54:53
701881,4655,293,5.0,2005-12-21 15:55:45
701882,4655,296,4.0,2005-12-21 15:54:25
701883,4655,318,5.0,2005-12-21 15:53:50


In [18]:
# Rows of movies_watched_df that belong to the target user.
movies_watched_df[movies_watched_df["userId"]==user_id]

Unnamed: 0,userId,movieId,rating,timestamp
701874,4655,1,3.5,2005-12-21 15:58:50
701875,4655,2,2.0,2005-12-22 10:12:33
701876,4655,17,4.0,2005-12-22 10:12:06
701877,4655,19,1.5,2005-12-21 15:49:41
701878,4655,29,2.5,2005-12-21 16:01:08
701879,4655,47,3.5,2005-12-21 15:55:05
701880,4655,260,4.0,2005-12-21 15:54:53
701881,4655,293,5.0,2005-12-21 15:55:45
701882,4655,296,4.0,2005-12-21 15:54:25
701883,4655,318,5.0,2005-12-21 15:53:50


In [19]:
# Check the container type of similar_users (it was built with tolist()).
type(similar_users)

list

In [20]:
# Time-consuming: half a minute
# Restrict the shared-movie ratings to the selected users.
from_similar_user = movies_watched_df["userId"].isin(similar_users)
movies_watched_df2 = movies_watched_df[from_similar_user]
# The timestamp plays no part in the similarity computation.
movies_watched_df3 = movies_watched_df2.drop(columns=["timestamp"])
# One row per user, one column per movie; the remaining column (rating)
# supplies the cell values.
movies_watched_df_pivot = movies_watched_df3.pivot(columns="movieId", index="userId")

In [21]:
# First ten rows of the pivot: one row per userId, one column per movieId.
movies_watched_df_pivot.head(10)

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,17,19,29,47,260,293,296,318,319,337,345,356,364,500,509,551,588,593,594,597,736,778,780,904,917,1036,1089,1175,1196,1206,1208,1210,1216,1222,1246,1249,1258,1446,1517,1527,1580,1617,1682,1704,1721,1923,2028,2125,2232,2291,2324,2329,2359,2541,2542,2571,2580,2683,2692,2706,2710,2724,2762,2858,2908,2959,2997,3000,3081,3160,3176,3408,3481,3483,3578,3751,3793,3897,3908,3967,3988,3996,4011,4022,4027,4226,4235,4239,4246,4306,4308,4310,4370,4447,4718,4720,4874,4878,4886,4963,4973,4976,4979,4993,5015,5218,5377,5378,5444,5464,5502,5528,5618,5669,5791,5890,5902,5952,5971,5994,5995,6218,6291,6333,6377,6378,6385,6539,6711,6863,6867,6874,6953,7022,7099,7139,7147,7151,7153,7154,7323,7361,7372,7382,7438,8360,8368,8636,8645,8798,8873,8949,8961,8970,26662,27416,27721,27731,30749,30793,31658,32587,33166,33493,33794,37729,37733,38038,38061,40815,41569
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2,Unnamed: 88_level_2,Unnamed: 89_level_2,Unnamed: 90_level_2,Unnamed: 91_level_2,Unnamed: 92_level_2,Unnamed: 93_level_2,Unnamed: 94_level_2,Unnamed: 95_level_2,Unnamed: 96_level_2,Unnamed: 97_level_2,Unnamed: 98_level_2,Unnamed: 99_level_2,Unnamed: 
100_level_2,Unnamed: 101_level_2,Unnamed: 102_level_2,Unnamed: 103_level_2,Unnamed: 104_level_2,Unnamed: 105_level_2,Unnamed: 106_level_2,Unnamed: 107_level_2,Unnamed: 108_level_2,Unnamed: 109_level_2,Unnamed: 110_level_2,Unnamed: 111_level_2,Unnamed: 112_level_2,Unnamed: 113_level_2,Unnamed: 114_level_2,Unnamed: 115_level_2,Unnamed: 116_level_2,Unnamed: 117_level_2,Unnamed: 118_level_2,Unnamed: 119_level_2,Unnamed: 120_level_2,Unnamed: 121_level_2,Unnamed: 122_level_2,Unnamed: 123_level_2,Unnamed: 124_level_2,Unnamed: 125_level_2,Unnamed: 126_level_2,Unnamed: 127_level_2,Unnamed: 128_level_2,Unnamed: 129_level_2,Unnamed: 130_level_2,Unnamed: 131_level_2,Unnamed: 132_level_2,Unnamed: 133_level_2,Unnamed: 134_level_2,Unnamed: 135_level_2,Unnamed: 136_level_2,Unnamed: 137_level_2,Unnamed: 138_level_2,Unnamed: 139_level_2,Unnamed: 140_level_2,Unnamed: 141_level_2,Unnamed: 142_level_2,Unnamed: 143_level_2,Unnamed: 144_level_2,Unnamed: 145_level_2,Unnamed: 146_level_2,Unnamed: 147_level_2,Unnamed: 148_level_2,Unnamed: 149_level_2,Unnamed: 150_level_2,Unnamed: 151_level_2,Unnamed: 152_level_2,Unnamed: 153_level_2,Unnamed: 154_level_2,Unnamed: 155_level_2,Unnamed: 156_level_2,Unnamed: 157_level_2,Unnamed: 158_level_2,Unnamed: 159_level_2,Unnamed: 160_level_2,Unnamed: 161_level_2,Unnamed: 162_level_2,Unnamed: 163_level_2,Unnamed: 164_level_2,Unnamed: 165_level_2,Unnamed: 166_level_2,Unnamed: 167_level_2,Unnamed: 168_level_2,Unnamed: 169_level_2,Unnamed: 170_level_2,Unnamed: 171_level_2,Unnamed: 172_level_2,Unnamed: 173_level_2
91,4.0,3.5,,2.0,,4.0,4.5,,3.5,4.0,,,4.0,4.0,4.5,4.0,,,,3.0,3.5,3.0,3.0,4.0,3.0,4.5,,,2.5,,5.0,5.0,4.0,4.5,,3.5,4.5,,4.5,,4.0,3.5,3.5,3.5,3.5,2.5,3.5,2.5,4.0,,4.5,3.5,3.0,3.5,,,4.5,3.5,2.5,4.0,4.0,3.5,3.0,,4.0,4.5,,5.0,4.0,,3.5,0.5,3.0,3.0,4.0,,3.5,4.0,4.0,3.0,,4.0,,4.0,4.0,3.0,4.0,5.0,,,4.0,3.5,2.5,,3.5,,,4.0,4.0,,4.0,4.0,4.0,,4.0,5.0,,3.5,4.0,3.0,,4.0,4.5,,,4.5,,,3.5,5.0,,,4.0,3.0,,3.5,4.0,,,3.5,4.0,3.0,,4.0,,,,,3.0,,5.0,,,4.0,,,3.0,4.0,3.5,2.0,,,4.0,4.0,4.5,4.0,,,3.5,,4.5,3.5,,2.5,,3.5,,3.5,,4.5,,4.5,
116,3.0,2.0,,2.5,,4.5,4.5,5.0,4.5,4.5,3.5,,,4.0,4.0,3.5,,3.0,3.0,3.0,1.5,,1.0,1.5,1.0,,,4.5,4.0,,4.5,3.5,3.5,5.0,,4.0,2.0,,3.0,,2.5,4.0,2.5,4.0,2.5,,0.5,2.0,5.0,,,3.0,,4.5,,1.5,,4.0,,3.5,3.0,2.0,,0.5,3.0,4.5,,5.0,,,,,,,1.5,,4.0,,4.0,3.0,,,3.0,3.0,4.0,2.5,,,,,,3.0,,0.5,1.0,1.5,1.5,,3.0,4.5,3.0,2.5,,,3.0,3.5,,3.0,,2.5,1.5,,3.5,3.0,,3.0,3.0,,,3.5,,,,,,4.5,3.0,,,4.0,,3.0,,3.5,,,,,2.5,,3.5,,,4.0,,,3.5,,,2.5,,4.0,,3.5,3.0,,,,,,5.0,4.5,,5.0,,3.0,4.5,2.5,,,,,
208,4.0,,5.0,,5.0,3.5,4.5,,5.0,4.5,3.0,4.0,3.5,1.0,3.5,,4.0,3.5,4.5,4.5,,0.5,2.5,4.0,4.0,5.0,,3.0,4.0,4.0,4.0,4.0,4.5,4.0,,3.0,1.5,2.0,4.5,,3.0,,1.5,4.0,3.0,4.0,4.0,4.0,4.0,,,3.5,4.0,,2.0,,3.0,3.0,,,4.0,1.0,3.5,2.0,3.0,3.5,5.0,4.0,4.5,2.0,,5.0,4.5,,4.5,,4.0,4.0,,4.0,,,,3.5,,3.0,4.0,4.5,,,3.5,4.0,4.0,,3.0,,,,,4.0,,3.0,4.0,,4.5,4.0,3.5,,4.0,1.5,,,,,2.0,,,,5.0,4.0,,,4.0,4.0,,,3.5,,4.0,,4.5,3.0,4.0,,3.5,,,,,,4.0,,4.0,4.0,,4.5,2.0,3.5,,,4.0,4.0,,4.5,4.5,,,,,,,,,,4.0,,,4.0,4.0,4.0,4.0,,
271,1.5,2.5,,2.0,,3.5,2.5,,3.5,,,3.0,,3.5,2.0,3.0,,3.0,1.5,4.0,2.0,2.0,3.5,,3.5,,,4.0,,,3.0,,,2.5,,,4.0,,3.5,,2.0,3.5,3.0,,3.5,2.5,3.5,,3.0,2.5,,2.5,4.5,,3.0,,,4.0,,3.0,,,,,4.0,,,4.5,,4.0,,,,2.5,,,,4.0,3.0,,,,,,3.0,3.5,3.0,4.0,,,,2.0,4.0,,4.5,2.5,,4.5,4.0,4.0,2.0,3.5,4.0,,0.5,3.0,,2.5,4.0,2.0,,,2.5,4.0,3.0,3.5,3.0,,,2.5,,,4.5,3.5,,3.0,2.5,4.5,,3.5,,3.0,,4.0,,4.0,,,4.5,,2.5,,,,,,3.5,0.5,3.5,1.5,,4.0,3.5,0.5,2.0,3.5,4.0,,,,4.0,3.5,,1.0,,2.5,4.0,,,3.5,,3.0,3.0
294,4.5,4.5,,1.0,,5.0,4.0,,4.5,4.0,,,4.0,4.5,5.0,,3.5,5.0,4.0,4.5,4.0,3.5,4.0,,3.5,2.0,,4.5,4.5,,4.0,4.0,4.0,5.0,,5.0,4.5,4.0,3.0,,4.0,4.0,,2.5,4.5,4.5,2.0,2.0,5.0,4.0,,2.5,,2.0,,3.5,,5.0,,2.0,4.0,2.5,3.0,,,3.5,,5.0,4.0,4.0,,,3.5,3.5,3.0,,3.0,,4.0,3.5,,,,5.0,2.0,,,4.0,,,,4.0,2.0,3.5,2.5,,2.0,,3.5,2.5,4.5,4.0,,,3.0,4.5,,,,3.0,4.0,,3.5,3.5,3.0,,,,,4.0,3.5,,3.5,1.5,,3.5,4.5,2.5,,3.5,4.0,,,4.5,2.5,,,,2.5,,4.5,,,3.5,,,4.0,3.0,4.0,2.0,,3.0,,,3.5,3.5,,,,,4.0,,4.5,5.0,,4.5,4.5,3.0,,,3.0,4.5,2.5
359,5.0,,,3.0,2.0,4.0,5.0,4.0,5.0,5.0,,,,4.5,,4.0,,3.5,4.0,5.0,5.0,4.0,4.0,4.0,3.0,,,5.0,3.5,3.0,4.0,3.0,4.0,5.0,,4.0,4.0,5.0,4.5,,,3.0,2.0,4.0,3.0,4.5,,4.0,5.0,,,,4.0,5.0,,,4.0,3.0,3.0,,4.0,4.0,,,5.0,4.0,,3.5,4.5,4.0,4.0,,,,4.5,,4.0,4.0,4.5,,,,,3.5,4.0,4.5,4.5,5.0,,,,4.0,,,5.0,,,,3.0,3.0,4.0,,5.0,,4.0,5.0,4.0,3.5,,1.0,,4.5,3.0,4.0,4.5,,,4.0,4.0,4.0,4.0,,,,,4.0,3.0,3.5,3.5,2.5,2.5,4.0,3.5,4.5,,,3.5,,2.0,,5.0,,,4.0,,,4.0,4.0,,3.0,4.0,3.5,,3.0,3.5,,5.0,,3.5,4.0,3.5,,3.0,2.5,3.5,2.5,4.5,4.0,4.0,4.0,,,4.0
367,3.0,2.0,,3.0,,3.5,4.0,4.5,4.5,,,3.0,,4.0,3.5,3.0,,4.5,3.0,3.5,3.0,,,,3.0,,,4.0,,,4.5,,4.5,4.5,,4.0,,4.0,,,3.5,3.0,3.0,3.0,4.0,,3.0,1.5,3.5,,2.5,3.0,,3.5,,2.5,4.5,4.5,,3.0,,3.0,1.0,,3.5,3.0,,5.0,,4.0,1.0,,3.0,2.0,,,3.5,,3.5,,,,,3.0,4.5,3.0,3.5,4.5,,3.5,,3.0,3.5,1.5,2.0,,3.0,,3.5,,,3.5,,,,4.5,,3.0,,3.0,,3.0,3.5,,,,,,,4.0,,,,,,3.0,,3.0,,4.0,,2.5,,4.0,,3.0,4.0,,,,4.5,,,,,,4.0,,4.0,3.0,,5.0,,,3.5,,,,,,,,4.0,4.0,,3.5,4.0,,4.0,,4.5,4.5,4.5
388,2.5,1.5,,1.0,,3.5,3.0,3.5,4.5,4.5,,2.0,,3.0,2.0,1.5,,4.5,3.0,4.0,2.0,,1.5,,2.0,4.0,,3.5,4.0,,3.0,4.0,3.5,3.0,,4.0,,,3.5,,3.5,3.5,1.0,,3.0,3.0,,1.0,3.5,,2.0,3.0,,4.0,,,3.5,3.5,,3.5,,1.0,3.0,,3.0,4.0,,4.5,,3.0,2.5,,3.0,3.0,,2.5,3.5,2.5,2.0,,1.0,,2.0,3.5,4.0,3.0,3.0,3.5,,3.5,2.0,3.5,2.5,2.0,2.0,2.0,1.5,,4.0,5.0,2.5,,4.5,,,3.0,,1.5,,3.0,2.0,,3.0,,3.5,3.0,,,,3.0,3.5,,,,,,,,,3.0,4.0,2.0,,5.0,3.0,3.0,,,3.5,,3.0,,,,,,5.0,3.5,3.0,3.5,,,3.5,,,,,,,,,3.5,4.0,4.0,,3.0,,3.5,,,,3.0,3.5
413,4.5,,4.5,,4.5,3.5,5.0,4.5,5.0,5.0,2.0,2.0,,,,,5.0,4.0,,4.5,5.0,,,4.5,,5.0,,3.5,3.0,4.5,4.5,4.5,5.0,4.5,2.5,,3.5,3.0,4.0,2.5,4.0,3.5,,4.5,3.5,4.0,,,4.0,,,2.5,3.0,4.5,,,,4.5,,1.5,5.0,,3.0,,3.0,4.0,,4.5,3.5,3.5,3.0,3.5,,,4.0,,2.5,,3.5,4.0,0.5,,,4.5,,3.5,4.5,4.5,,,,1.0,,,,,,4.0,2.5,4.0,4.0,,5.0,,3.5,4.5,,,4.0,,3.5,,2.0,,4.5,4.0,,,3.5,4.0,5.0,,,3.0,,3.5,4.0,,4.5,3.0,4.5,3.0,,5.0,4.0,4.0,4.5,,3.5,4.5,3.5,,3.0,4.5,,,5.0,,4.0,4.0,3.5,4.0,3.0,4.5,4.5,4.0,5.0,,4.0,,4.0,3.5,3.0,4.5,2.5,2.5,3.5,4.0,4.0,,,3.5,
462,3.0,3.5,4.0,3.0,,4.0,4.5,4.5,,4.5,,,4.0,4.0,4.0,3.5,3.0,,4.0,4.0,,,4.0,4.0,4.0,,,,,,4.5,2.0,,4.5,,,3.0,,2.5,,,4.0,3.0,4.0,3.5,,4.0,3.5,2.5,,,4.5,,,,3.5,3.5,4.5,,3.5,3.0,3.0,2.5,,4.5,5.0,,4.0,,4.0,5.0,,,,,2.5,3.5,,4.0,,,3.5,,5.0,,,,2.5,,,3.5,4.5,2.5,,3.0,4.0,,,,5.0,3.5,4.0,3.5,,3.5,5.0,,,,,,,,,4.0,,,,,5.0,4.0,,4.5,,,4.0,3.5,4.0,,4.0,5.0,,,5.0,4.5,5.0,4.0,,,4.5,5.0,,,4.5,,,4.0,4.0,,,,,,,4.0,4.0,4.0,,,,,3.5,4.0,4.0,3.5,,3.5,,,,,4.0,


In [22]:

# Flip the pivot so that each user becomes a column and each movie a row.
corr_df = movies_watched_df_pivot.T


In [23]:

# Pearson correlation of every user's column with the target user's column.
target_ratings = corr_df[user_id]
corr_df2 = corr_df.corrwith(target_ratings, method="pearson")


In [24]:
# First ten correlation values, indexed by userId.
corr_df2.head(10)

userId
91     0.351651
116    0.451018
208    0.167091
271    0.152646
294    0.218343
359    0.140954
367    0.407469
388    0.468593
413    0.182098
462    0.331276
dtype: float64

In [25]:
# Rank users from most to least correlated with the target user.
# No drop_duplicates() here: on a Series it de-duplicates by *value*, so it
# would silently discard distinct users that happen to share the same
# correlation coefficient. It could even discard the target user's own row
# when another user also correlates at 1.0. Every userId appears once already.
corr_df3 = corr_df2.sort_values(ascending=False)
corr_df3.shape

(4598,)

In [26]:
# Ten highest correlations (the series was sorted in descending order).
corr_df3.head(10)

userId
4655      1.000000
115226    0.677524
26831     0.633033
112520    0.608344
87537     0.606693
74403     0.605665
33308     0.602234
137993    0.596596
19477     0.593769
24295     0.593594
dtype: float64

In [27]:
# Wrap the correlation series in a one-column frame (the column is labelled 0
# because the series has no name).
corr_df4 = corr_df3.to_frame()
corr_df4.iloc[:10]

Unnamed: 0_level_0,0
userId,Unnamed: 1_level_1
4655,1.0
115226,0.677524
26831,0.633033
112520,0.608344
87537,0.606693
74403,0.605665
33308,0.602234
137993,0.596596
19477,0.593769
24295,0.593594


In [29]:
# Give the correlation column a readable label.
corr_df4 = corr_df4.rename(columns={0: "corr"})
corr_df4.head(10)

Unnamed: 0_level_0,corr
userId,Unnamed: 1_level_1
4655,1.0
115226,0.677524
26831,0.633033
112520,0.608344
87537,0.606693
74403,0.605665
33308,0.602234
137993,0.596596
19477,0.593769
24295,0.593594


In [36]:
# Pull off users that are highly correlated in ratings.
# First remove the target user's own row, then keep the users whose
# correlation with the target user is at least 0.40.
corr_df5 = corr_df4.drop(index=user_id)
is_highly_correlated = corr_df5["corr"] >= 0.40
top_users = corr_df5[is_highly_correlated]
top_users.head(10)

Unnamed: 0_level_0,corr
userId,Unnamed: 1_level_1
115226,0.677524
26831,0.633033
112520,0.608344
87537,0.606693
74403,0.605665
33308,0.602234
137993,0.596596
19477,0.593769
24295,0.593594
40798,0.592647


In [37]:
# Number of highly correlated users. len() counts rows; .size counts cells
# (rows x columns) and only matches while the frame has a single column.
len(top_users)

669

In [38]:
# Move the userId index into a regular column. drop=False keeps the old index
# as a column; drop=True would discard it.
# The guard keeps the cell idempotent: running reset_index a second time would
# add a spurious "index" column. Reassigning (rather than inplace=True) avoids
# mutating a frame that was produced by filtering another frame.
if "userId" not in top_users.columns:
    top_users = top_users.reset_index(drop=False)
top_users.head(10)

Unnamed: 0,userId,corr
0,115226,0.677524
1,26831,0.633033
2,112520,0.608344
3,87537,0.606693
4,74403,0.605665
5,33308,0.602234
6,137993,0.596596
7,19477,0.593769
8,24295,0.593594
9,40798,0.592647


In [39]:
# Keep only the ratings given by the highly correlated users.
from_top_user = similar_users_df["userId"].isin(top_users["userId"])
similar_users_df2 = similar_users_df[from_top_user]
similar_users_df2.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
13174,116,1,3.0,2005-11-23 02:06:57
13175,116,2,2.0,2005-11-23 06:41:08
13176,116,3,2.0,2005-11-23 06:40:58
13177,116,6,1.5,2005-11-23 16:03:02
13178,116,8,1.0,2005-11-24 00:22:10
13179,116,9,1.5,2005-11-23 20:29:11
13180,116,10,2.0,2005-11-23 16:00:40
13181,116,11,2.0,2005-11-23 16:03:35
13182,116,12,0.5,2005-11-23 23:44:19
13183,116,15,0.5,2005-11-24 03:58:08


In [40]:
# (rows, columns) of the filtered ratings frame.
similar_users_df2.shape

(530366, 4)

In [41]:
# Save the filtered ratings to disk. The file is written without a header row
# and without the index, so it contains data rows only.
similar_users_df2.to_csv("rating_corr.csv", index=False, header=False)
