In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Data Pre-Processing**

In [2]:
import pandas as pd
import numpy as np 
import warnings

In [3]:
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [4]:
Titles = pd.read_csv('/content/drive/My Drive/Movie Recommender/Netflix Prize data/movie_titles.csv', 
                           encoding = 'ISO-8859-1', header = None, 
                           names = ['Id', 'Year', 'Name']).set_index('Id')

Titles=Titles[0:4500]

In [5]:
print('Null Values in Year Column (if any):')
Titles[Titles['Year'].isnull()]

Null Values in Year Column (if any):


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
4388,,Ancient Civilizations: Rome and Pompeii


In [6]:
Titles.loc[4388,'Year']=2001

In [7]:
print('Null Values in Name Column (if any):')
Titles[Titles['Name'].isnull()]

Null Values in Name Column (if any):


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [8]:
Titles['Year'] = Titles['Year'].astype(int)

In [9]:
print('Shape of Titles Dataset: ', Titles.shape)
Titles.head()

Shape of Titles Dataset:  (4500, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003,Dinosaur Planet
2,2004,Isle of Man TT 2004 Review
3,1997,Character
4,1994,Paula Abdul's Get Up & Dance
5,2004,The Rise and Fall of ECW


In [10]:
df_raw = pd.read_csv('/content/drive/My Drive/Movie Recommender/Netflix Prize data/combined_data_1.txt',
                      header=None, names=['User', 'Rating'], usecols=[0, 1])

In [11]:
df_raw['Movie']=0

In [12]:
print('Shape of Netflix Ratings Raw Data: ',df_raw.shape)
df_raw.head()

Shape of Netflix Ratings Raw Data:  (24058263, 3)


Unnamed: 0,User,Rating,Movie
0,1:,,0
1,1488844,3.0,0
2,822109,5.0,0
3,885013,4.0,0
4,30878,4.0,0


In [13]:
movie_info = df_raw[df_raw['Rating'].isna()]['User'].reset_index()

In [14]:
print('Number of Movies in the Dataset: ', len(movie_info))
movie_info.head()

Number of Movies in the Dataset:  4499


Unnamed: 0,index,User
0,0,1:
1,548,2:
2,694,3:
3,2707,4:
4,2850,5:


In [15]:
for i in range(len(movie_info)):
  if i<len(movie_info)-1:
    df_raw['Movie'][(movie_info['index'][i]):movie_info['index'][i+1]]=i+1
  else:
    df_raw['Movie'][(movie_info['index'][i]):len(df_raw)]=i+1

In [16]:
df_raw=df_raw.drop(list(movie_info['index']),axis=0)

In [17]:
df_raw.reset_index(drop=True,inplace=True)
df_raw

Unnamed: 0,User,Rating,Movie
0,1488844,3.0,1
1,822109,5.0,1
2,885013,4.0,1
3,30878,4.0,1
4,823519,3.0,1
...,...,...,...
24053759,2591364,2.0,4499
24053760,1791000,2.0,4499
24053761,512536,5.0,4499
24053762,988963,3.0,4499


In [18]:
print('Shape of User-Ratings Table: ',df_raw.shape)

Shape of User-Ratings Table:  (24053764, 3)


In [19]:
min_movie_ratings = 1500

filter_movies = (df_raw['Movie'].value_counts()>min_movie_ratings)
filter_movies = filter_movies[filter_movies].index.tolist()

filter_movies[:10]

[1905, 2152, 3860, 4432, 571, 3938, 4306, 2452, 1962, 3962]

In [20]:
min_user_ratings = 325

filter_users = (df_raw['User'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

filter_users[:5]

['305344', '387418', '2439493', '1664010', '2118461']

In [21]:
df_filtered = df_raw[(df_raw['Movie'].isin(filter_movies)) & (df_raw['User'].isin(filter_users))]

df_filtered.reset_index(drop=True,inplace=True)

In [22]:
print('Shape of Unfiltered User-Ratings Table: ',df_raw.shape)
print('Shape of Filtered User-Ratings Table: ',df_filtered.shape)

Shape of Unfiltered User-Ratings Table:  (24053764, 3)
Shape of Filtered User-Ratings Table:  (2170509, 3)


In [23]:
print("Number of Unique Movies: ",df_filtered['Movie'].nunique())
print("Number of Unique Users: ",df_filtered['User'].nunique())

Number of Unique Movies:  1466
Number of Unique Users:  5492


In [24]:
df_filtered

Unnamed: 0,User,Rating,Movie
0,712664,5.0,3
1,603277,3.0,3
2,1650301,2.0,3
3,2312349,4.0,3
4,1977959,4.0,3
...,...,...,...
2170504,1388741,4.0,4496
2170505,1704416,4.0,4496
2170506,15737,4.0,4496
2170507,823628,5.0,4496


In [25]:
def show_all_movies():
  
  movielist=df_filtered.Movie.unique()
  
  for i in movielist:
    print('MovieId:',i,'-- Name:',Titles.loc[i,'Name'])

In [26]:
show_all_movies()

MovieId: 3 -- Name: Character
MovieId: 8 -- Name: What the #$*! Do We Know!?
MovieId: 16 -- Name: Screamers
MovieId: 17 -- Name: 7 Seconds
MovieId: 18 -- Name: Immortal Beloved
MovieId: 26 -- Name: Never Die Alone
MovieId: 28 -- Name: Lilo and Stitch
MovieId: 30 -- Name: Something's Gotta Give
MovieId: 32 -- Name: ABC Primetime: Mel Gibson's The Passion of the Christ
MovieId: 33 -- Name: Aqua Teen Hunger Force: Vol. 1
MovieId: 44 -- Name: Spitfire Grill
MovieId: 45 -- Name: The Love Letter
MovieId: 46 -- Name: Rudolph the Red-Nosed Reindeer
MovieId: 47 -- Name: The Bad and the Beautiful
MovieId: 48 -- Name: Justice League
MovieId: 52 -- Name: The Weather Underground
MovieId: 55 -- Name: Jade
MovieId: 56 -- Name: Carandiru
MovieId: 57 -- Name: Richard III
MovieId: 58 -- Name: Dragonheart
MovieId: 68 -- Name: Invader Zim
MovieId: 71 -- Name: Maya Lin: A Strong Clear Vision
MovieId: 76 -- Name: I Love Lucy: Season 2
MovieId: 77 -- Name: Congo
MovieId: 78 -- Name: Jingle All the Way
MovieI

In [39]:
mfk=[(3,273),(5,477),(4,479),(5,1180),(5,1865),(4,1877),(2,2452),(5,2743),
     (5,2782),(5,2862),(5,2942),(5,3290),(2,3925),(5,571),(5,3864),(4,722),(5,3124)]

In [40]:
mgc = [(5,2942),(5,1877),(5,3124),(5,3456),(5,3579),(4,4393),(4,1962),(4,3082),(4,1046),
       (3,1955),(3,1757),(3,825),(3,4149),(3,3938),(2,2452),(2,4216),(2,3949),(2,631),
       (2,118),(2,191),(2,48),(1,4402),(1,4384),(1,4284),(1,2376),(1,3740),(1,3713),
       (1,3225),(1,2699),(1,2395),(1,1832),(1,361),(1,401)]

In [48]:
mcy = [(1,3925 ),(5,273),(5,118),(5,607),(4,658),(5,1073),(1,2102),(5,2104),(5,1877),(2,3949), 
     (5,2942),(5,3290),(5,3376),(3,3864),(4,2782),(5,3124),(5,1877),(5,1865),(5,1180),(5,2743)]

In [49]:
def add_user(name,data):
  
  df = pd.DataFrame(columns=['User', 'Rating', 'Movie'])

  for i in data:
    df = df.append({'User': name, 'Rating': float(i[0]), 'Movie': i[1]},ignore_index=True)

  new_df=pd.concat([df_filtered, df], ignore_index=True)

  return new_df

In [50]:
df_filtered=add_user('Furkan',mfk)
df_filtered[-20:]

Unnamed: 0,User,Rating,Movie
2170703,Cihan,5.0,1865
2170704,Cihan,5.0,1180
2170705,Cihan,5.0,2743
2170706,Furkan,3.0,273
2170707,Furkan,5.0,477
2170708,Furkan,4.0,479
2170709,Furkan,5.0,1180
2170710,Furkan,5.0,1865
2170711,Furkan,4.0,1877
2170712,Furkan,2.0,2452


In [51]:
df_filtered=add_user('Gizem',mgc)
df_filtered[-20:]

Unnamed: 0,User,Rating,Movie
2170736,Gizem,3.0,3938
2170737,Gizem,2.0,2452
2170738,Gizem,2.0,4216
2170739,Gizem,2.0,3949
2170740,Gizem,2.0,631
2170741,Gizem,2.0,118
2170742,Gizem,2.0,191
2170743,Gizem,2.0,48
2170744,Gizem,1.0,4402
2170745,Gizem,1.0,4384


In [52]:
df_filtered=add_user('Cihan',mcy)
df_filtered[-20:]

Unnamed: 0,User,Rating,Movie
2170756,Cihan,1.0,3925
2170757,Cihan,5.0,273
2170758,Cihan,5.0,118
2170759,Cihan,5.0,607
2170760,Cihan,4.0,658
2170761,Cihan,5.0,1073
2170762,Cihan,1.0,2102
2170763,Cihan,5.0,2104
2170764,Cihan,5.0,1877
2170765,Cihan,2.0,3949


In [53]:
def show_rated_movies(name):
  
  df = pd.DataFrame(columns=['Movie Name', 'Rating'])
  info=df_filtered[df_filtered['User']==name].reset_index(drop=True)
  print('User Name:',name)

  for i in range(len(info)):
    df=df.append({'Movie Name':Titles.loc[info.loc[i,'Movie'],'Name'],'Rating':int(info.loc[i,'Rating'])},ignore_index=True)

  return df

In [54]:
show_rated_movies('Furkan')

User Name: Furkan


Unnamed: 0,Movie Name,Rating
0,Taxi,3
1,George Carlin: Personal Favorites,5
2,George Carlin: What Am I Doing in New Jersey?,4
3,A Beautiful Mind,5
4,Eternal Sunshine of the Spotless Mind,5
5,Friends: Season 2,4
6,Lord of the Rings: The Fellowship of the Ring,2
7,The Pianist,5
8,Braveheart,5
9,The Silence of the Lambs,5


In [55]:
show_rated_movies('Gizem')

User Name: Gizem


Unnamed: 0,Movie Name,Rating
0,Friends: Season 6,5
1,Friends: Season 2,5
2,Titanic,5
3,Lost: Season 1,5
4,2 Fast 2 Furious,5
...,...,...
127,The Missing,1
128,Scream,1
129,Rosemary's Baby,1
130,The Phantom of the Opera: Special Edition,1


In [56]:
show_rated_movies('Cihan')

User Name: Cihan


Unnamed: 0,Movie Name,Rating
0,The Matrix: Reloaded,1
1,Taxi,5
2,Rambo: First Blood Part II,5
3,Speed,5
4,Robin Hood: Prince of Thieves,4
...,...,...
79,Titanic,5
80,Friends: Season 2,5
81,Eternal Sunshine of the Spotless Mind,5
82,A Beautiful Mind,5


In [57]:
show_rated_movies('93124')[0:15]

User Name: 93124


Unnamed: 0,Movie Name,Rating
0,Fame,2
1,Reservoir Dogs,4
2,Missing in Action 2: The Beginning / Missing i...,2
3,X2: X-Men United,4
4,The Deer Hunter,4
5,North by Northwest,3
6,The Devil's Own,2
7,Bridget Jones's Diary,3
8,Ed Wood,3
9,High Fidelity,4


## **SVD Model (Singular Value Decomposition)**

In [58]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 5.8MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1670907 sha256=4b562f35939ee9e300a2efe742f15eafaff7d9fc35c3387eb728e5199fc357ac
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [59]:
from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [60]:
reader = Reader(rating_scale=(1, 5))

In [None]:
#subdata = Dataset.load_from_df(df_filtered[['User', 'Movie', 'Rating']].sample(500000), reader)

In [None]:
#param_grid = {'n_epochs': [20, 40], 'lr_all': [0.001, 0.005, 0.01],
              #'reg_all': [0.01, 0.02, 0.05],'n_factors':[50,100]}

In [None]:
#gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1)

#gs.fit(subdata)

In [None]:
#print("Best RMSE Socre:",gs.best_score['rmse'])
#print("Best Parameters:",gs.best_params['rmse'])

In [61]:
data = Dataset.load_from_df(df_filtered[['User', 'Movie', 'Rating']], reader)

trainset, testset = train_test_split(data, test_size=.20)

In [62]:
algo=SVD(n_factors=100, n_epochs=30, biased=True,
         lr_all=0.005, reg_all=0.035)

In [63]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb025e2b780>

In [64]:
pred_testset = algo.test(testset)
print(accuracy.rmse(pred_testset))

RMSE: 0.8096
0.8095954792845049


In [65]:
pred_testset[:5]

[Prediction(uid='1927703', iid=438, r_ui=1.0, est=2.234047311961157, details={'was_impossible': False}),
 Prediction(uid='2234535', iid=4136, r_ui=2.0, est=2.2057686786681634, details={'was_impossible': False}),
 Prediction(uid='750677', iid=1087, r_ui=1.0, est=2.009542896959462, details={'was_impossible': False}),
 Prediction(uid='1314869', iid=3102, r_ui=4.0, est=3.6628483045347524, details={'was_impossible': False}),
 Prediction(uid='943256', iid=2851, r_ui=4.0, est=2.848740654550715, details={'was_impossible': False})]

## **Movie Recommendations**


In [66]:
from collections import defaultdict

In [67]:
data = Dataset.load_from_df(df_filtered[['User', 'Movie', 'Rating']], reader)

In [68]:
all_data = data.build_full_trainset()

data_toPredict = all_data.build_anti_testset()

In [69]:
predictions = algo.test(data_toPredict)

In [70]:
predictions[:5]

[Prediction(uid='712664', iid=8, r_ui=3.391228758748024, est=3.224241309594196, details={'was_impossible': False}),
 Prediction(uid='712664', iid=16, r_ui=3.391228758748024, est=3.106865512742301, details={'was_impossible': False}),
 Prediction(uid='712664', iid=17, r_ui=3.391228758748024, est=2.477379846524379, details={'was_impossible': False}),
 Prediction(uid='712664', iid=28, r_ui=3.391228758748024, est=3.4454537478409564, details={'was_impossible': False}),
 Prediction(uid='712664', iid=30, r_ui=3.391228758748024, est=2.97709381826657, details={'was_impossible': False})]

In [71]:
def top_movies(predictions, n=10):

    recommended = defaultdict(list)

    for userid, movieid, _ , estimation, _ in predictions:
        recommended[userid].append((movieid, estimation))

    for userid, user_ratings in recommended.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        recommended[userid] = user_ratings[:n]

    return recommended

In [72]:
recommended = top_movies(predictions, n=10)

In [73]:
recommended['Furkan']

[(1476, 5),
 (3456, 5),
 (3928, 5),
 (4353, 4.979168956109944),
 (1409, 4.8629671025717),
 (1395, 4.807621964518478),
 (199, 4.788616936493417),
 (1915, 4.778592463694775),
 (3888, 4.758923235360246),
 (4306, 4.734359938908458)]

In [74]:
[movieid for (movieid, estimation) in recommended['93124']]

[3456, 2568, 3446, 3290, 1994, 907, 3965, 4383, 1357, 2102]

In [75]:
def show_all_recommendations():
  for userid, user_ratings in recommended.items():
      print('User:', userid, '-- Recommended Movies:', [movieid for (movieid, estimation) in user_ratings])


In [76]:
show_all_recommendations()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
User: 1704248 -- Recommended Movies: [2057, 3456, 4098, 359, 3078, 3769, 2942, 270, 2585, 2114]
User: 2523912 -- Recommended Movies: [1915, 32, 3456, 4427, 3928, 2927, 2162, 2603, 2300, 2942]
User: 1279012 -- Recommended Movies: [2102, 3456, 2172, 4427, 2195, 634, 4115, 3444, 1495, 1020]
User: 383451 -- Recommended Movies: [3456, 1476, 4427, 2405, 2102, 3928, 4238, 3769, 2019, 1947]
User: 1479892 -- Recommended Movies: [3456, 2162, 3523, 2040, 463, 4207, 2568, 1395, 752, 2172]
User: 199435 -- Recommended Movies: [477, 4115, 889, 3354, 2456, 1300, 3348, 4011, 1476, 1770]
User: 2316337 -- Recommended Movies: [1915, 1409, 1222, 3456, 1800, 2754, 2405, 2548, 3521, 1790]
User: 1123175 -- Recommended Movies: [3456, 985, 2162, 2463, 3949, 4207, 385, 551, 32, 2927]
User: 2274452 -- Recommended Movies: [1642, 1291, 3456, 4207, 1856, 3949, 463, 2102, 504, 2129]
User: 2147714 -- Recommended Movies: [4427, 2102, 722, 3662, 907, 634, 

In [77]:
def Recommend_Movies(User):
  
  print('**Recommended Movies for User ' + User + ':**')
  print()
  order=1

  for movie_est in recommended[User]:
    print(str(order)+')',Titles.loc[movie_est[0],'Name'], '(' + str(Titles.loc[movie_est[0],'Year']) + ')')
    order+=1


In [78]:
Recommend_Movies('93124')

**Recommended Movies for User 93124:**

1) Lost: Season 1 (2004)
2) Stargate SG-1: Season 8 (2004)
3) Spirited Away (2002)
4) The Godfather (1974)
5) The L Word: Season 2 (2005)
6) Animal Crackers (1930)
7) Red Dwarf: Series 5 (1992)
8) Farscape: The Peacekeeper Wars (2004)
9) Stargate SG-1: Season 7 (2003)
10) The Simpsons: Season 6 (1994)


In [79]:
Recommend_Movies('Furkan')

**Recommended Movies for User Furkan:**

1) Six Feet Under: Season 4 (2004)
2) Lost: Season 1 (2004)
3) Nip/Tuck: Season 2 (2004)
4) Curb Your Enthusiasm: Season 3 (2002)
5) The O.C.: Season 1 (2003)
6) Charade (1963)
7) The Deer Hunter (1978)
8) Law & Order: Special Victims Unit: The Second Year (2000)
9) NYPD Blue: Season 2 (1994)
10) The Sixth Sense (1999)


In [80]:
Recommend_Movies('Gizem')

**Recommended Movies for User Gizem:**

1) The Best of Friends: Vol. 4 (1994)
2) The Best of Friends: Season 1 (1994)
3) The Best of Friends: Season 2 (1994)
4) Sex and the City: Season 4 (2001)
5) The O.C.: Season 1 (2003)
6) Everybody Loves Raymond: Season 1 (1996)
7) Alias: Season 4 (2005)
8) The American President (1995)
9) Gilmore Girls: Season 1 (2000)
10) Sleepless in Seattle (1993)


In [81]:
Recommend_Movies('Cihan')

**Recommended Movies for User Cihan:**

1) The Ghost and Mrs. Muir (1947)
2) Charade (1963)
3) The O.C.: Season 1 (2003)
4) I Want to Live! (1958)
5) Gilmore Girls: Season 3 (2002)
6) Fly Away Home (1996)
7) On Golden Pond (1981)
8) Little House on the Prairie: Season 2 (1975)
9) Little House on the Prairie: Season 3 (1976)
10) The Golden Girls: Season 1 (1985)
