In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE
from scipy.sparse import csr_matrix

# Load the dataset

In [3]:
ratings = pd.read_csv("archive\\rating.csv")
movies=pd.read_csv("archive\\movie.csv")

# Merging dataset movies and ratings on movieId
df=pd.merge(movies,ratings,on='movieId')
print(df.head())

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating            timestamp  
0       3     4.0  1999-12-11 13:36:47  
1       6     5.0  1997-03-13 17:50:52  
2       8     4.0  1996-06-05 13:37:51  
3      10     4.0  1999-11-25 02:44:47  
4      11     4.5  2009-01-02 01:13:41  


# Exploratory Data Analysis

In [4]:
df.shape

(20000263, 6)

In [5]:
df.size

120001578

In [6]:
df.columns

Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     int64  
 4   rating     float64
 5   timestamp  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 915.5+ MB


In [8]:
df.describe()

Unnamed: 0,movieId,userId,rating
count,20000260.0,20000260.0,20000260.0
mean,9041.567,69045.87,3.525529
std,19789.48,40038.63,1.051989
min,1.0,1.0,0.5
25%,902.0,34395.0,3.0
50%,2167.0,69141.0,3.5
75%,4770.0,103637.0,4.0
max,131262.0,138493.0,5.0


In [9]:
df.isnull().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [10]:
df.isna().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

# Creating user movie matrix for Clustering

In [16]:
top_movies=df['title'].value_counts().nlargest(1200).index
df=df[df['title'].isin(top_movies)]

user_movie_matrix=df.pivot_table(index='userId',columns='title',values='rating').fillna(0)
print(user_movie_matrix)
ss=StandardScaler()
scaled_user_movie_matrix=ss.fit_transform(user_movie_matrix)

title   10 Things I Hate About You (1999)  12 Angry Men (1957)  \
userId                                                           
1                                     0.0                  0.0   
2                                     0.0                  0.0   
3                                     0.0                  0.0   
4                                     0.0                  0.0   
5                                     0.0                  0.0   
...                                   ...                  ...   
138489                                0.0                  4.5   
138490                                0.0                  0.0   
138491                                0.0                  0.0   
138492                                0.0                  0.0   
138493                                0.0                  4.0   

title   2001: A Space Odyssey (1968)  28 Days Later (2002)  300 (2007)  \
userId                                                             

# Kmeans Clustering

In [21]:
kmeans=KMeans(n_clusters=5,random_state=42,n_init=10)
user_clusters=kmeans.fit_predict(scaled_user_movie_matrix)
print(user_clusters)
user_movie_matrix['cluster']=user_clusters
df=df.merge(user_movie_matrix['cluster'],on='userId')
print(df.head())

[0 1 4 ... 1 1 0]
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating            timestamp  cluster_x  cluster_y  liked  cluster  
0       3     4.0  1999-12-11 13:36:47          4          4      1        4  
1       6     5.0  1997-03-13 17:50:52          1          1      1        1  
2       8     4.0  1996-06-05 13:37:51          3          3      1        3  
3      10     4.0  1999-11-25 02:44:47          1          1      1        1  
4      11     4.5  2009-01-02 01:13:41          0          0      1        0  


# Creating Like label

In [19]:
df['liked']=(df['rating']>=4).astype(int)

# Feature Engineering

In [25]:
X=pd.get_dummies(df[['cluster','genres']],drop_first=True)
y=df['liked']

print(X)

         cluster  genres_Action|Adventure|Animation|Children|Comedy  \
0              4                                              False   
1              1                                              False   
2              3                                              False   
3              1                                              False   
4              0                                              False   
...          ...                                                ...   
9416379        2                                              False   
9416380        1                                              False   
9416381        2                                              False   
9416382        0                                              False   
9416383        0                                              False   

         genres_Action|Adventure|Animation|Drama|Fantasy  \
0                                                  False   
1                          

# Classification model (RandomForestClassifier)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rfc=RandomForestClassifier(random_state=42)
rfc.fit(X_train,y_train)
y_pred=rfc.predict(X_test)

MemoryError: Unable to allocate 5.81 GiB for an array with shape (207, 7533107) and data type float32

In [None]:
print("Classification report:\n\n",classification_report(y_))