In [92]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE
from scipy.sparse import csr_matrix

# Load the dataset

In [93]:
ratings = pd.read_csv("archive\\rating.csv",nrows=10000)
movies=pd.read_csv("archive\\movie.csv",nrows=10000)

# Merging dataset movies and ratings on movieId
df=pd.merge(movies,ratings,on='movieId')
print(df.head())

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating            timestamp  
0       3     4.0  1999-12-11 13:36:47  
1       6     5.0  1997-03-13 17:50:52  
2       8     4.0  1996-06-05 13:37:51  
3      10     4.0  1999-11-25 02:44:47  
4      11     4.5  2009-01-02 01:13:41  


# Exploratory Data Analysis

In [94]:
df.shape

(9273, 6)

In [95]:
df.size

55638

In [96]:
df.columns

Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9273 entries, 0 to 9272
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    9273 non-null   int64  
 1   title      9273 non-null   object 
 2   genres     9273 non-null   object 
 3   userId     9273 non-null   int64  
 4   rating     9273 non-null   float64
 5   timestamp  9273 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 434.8+ KB


In [98]:
df.describe()

Unnamed: 0,movieId,userId,rating
count,9273.0,9273.0,9273.0
mean,2250.879974,45.311981,3.632643
std,3225.147611,26.470152,1.041386
min,1.0,1.0,0.5
25%,549.0,23.0,3.0
50%,1393.0,50.0,4.0
75%,2944.0,65.0,4.5
max,32587.0,91.0,5.0


In [99]:
df.isnull().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [100]:
df.isna().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [101]:
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41
...,...,...,...,...,...,...
9268,32031,Robots (2005),Adventure|Animation|Children|Comedy|Fantasy|Sc...,31,4.0,2015-02-23 23:54:40
9269,32213,Cube Zero (2004),Horror|Mystery|Sci-Fi|Thriller,11,4.5,2009-01-01 05:33:35
9270,32587,Sin City (2005),Action|Crime|Film-Noir|Mystery|Thriller,35,5.0,2008-03-28 17:10:17
9271,32587,Sin City (2005),Action|Crime|Film-Noir|Mystery|Thriller,43,3.5,2009-04-09 20:29:22


# Creating user movie matrix for Clustering

In [102]:
user_movie_matrix=df.pivot_table(index='userId',columns='title',values='rating').fillna(0)
print(user_movie_matrix)
ss=StandardScaler()
scaled_user_movie_matrix=ss.fit_transform(user_movie_matrix)

title   'burbs, The (1989)  'night Mother (1986)  \
userId                                             
1                      0.0                   0.0   
2                      0.0                   0.0   
3                      0.0                   0.0   
4                      0.0                   0.0   
5                      0.0                   0.0   
...                    ...                   ...   
87                     0.0                   0.0   
88                     0.0                   0.0   
89                     0.0                   0.0   
90                     0.0                   0.0   
91                     0.0                   0.0   

title   *batteries not included (1987)  ...And Justice for All (1979)  \
userId                                                                  
1                                  0.0                            0.0   
2                                  0.0                            0.0   
3                              

# Kmeans Clustering

In [103]:
kmeans=KMeans(n_clusters=5,random_state=42,n_init=10)
user_clusters=kmeans.fit_predict(scaled_user_movie_matrix)
print(user_clusters)
user_movie_matrix['cluster']=user_clusters
df=df.merge(user_movie_matrix['cluster'],on='userId')
print(df)



[0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 4 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 3 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4]
      movieId             title  \
0           1  Toy Story (1995)   
1           1  Toy Story (1995)   
2           1  Toy Story (1995)   
3           1  Toy Story (1995)   
4           1  Toy Story (1995)   
...       ...               ...   
9268    32031     Robots (2005)   
9269    32213  Cube Zero (2004)   
9270    32587   Sin City (2005)   
9271    32587   Sin City (2005)   
9272    32587   Sin City (2005)   

                                                 genres  userId  rating  \
0           Adventure|Animation|Children|Comedy|Fantasy       3     4.0   
1           Adventure|Animation|Children|Comedy|Fantasy       6     5.0   
2           Adventure|Animation|Children|Comedy|Fantasy       8     4.0   
3           Adventure|Animation|Children|Comedy|Fantasy      10     4.0   
4           Advent

# Creating Like label

In [104]:
df['liked']=(df['rating']>=4).astype(int)

# Feature Engineering

# Classification model (RandomForestClassifier)

In [105]:
print("Classification report:\n\n",classification_report(y_))

NameError: name 'y_' is not defined