Dataset: MovieLens 1M — https://grouplens.org/datasets/movielens/1m/


In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

## Import Datasets

In [None]:
# Load the movies table: "::"-separated, Latin-1 encoded, no header row.
# engine='python' is used because the separator is longer than one character.
dfMovies = pd.read_csv(
    'https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/movies.dat',
    sep="::",
    names=["MovieID", "Title", "Genres"],
    engine='python',
    encoding='ISO-8859-1',
)
dfMovies.head()

In [None]:
# Load the ratings table: one row per (user, movie) rating, no header row.
dfRatings = pd.read_csv(
    "https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/ratings.dat",
    sep="::",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    engine='python',
)
dfRatings.head()

In [None]:
# Load the users table (demographics per UserID), no header row.
dfUsers = pd.read_csv(
    "https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/users.dat",
    sep="::",
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
    engine='python',
)
dfUsers.head()

In [None]:
dfMovies.shape

In [None]:
dfUsers.shape

In [None]:
dfRatings.shape

## Create a new dataset [Master_Data]

In [None]:
# Inner-join ratings to movies on MovieID, then to users on UserID.
# These are the only columns the frames share, so naming them explicitly
# gives the same result as the implicit merge keys.
dfMovieRatings = (
    dfRatings
    .merge(dfMovies, on="MovieID")
    .merge(dfUsers, on="UserID")
)
dfMovieRatings.head()


In [None]:
# Sanity check: compare the merged row count with dfRatings.shape to confirm
# that merging neither dropped nor duplicated ratings
dfMovieRatings.shape

In [None]:
# The master table is the same ratings/movies/users join already held in
# dfMovieRatings. Copy it instead of repeating the merge; the copy keeps the
# two frames independent, since dfMaster is modified by later cells.
dfMaster = dfMovieRatings.copy()
dfMaster.head()

In [None]:
# index=False: the row index carries no information, and writing it would add
# an extra unnamed column when the file is read back with pd.read_csv
dfMaster.to_csv("Master.csv", index=False)

## Explore the datasets using visual representations

### User Age Distribution

In [None]:
# Users in each age group.
# dfMaster holds one row per rating, so keep one row per UserID first;
# otherwise the counts are ratings per age group, not users.
dfMaster.drop_duplicates('UserID')['Age'].value_counts()

In [None]:
# Plot the number of users in each age group.
# One row per UserID is kept so the bars count users (as the axis label says)
# rather than ratings; sort_index() orders the bars by age code.
dfMaster.drop_duplicates('UserID')['Age'].value_counts().sort_index().plot(kind='bar')
plt.xlabel("Age")
plt.title("User Age Distribution")
plt.ylabel('Users Count')
plt.show()

### User rating of the movie “Toy Story”

In [None]:
# Ratings for every title that contains the substring "Toy Story"
# (na=False treats a missing title as "no match")
toystoryRating = dfMaster.loc[dfMaster['Title'].str.contains('Toy Story', na=False)]
toystoryRating

In [None]:
toystoryRating.groupby(["Title","Rating"]).size()

In [None]:
toystoryRating.groupby(["Title","Rating"]).size().unstack().plot(kind='barh',stacked=False,legend=True)
plt.show()

### Top 25 movies by viewership rating

In [None]:
# Number of ratings per title, largest first, limited to the top 25
dfTop25 = (
    dfMaster
    .groupby('Title')
    .size()
    .sort_values(ascending=False)
    .iloc[:25]
)
dfTop25

In [None]:
dfTop25.plot(kind='barh',alpha=0.6,figsize=(7,7))
plt.xlabel("Viewership Ratings Count")
plt.ylabel("Movies (Top 25)")
plt.title("Top 25 movies by viewership rating")
plt.show()


In [None]:
# Drop rows with missing values.
# A column-wise dropna afterwards would be a no-op (no NaN can remain once the
# incomplete rows are gone), so it is omitted. Reassigning instead of using
# inplace=True keeps the cell safe to re-run.
dfMaster = dfMaster.dropna(axis=0)

### Find the ratings for all the movies reviewed by a particular user (user id = 2696)

In [None]:
# Select every rating submitted by the chosen user
userId = 2696
userRatingById = dfMaster.loc[dfMaster["UserID"].eq(userId)]
userRatingById

Bokeh visualization

In [None]:
!pip install bokeh


In [None]:
dfGenres = pd.read_csv("https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/Geners.dat",sep="::",names=["GenreID","Genre"],engine='python')
dfGenres.head()

In [None]:
# index=False avoids writing the row index as an extra unnamed column
dfGenres.to_csv("Genres.csv", index=False)

In [None]:
import pandas as pd
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, Div
from bokeh.plotting import figure, curdoc
from bokeh.transform import factor_cmap
from bokeh.io import output_notebook, show
from bokeh.palettes import Spectral4

# read the master dataset and the genre lookup table written by earlier cells
df = pd.read_csv('Master.csv')
df1 = pd.read_csv('Genres.csv')

# create data sources for the plots
# Every column of a ColumnDataSource must have the same length, and glyphs look
# columns up by name. The movie source is therefore built from one row per
# movie, keeping the frame's own column names (MovieID, Title, Genres).
movies_unique = df.drop_duplicates('MovieID')[['MovieID', 'Title', 'Genres']]
movies_source = ColumnDataSource(movies_unique)
users_source = ColumnDataSource(data=dict(user_id=df['UserID'], gender=df['Gender'], age=df['Age'], occupation=df['Occupation']))
ratings_source = ColumnDataSource(data=dict(movie_id=df['MovieID'], rating=df['Rating']))

# create a scatter plot for movies by genre
genres = sorted(df['Genres'].unique())
# Spectral4 has only four colours; cycle it so every genre factor gets a colour
genre_palette = [Spectral4[i % len(Spectral4)] for i in range(len(genres))]
colors = factor_cmap('Genres', palette=genre_palette, factors=genres)
# y_range=genres makes the y axis categorical, which string-valued y data needs
movies_plot = figure(title='Movies by Genre', x_axis_label='MovieID', y_axis_label='Genres', y_range=genres, tools='pan,box_select')
movies_plot.scatter('MovieID', 'Genres', source=movies_source, color=colors)

# create a histogram for user ages
age_hist, age_edges = np.histogram(df['Age'], bins=20)
age_plot = figure(title='User Age Distribution', x_axis_label='Age', y_axis_label='Count', tools='pan,box_select')
age_plot.quad(top=age_hist, bottom=0, left=age_edges[:-1], right=age_edges[1:], line_color='white')

# create a bar chart for movie ratings
rating_counts = df['Rating'].value_counts().sort_index()
# the bars use string x values, so the figure needs a matching categorical x_range
rating_labels = rating_counts.index.astype(str).tolist()
rating_plot = figure(title='Movie Ratings Distribution', x_axis_label='Rating', y_axis_label='Count', x_range=rating_labels, tools='pan,box_select')
rating_plot.vbar(x=rating_labels, top=rating_counts.values, width=0.9)

# create a summary div to display some stats about the dataset
summary_div = Div(text=f"<h2>Movie Recommendation System</h2>"
                       f"<p><b>Number of movies:</b> {len(df['MovieID'].unique())}</p>"
                       f"<p><b>Number of users:</b> {len(df['UserID'].unique())}</p>"
                       f"<p><b>Number of ratings:</b> {len(df)}</p>"
                       f"<p><b>Average rating:</b> {df['Rating'].mean():.2f}</p>")

# display the plots and summary div in the notebook
output_notebook()
show(column(summary_div, row(movies_plot, age_plot, rating_plot)))


## Feature Engineering

### Find out all the unique genres 

In [None]:
#dfGenres = dfMaster[]
dfGenres = dfMaster['Genres'].str.split("|")

In [None]:
dfGenres

In [None]:

# Flatten the per-row genre lists into the set of distinct genre names
listGenres = {name for genre in dfGenres for name in genre}

In [None]:
# All Unique genres
listGenres

### Create a separate column for each genre category with a one-hot encoding ( 1 and 0) whether or not the movie belongs to that genre. 

In [None]:
ratingsOneHot = dfMaster['Genres'].str.get_dummies("|")

In [None]:
ratingsOneHot.head()

In [None]:
dfMaster = pd.concat([dfMaster,ratingsOneHot],axis=1)

In [None]:
dfMaster.head()

In [None]:
dfMaster.columns

In [None]:
# index=False avoids writing the row index as an extra unnamed column
dfMaster.to_csv("Final_Master.csv", index=False)

### Determine the features affecting the ratings of any particular movie.

In [None]:
# Split each title into its name and a four-digit year in parentheses.
# - The raw string avoids the invalid "\s" escape of a plain string literal.
# - The greedy first group binds the year group to the LAST "(dddd)" in the
#   title, so a parenthesised alternative title cannot be taken for the year.
dfMaster[["title","Year"]] = dfMaster.Title.str.extract(r"(.*)\s\((\d{4})\)", expand=True)

In [None]:
dfMaster = dfMaster.drop(columns=["title"])
dfMaster.head()

In [None]:
dfMaster.info()

In [None]:
dfMaster['Year'] = dfMaster.Year.astype(int)

In [None]:
dfMaster['Movie_Age'] = 2000 - dfMaster.Year
dfMaster.head()

In [None]:
dfMaster['Gender'] = dfMaster.Gender.str.replace('F','1')

In [None]:
dfMaster['Gender'] = dfMaster.Gender.str.replace('M','0')

In [None]:
dfMaster['Gender'] = dfMaster.Gender.astype(int)

In [None]:
dfMaster.head()

In [None]:
dfGenderAffecting = dfMaster.groupby('Gender').size().sort_values(ascending=False)[:25]
dfGenderAffecting

In [None]:
dfMaster.groupby(["Gender","Rating"]).size().unstack().plot(kind='bar',stacked=False,legend=True)
plt.show()

In [None]:
dfMaster.groupby(["Age","Rating"]).size().unstack().plot(kind='bar',stacked=False,legend=True)
plt.show()

In [None]:
dfMaster.groupby(["Occupation","Rating"]).size().unstack().plot(kind='bar',stacked=False,legend=True)
plt.show()

In [None]:
dfMaster.groupby(["Year","Rating"]).size().unstack().plot(kind='bar',stacked=False,legend=True)
plt.show()

In [None]:
dfMaster.groupby(["Movie_Age","Rating"]).size().unstack().plot(kind='bar',stacked=False,legend=True)
plt.show()

# Dimensionality reduction

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

In [None]:
dfMaster = dfMaster.rename(columns={"Children's": 'Children'})

In [None]:
# Define the features and labels
features = dfMaster[['MovieID', 'Age', 'Occupation']].values
labels = dfMaster['Rating'].values

## PCA

In [None]:
# Apply PCA for dimensionality reduction
# NOTE(review): `features` is built from three columns (MovieID, Age, Occupation)
# and n_components=3 keeps all of them, so this rotates the data without
# reducing it. The inputs are also unscaled at this point — confirm both are intended.
pca = PCA(n_components=3)
pca_features = pca.fit_transform(features)

In [None]:
# NOTE(review): despite the name, this slice is the last ROW of pca_features
# (a 1 x 3 array), not a label vector. It is not referenced by any other cell
# visible here — confirm whether it is still needed.
labels_pca = pca_features[-1 :]
labels_pca

In [None]:
pca_features

In [None]:
# Plot the PCA-reduced features
plt.scatter(pca_features[:, 0], pca_features[:, 1], c=labels, cmap='viridis')
plt.title('PCA Dimensionality Reduction')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

## SVD

In [None]:
# Apply SVD for dimensionality reduction
svd = TruncatedSVD(n_components=3)
svd_features = svd.fit_transform(features)

In [None]:
# Plot the SVD-reduced features
plt.scatter(svd_features[:, 0], svd_features[:, 1], c=labels, cmap='viridis')
plt.title('SVD Dimensionality Reduction')
plt.xlabel('SV1')
plt.ylabel('SV2')
plt.show()

# **Clustering**

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

# Load a random subset of the ratings dataset
ratings = pd.read_csv('https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
ratings = ratings.sample(n=100000, random_state=42)

# Load the movies dataset
movies = pd.read_csv('https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/movies.dat', sep='::', names=['movie_id', 'title', 'genres'], engine='python', encoding='ISO-8859-1')

# Merge the movies and ratings datasets
df = pd.merge(movies, ratings, on='movie_id')

# Convert the dataset into a pivot table with users as rows and movies as columns
pivot_table = pd.pivot_table(df, index='user_id', columns='title', values='rating', fill_value=0)

# Scale the pivot table
scaler = MinMaxScaler()
pivot_table_scaled = scaler.fit_transform(pivot_table)

# Compute the KNN graph (0/1 connectivity; used for the adjacency image below)
n_neighbors = 10
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
knn.fit(pivot_table_scaled)
knn_graph = knn.kneighbors_graph(pivot_table_scaled)

# Compute the DBSCAN clusters
# metric='precomputed' makes DBSCAN read the matrix entries as distances. The
# connectivity graph stores 1 for every neighbour, and 1 is larger than eps, so
# it cannot be used here. Pass the cosine-distance version of the same graph.
eps = 0.5
min_samples = 5
knn_distance_graph = knn.kneighbors_graph(pivot_table_scaled, mode='distance')
dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
dbscan.fit(knn_distance_graph)

# Compute the KMeans clusters
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(pivot_table_scaled)

# Plot the results
def _scatter_clusters(ax, points, cluster_labels, title, noise_label=None):
    """Scatter the first two columns of `points`, one colour per cluster label.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    points : 2-D array whose rows correspond to the entries of `cluster_labels`.
    cluster_labels : 1-D array of cluster ids.
    title : title set on `ax`.
    noise_label : optional label drawn in gray instead of a palette colour.
    """
    unique_labels = np.unique(cluster_labels)
    palette = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for label, color in zip(unique_labels, palette):
        if noise_label is not None and label == noise_label:
            color = 'gray'
        xy = points[cluster_labels == label]
        # color= (not c=): a single RGBA tuple passed as c= can be read as
        # per-point values when the cluster happens to contain four points
        ax.scatter(xy[:, 0], xy[:, 1], color=color, label=f'Cluster {label}')
    ax.set_title(title)
    ax.legend()


fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# Plot the KNN graph
axs[0].imshow(knn_graph.toarray())
axs[0].set_title(f'KNN Graph (k={n_neighbors})')

# Plot the DBSCAN clusters (label -1 is drawn in gray)
_scatter_clusters(
    axs[1], pivot_table_scaled, dbscan.labels_,
    f'DBSCAN Clustering (eps={eps}, min_samples={min_samples})',
    noise_label=-1,
)

# Plot the KMeans clusters
_scatter_clusters(
    axs[2], pivot_table_scaled, kmeans.labels_,
    f'KMeans Clustering (n_clusters={n_clusters})',
)

plt.show()


# Develop an appropriate model to predict the movie ratings

In [None]:
#First 500 extracted records
# Note: this takes the first 1000 rows; the name `first_500` is kept because
# the following cells refer to it
first_500 = dfMaster.iloc[:1000]

In [None]:
first_500

In [None]:
#Use the following features:movie id,age,occupation
features = first_500[['MovieID','Age','Occupation']].values

In [None]:
#Use rating as label
# Select the column as a 1-D array: estimators expect y of shape (n_samples,),
# and a (n_samples, 1) column vector triggers a DataConversionWarning on fit
labels = first_500['Rating'].values

In [None]:
#Create train and test data set
train, test, train_labels, test_labels = train_test_split(features,labels,test_size=0.33,random_state=42)

# Linear and SVM classifiers

In [None]:
# Stochastic Gradient Descent

# random_state fixes the shuffling so the cell gives the same result on re-run
sgd = SGDClassifier(random_state=42)
sgd.fit(train, train_labels)
Y_pred = sgd.predict(test)
# Accuracy on the held-out split; scoring the training data overstates performance
acc_sgd = round(sgd.score(test, test_labels) * 100, 2)
Y_pred

In [None]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(train, train_labels)
Y_pred = linear_svc.predict(test)
# Accuracy on the held-out split; scoring the training data overstates performance
acc_linear_svc = round(linear_svc.score(test, test_labels) * 100, 2)
Y_pred

In [None]:
# Support Vector Machines

svc = SVC()
svc.fit(train, train_labels)
Y_pred = svc.predict(test)
# Accuracy on the held-out split; scoring the training data overstates performance
acc_svc = round(svc.score(test, test_labels) * 100, 2)
Y_pred

In [None]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines','Linear SVC','Stochastic Gradient Descent'],
    'Score': [acc_svc,acc_linear_svc,acc_sgd]})
models.sort_values(by='Score', ascending=False)

# Classification

In [None]:

# K-Nearest Neighbours
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train, train_labels)
Y_pred = knn.predict(test)
# Accuracy on the held-out split; scoring the training data overstates performance
acc_knn = round(knn.score(test, test_labels) * 100, 2)
Y_pred

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Generate the confusion matrix
cm_knn = confusion_matrix(test_labels, Y_pred)



# Generate the classification report
report_knn = classification_report(test_labels, Y_pred)
print(report_knn)
cm_knn

In [None]:
# Decision Tree

# random_state makes tie-breaking between equally good splits repeatable
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(train, train_labels)
Y_pred = decision_tree.predict(test)
# Accuracy on the held-out split; an unpruned tree scores near-perfectly on
# the data it was fitted to, which says nothing about generalisation
acc_decision_tree = round(decision_tree.score(test, test_labels) * 100, 2)
Y_pred

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Generate the confusion matrix
cm_dt = confusion_matrix(test_labels, Y_pred)



# Generate the classification report
report_dt = classification_report(test_labels, Y_pred)
print(report_dt)
cm_dt

In [None]:
# Random Forest

# random_state makes the bootstrap samples and feature draws repeatable
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(train, train_labels)
Y_pred = random_forest.predict(test)
# Accuracy on the held-out split (the duplicate, discarded score() call on the
# training data has been removed)
acc_random_forest = round(random_forest.score(test, test_labels) * 100, 2)
Y_pred

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Generate the confusion matrix
cm_rf = confusion_matrix(test_labels, Y_pred)



# Generate the classification report
report_rf = classification_report(test_labels, Y_pred)
print(report_rf)
cm_rf

In [None]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(train, train_labels)
Y_pred = gaussian.predict(test)
# Accuracy on the held-out split; scoring the training data overstates performance
acc_gaussian = round(gaussian.score(test, test_labels) * 100, 2)
Y_pred

In [None]:
# Generate the confusion matrix
cm_nb = confusion_matrix(test_labels, Y_pred)



# Generate the classification report
report_nb = classification_report(test_labels, Y_pred)
print(report_nb)
cm_nb

In [None]:
models = pd.DataFrame({
    'Model': ['KNN','Random Forest','Decision Tree','Naive Bayes'],
    'Score': [acc_knn,acc_random_forest, acc_decision_tree,acc_gaussian]})
models.sort_values(by='Score', ascending=False)

# PCA Prediction

In [None]:

features = pca_features[:, :2]


In [None]:
features

In [None]:
#Use rating as label
# The third principal component is one of the model inputs, not the rating;
# using it as the target would let the model predict a value it is given.
# pca_features was computed from dfMaster's rows, so the two stay aligned.
labels = dfMaster['Rating'].values

In [None]:
labels

In [None]:
ilabels = np.round(labels).astype(int)
ilabels

In [None]:
ipca_features = np.round(features).astype(int)
ipca_features

In [None]:

# Print the shape of your input data and labels
print('Input data shape:', pca_features.shape)
print('Labels shape:', labels.shape)


In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Discretize the label data into 5 bins
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
labels_discrete = discretizer.fit_transform(labels.reshape(-1, 1)).flatten()


## Decision Tree

In [None]:
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and test sets
# NOTE(review): X is the full 3-column pca_features matrix. If labels_discrete
# was derived from one of those columns, the classifier is handed its own target.
X_train, X_test, y_train, y_test = train_test_split(pca_features, labels_discrete, test_size=0.33, random_state=42)

# Create a pipeline with data scaling, PCA, and decision tree classifier
# NOTE(review): the input is already PCA-transformed and PCA(n_components=3)
# keeps all three columns, so this step does not reduce dimensionality.
pca_dt = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('dt', DecisionTreeClassifier())
])

# Fit the pipeline on the training data
pca_dt.fit(X_train, y_train)

# Evaluate the model on the test data (mean accuracy on the held-out split)
pdt_accuracy = pca_dt.score(X_test, y_test)
print('Accuracy:', pdt_accuracy)


In [None]:
# Generate the predicted labels
y_pred = pca_dt.predict(X_test)

# Generate the confusion matrix
pdt = confusion_matrix(y_test, y_pred)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Generate the confusion matrix
pdt = confusion_matrix(y_test, y_pred)



# Generate the classification report
report_pcadt = classification_report(y_test, y_pred)
print(report_pcadt)
pdt

## Random Forest

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and test sets
# (same inputs, test_size and random_state as the decision-tree cell, so both
# models are evaluated on the same split)
X_train, X_test, y_train, y_test = train_test_split(pca_features, labels_discrete, test_size=0.33, random_state=42)

# Create a pipeline with data scaling, PCA, and random forest classifier
# NOTE(review): RandomForestClassifier() is unseeded, so the accuracy can vary
# slightly between runs.
pca_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('rf', RandomForestClassifier())
])

# Fit the pipeline on the training data
pca_rf.fit(X_train, y_train)

# Evaluate the model on the test data (mean accuracy on the held-out split)
prf_accuracy = pca_rf.score(X_test, y_test)
print('Accuracy:', prf_accuracy)

In [None]:
# Generate the predicted labels
y_pred = pca_rf.predict(X_test)

# Generate the confusion matrix
prf = confusion_matrix(y_test, y_pred)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Generate the confusion matrix
prf = confusion_matrix(y_test, y_pred)



# Generate the classification report
report_prf = classification_report(y_test, y_pred)
print(report_prf)
prf

# SVD Prediction

In [None]:
features = svd_features[:, :2]

In [None]:
features

In [None]:
# Use the rating as the label. The third SVD component is one of the model
# inputs, so using it as the target would let the model predict a value it is
# given. svd_features was computed from dfMaster's rows, so the two stay aligned.
labels = dfMaster['Rating'].values

In [None]:
labels

In [None]:
ilabels = np.round(labels).astype(int)
ilabels

In [None]:
ipca_features = np.round(features).astype(int)
ipca_features

In [None]:
# Print the shape of the SVD input data and labels
# (this section works with svd_features, not the PCA matrix)
print('Input data shape:', svd_features.shape)
print('Labels shape:', labels.shape)


In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Discretize the label data into 5 bins
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
labels_discrete = discretizer.fit_transform(labels.reshape(-1, 1)).flatten()

## DecisionTree

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(svd_features, labels_discrete, test_size=0.33, random_state=42)

# Create a pipeline with data scaling, TruncatedSVD, and decision tree classifier
svd_dt = Pipeline([
    ('scaler', StandardScaler()),
    ('svd', TruncatedSVD(n_components=1)),
    ('dt', DecisionTreeClassifier())
])

# Fit the pipeline on the training data
svd_dt.fit(X_train, y_train)

# Evaluate the model on the test data
sdt_accuracy = svd_dt.score(X_test, y_test)
print('Accuracy:', sdt_accuracy)


In [None]:
# Generate the predicted labels
y_pred = svd_dt.predict(X_test)

# Generate the confusion matrix
sdt = confusion_matrix(y_test, y_pred)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Generate the confusion matrix
sdt = confusion_matrix(y_test, y_pred)



# Generate the classification report
report_sdt = classification_report(y_test, y_pred)
print(report_sdt)
sdt

## Random Forest

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(svd_features, labels_discrete, test_size=0.33, random_state=42)

# Create a pipeline with data scaling, TruncatedSVD, and random forest classifier.
# The final step must be a RandomForestClassifier: with a DecisionTreeClassifier
# here, the "SVD Random Forest" row of the results table would just be a second
# decision tree.
svd_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('svd', TruncatedSVD(n_components=1)),
    ('rf', RandomForestClassifier())
])

# Fit the pipeline on the training data
svd_rf.fit(X_train, y_train)

# Evaluate the model on the test data
srf_accuracy = svd_rf.score(X_test, y_test)
print('Accuracy:', srf_accuracy)

In [None]:
# Generate the predicted labels
y_pred = svd_rf.predict(X_test)

# Generate the confusion matrix
srf = confusion_matrix(y_test, y_pred)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Generate the confusion matrix
srf = confusion_matrix(y_test, y_pred)



# Generate the classification report
report_srf = classification_report(y_test, y_pred)
print(report_srf)
srf

# Results

In [None]:
models = pd.DataFrame({
    'Model': ['PCA Random Forest','PCA Decision Tree','SVD Random Forest','SVD Decision Tree'],
    'Score': [prf_accuracy*100,pdt_accuracy*100, srf_accuracy*100,sdt_accuracy*100]})
models.sort_values(by='Score', ascending=False)

In [None]:
!pip install surprise

In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, accuracy
# Imported under an alias so it does not rebind the name `train_test_split`,
# which the earlier modelling cells use for scikit-learn's function. Without
# the alias, re-running those cells after this one would call Surprise's splitter.
from surprise.model_selection import train_test_split as surprise_train_test_split

# Load the dataset
df_ratings = pd.read_csv('https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
df_movies = pd.read_csv('https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/movies.dat', sep='::', names=['movie_id', 'title', 'genres'], engine='python', encoding='ISO-8859-1')
df_users = pd.read_csv('https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/users.dat', sep='::', names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')

# Convert the data into the format required by Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['user_id', 'movie_id', 'rating']], reader)

# Split the dataset into training and testing sets
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Use the SVD algorithm for collaborative filtering
algo = SVD()

# Train the algorithm on the training set
algo.fit(trainset)

# Get predictions for the test set
predictions = algo.test(testset)

# Print the RMSE score of the model
# verbose=False: accuracy.rmse prints the score itself by default, which
# together with the print below would show it twice
rmse = accuracy.rmse(predictions, verbose=False)
print('RMSE:', rmse)

# Generate recommendations for a specific user
user_id = 1
user_movies = df_ratings.loc[df_ratings['user_id'] == user_id, 'movie_id']
unseen_mask = ~df_movies['movie_id'].isin(user_movies)
user_unseen_movies = list(df_movies.loc[unseen_mask, 'movie_id'].values)

# algo.test takes (user, item, rating) triples; the rating of 4 is only a
# placeholder, since the ranking below uses the estimate (.est) alone
user_ratings = [(user_id, movie_id, 4) for movie_id in user_unseen_movies]
user_predictions = sorted(algo.test(user_ratings), key=lambda x: x.est, reverse=True)

# Print the top 10 recommended movies for the user
print(f"Top 10 recommended movies for user {user_id}:")
for rank, prediction in enumerate(user_predictions[:10], start=1):
    movie_id = prediction.iid
    movie_title = df_movies.loc[df_movies['movie_id'] == movie_id, 'title'].values[0]
    print(f"{rank}. {movie_title} ({prediction.est:.2f})")


In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load a random subset of the ratings dataset
ratings = pd.read_csv('https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
ratings = ratings.sample(n=100000, random_state=42)

# Load the movies dataset
movies = pd.read_csv('https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/movies.dat', sep='::', names=['movie_id', 'title', 'genres'], engine='python', encoding='ISO-8859-1')
# Merge the movies and ratings datasets
df = pd.merge(movies, ratings, on='movie_id')

# Convert the dataset into a transaction format:
# one row per user, one column per title, 0 where the user did not rate it
transactions = df.groupby(['user_id', 'title'])['rating'].max().unstack().fillna(0)

# Encode the transaction data into a binary matrix (True = user rated the title).
# TransactionEncoder expects lists of item names. Given the numeric rating
# matrix it would treat the rating VALUES as the items, so the rules would
# relate ratings to each other rather than movies. apriori accepts a boolean
# DataFrame directly, so the flags are derived from the rating table.
basket_sets = transactions > 0

# Perform association mining using the Apriori algorithm
frequent_itemsets = apriori(basket_sets, min_support=0.1, use_colnames=True)

# Generate and display the association rules
if frequent_itemsets.empty:
    # association_rules raises on an empty itemset table; report it instead.
    # NOTE(review): only a sample of the ratings is used, so min_support=0.1
    # may be too strict for any title to qualify — confirm the threshold.
    rules = pd.DataFrame()
    print("No itemsets reach min_support; lower the threshold to obtain rules.")
else:
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    print(rules)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
df_movies = pd.read_csv('https://raw.githubusercontent.com/droideronline/Business-Analytics-Movielens-Analysis/main/movies.dat', sep='::', names=['movie_id', 'title', 'genres'], engine='python', encoding='ISO-8859-1')

# Create a genre matrix
tfidf = TfidfVectorizer()
genre_matrix = tfidf.fit_transform(df_movies['genres'])

# Compute cosine similarity between all movie pairs
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

# Define function to get top n similar movies
def get_similar_movies(movie_id, n=10):
    """Return the titles of the ``n`` movies most similar to ``movie_id``.

    Similarity scores are read from the module-level ``cosine_sim`` matrix.
    The movie's row label in ``df_movies`` is used as its row number in that
    matrix, which assumes ``df_movies`` has its default 0..N-1 index.

    Parameters
    ----------
    movie_id
        Value looked up in ``df_movies['movie_id']``.
    n : int, default 10
        Number of titles to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar. The query movie itself is
        never part of the result.

    Raises
    ------
    IndexError
        If ``movie_id`` does not occur in ``df_movies``.
    """
    # Get the row index of the given movie
    matches = df_movies.index[df_movies['movie_id'] == movie_id]
    if len(matches) == 0:
        raise IndexError(f"movie_id {movie_id!r} not found in df_movies")
    idx = matches[0]
    # Pair every movie's row number with its similarity to the query, best first
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)
    # Exclude the query movie by index, not by position: movies with an
    # identical genre vector tie with it, so it is not guaranteed to sort first
    movie_indices = [i for i, _ in sim_scores if i != idx][:n]
    # Return the titles of the top n similar movies
    return df_movies['title'].iloc[movie_indices]

# Example usage: get top 10 movies similar to Toy Story (1995)
get_similar_movies(1, 10)
