In [None]:
!pip install numpy pandas scikit-learn matplotlib seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import warnings
# Silence only deprecation-style noise. A blanket "ignore" would also hide
# actionable warnings (e.g. scikit-learn ConvergenceWarning from GaussianMixture).
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
# set_theme is the preferred seaborn entry point; sns.set is only a legacy alias.
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.0)


In [None]:
from google.colab import files
import pandas as pd

# Ask the user for the ratings file and load whatever file was actually
# uploaded. Colab renames re-uploads (e.g. "ratings (1).csv"), so reading a
# hard-coded "ratings.csv" could silently load a stale copy or fail.
print("Upload ratings.csv")
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; expected ratings.csv")
ratings_filename = next(iter(uploaded))
ratings = pd.read_csv(ratings_filename)

print("Ratings shape:", ratings.shape)
ratings.head()

In [None]:
from google.colab import files

# Ask the user for the movies file and load whatever file was actually
# uploaded, instead of assuming it landed on disk as exactly "movies.csv".
print("Upload movies.csv")
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; expected movies.csv")
movies_filename = next(iter(uploaded))
movies = pd.read_csv(movies_filename)

print(" Movies shape:", movies.shape)
movies.head()


In [None]:
from google.colab import files
import pandas as pd

# Ask the user for the posters file and read the first uploaded file.
print("Upload Posters.csv")
uploaded = files.upload()

# A cancelled upload returns an empty mapping; fail with a clear message
# rather than an IndexError on the key lookup.
if not uploaded:
    raise ValueError("No file was uploaded; expected Posters.csv")

filename = list(uploaded.keys())[0]
posters = pd.read_csv(filename)

print("Posters shape:", posters.shape)
posters.head()

In [None]:
# Count missing cells per column of the ratings table.
ratings_missing = ratings.isna().sum()
print("Missing values in ratings:\n", ratings_missing)

In [None]:
# Count missing cells per column of the movies table.
movies_missing = movies.isna().sum()
print("Missing values in movies:\n", movies_missing)

In [None]:
# Count missing cells per column of the posters table.
posters_missing = posters.isna().sum()
print("Missing Values in Posters.csv:\n", posters_missing, "\n")

In [None]:
# Remove fully identical rating rows.
ratings = ratings.drop_duplicates()

In [None]:
# Remove fully identical movie rows.
movies = movies.drop_duplicates()

In [None]:
# Keep only ratings inside the closed interval [0, 5].
rating_in_range = ratings['rating'].between(0, 5)
ratings = ratings[rating_in_range]


In [None]:
# Remove fully identical poster rows.
posters = posters.drop_duplicates()

In [None]:
# Keep only the join key and the poster link. Every later cell reads the
# column as 'poster_url', so a 'posters_url' spelling (if that is what the
# CSV carries) is normalised first; the rename is a no-op otherwise.
posters = posters.rename(columns={'posters_url': 'poster_url'})[['title', 'poster_url']]

In [None]:
# Show the column labels available on the movies table.
movie_columns = movies.columns
print(movie_columns)

In [None]:
# Show the column labels available on the posters table.
poster_columns = posters.columns
print(poster_columns)

In [None]:
# Attach poster columns to movies by exact title.
# - One poster row per title: duplicate titles in posters would otherwise
#   multiply movie rows in the left join.
# - Poster columns already on `movies` are dropped first, so re-running the
#   cell does not create *_x / *_y suffixed duplicates.
poster_lookup = posters.drop_duplicates(subset="title")
poster_value_cols = [col for col in poster_lookup.columns if col != "title"]
movies = (
    movies.drop(columns=poster_value_cols, errors="ignore")
    .merge(poster_lookup, on="title", how="left")
)

In [None]:
# Mean rating per movie as a two-column frame: movieId, avg_rating.
avg_ratings = (
    ratings.groupby("movieId", as_index=False)["rating"]
    .mean()
    .rename(columns={"rating": "avg_rating"})
)


In [None]:
# Join the per-movie mean rating onto movies, then discard movies that
# received no rating at all (avg_rating is NaN after the left join).
movie_data = pd.merge(movies, avg_ratings, on="movieId", how="left")
movie_data = movie_data.dropna(subset=["avg_rating"])


In [None]:
def _normalise_titles(titles):
    """Lower-case titles, remove any "(YYYY)" year tag and trim whitespace."""
    lowered = titles.str.lower()
    without_year = lowered.str.replace(r"\s*\(\d{4}\)", "", regex=True)
    return without_year.str.strip()


# Add a normalised title column to both tables.
movies["title_clean"] = _normalise_titles(movies["title"])
posters["title_clean"] = _normalise_titles(posters["title"])

In [None]:
# Attach poster_url to movie_data by exact title.
# movie_data may already carry poster_url (inherited from `movies`); merging
# again would produce poster_url_x / poster_url_y and break later lookups of
# 'poster_url', so any existing copy is dropped first. Poster titles are
# de-duplicated so the left join cannot multiply movie rows.
poster_lookup = posters[['title', 'poster_url']].drop_duplicates(subset='title')
movie_data = (
    movie_data.drop(columns=['poster_url'], errors='ignore')
    .merge(poster_lookup, on="title", how="left")
)

In [None]:
# Report the dtype of every ratings column.
ratings_dtypes = ratings.dtypes
print("\nData types:\n", ratings_dtypes)

In [None]:
# Report the dtype of every movies column.
movies_dtypes = movies.dtypes
print("\nData types:\n", movies_dtypes)

In [None]:
# Report the dtype of every posters column.
posters_dtypes = posters.dtypes
print("\nData types (posters):\n", posters_dtypes)

In [None]:
# Preview the first five movies rows (rich display as the cell's last expression).
movies.head(n=5)

In [None]:
# Preview the first five ratings rows (rich display as the cell's last expression).
ratings.head(n=5)

In [None]:
# Preview the first five posters rows (rich display as the cell's last expression).
posters.head(n=5)

In [None]:
# Histogram (with KDE overlay) of all individual rating values.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(ratings['rating'], bins=10, kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

# Headline cardinalities of the ratings table.
n_users = ratings['userId'].nunique()
n_rated_movies = ratings['movieId'].nunique()
print("Unique Users:", n_users)
print("Unique Movies:", n_rated_movies)

In [None]:
# How many ratings each user submitted, and the distribution of that count.
user_activity = ratings.groupby('userId')['rating'].count()
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(user_activity, bins=50, color='teal', ax=ax)
ax.set_title('Ratings Per User')
ax.set_xlabel('Number of Ratings per User')
ax.set_ylabel('User Count')
plt.show()



In [None]:
# How many ratings each movie received, and the distribution of that count.
movie_activity = ratings.groupby('movieId')['rating'].count()
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(movie_activity, bins=50, color='coral', ax=ax)
ax.set_title('Ratings Per Movie')
ax.set_xlabel('Number of Ratings per Movie')
ax.set_ylabel('Movie Count')
plt.show()



In [None]:
# movie_data only carries avg_rating so far; the vote count needed below was
# never computed. Derive it from the ratings table and align it by movieId.
rating_counts = ratings.groupby('movieId')['rating'].count()
movie_data['num_ratings'] = movie_data['movieId'].map(rating_counts).fillna(0).astype(int)

# Bayesian-style weighted rating: shrink each movie's mean towards the global
# mean C, with prior weight m equal to the 75th percentile of vote counts.
C = movie_data['avg_rating'].mean()
m = movie_data['num_ratings'].quantile(0.75)
movie_data['weighted_rating'] = (
    movie_data['num_ratings'] * movie_data['avg_rating'] + m * C
) / (movie_data['num_ratings'] + m)

# log(1 + n) compresses the heavy right tail of the vote counts.
movie_data['popularity'] = np.log1p(movie_data['num_ratings'])

In [None]:
# Assemble the clustering features, replace NaN with 0 and standardise them.
features_adv = ['avg_rating', 'weighted_rating', 'popularity']
X_adv = movie_data.loc[:, features_adv].fillna(0).to_numpy()
scaler_adv = StandardScaler()
X_adv_scaled = scaler_adv.fit(X_adv).transform(X_adv)

print("Feature matrix shape:", X_adv_scaled.shape)


In [None]:
# Project the scaled features onto their first two principal components and
# store each component as its own movie_data column.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_adv_scaled)
for pca_axis, pca_column in enumerate(('pca1', 'pca2')):
    movie_data[pca_column] = X_pca[:, pca_axis]

In [None]:
# Optional t-SNE embedding; disabled by default, flip the flag to compute it.
run_tsne = False
if run_tsne:
    tsne = TSNE(n_components=2, random_state=42, init='pca', learning_rate='auto')
    X_tsne = tsne.fit_transform(X_adv_scaled)
    for tsne_axis, tsne_column in enumerate(('tsne1', 'tsne2')):
        movie_data[tsne_column] = X_tsne[:, tsne_axis]

In [None]:
# Scatter of the PCA projection coloured by average rating.
# DataFrame.sample raises if asked for more rows than exist, so the sample
# size is capped at the table length (at most 2000 points are drawn).
n_plot = min(2000, len(movie_data))
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=movie_data.sample(n_plot, random_state=1),
    x='pca1', y='pca2', hue='avg_rating', palette='viridis', legend=False, ax=ax,
)
ax.set_title(f"PCA of advanced features (sampled {n_plot:,} movies)")
plt.show()


In [None]:
from tqdm.notebook import trange


def _silhouette_or_nan(features, labels):
    """Silhouette score of a labelling, or NaN when only one label is present."""
    if len(np.unique(labels)) > 1:
        return silhouette_score(features, labels)
    return np.nan


# Candidate cluster counts and the per-k diagnostics collected for each model.
k_range = list(range(2,9))
kmeans_sils, gmm_bics, gmm_aics, gmm_sils = [], [], [], []

for k in k_range:
    # KMeans: hard assignment, scored by silhouette only.
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_k = km.fit_predict(X_adv_scaled)
    kmeans_sils.append(_silhouette_or_nan(X_adv_scaled, labels_k))

    # Gaussian mixture: scored by BIC, AIC and silhouette of its hard labels.
    gmm = GaussianMixture(n_components=k, covariance_type='full', random_state=42, n_init=3)
    labels_g = gmm.fit(X_adv_scaled).predict(X_adv_scaled)
    gmm_bics.append(gmm.bic(X_adv_scaled))
    gmm_aics.append(gmm.aic(X_adv_scaled))
    gmm_sils.append(_silhouette_or_nan(X_adv_scaled, labels_g))

In [None]:
# Silhouette score per k for both models.
# Drawn as a self-contained figure: the inline backend renders and closes a
# figure at the end of each cell, so a 1x2 grid whose second panel is filled
# in a later cell would be shown half empty here.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_range, kmeans_sils, marker='o', label='KMeans silhouette')
ax.plot(k_range, gmm_sils, marker='o', label='GMM silhouette')
ax.set_xlabel('k')
ax.set_ylabel('silhouette score')
ax.set_title('Silhouette scores')
ax.legend()
plt.show()

In [None]:
# GMM information criteria per k (lower is better).
# Drawn as a self-contained figure: a figure opened in an earlier cell has
# already been rendered and closed, so plt.subplot(1,2,2) here would start a
# new figure with an empty left half.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_range, gmm_aics, marker='o', label='AIC')
ax.plot(k_range, gmm_bics, marker='o', label='BIC')
ax.set_xlabel('k')
ax.set_title('GMM AIC/BIC')
ax.legend()
plt.show()


In [None]:
# One row per candidate k with every collected diagnostic side by side.
model_selection_columns = {
    'k': k_range,
    'kmeans_sil': kmeans_sils,
    'gmm_sil': gmm_sils,
    'gmm_aic': gmm_aics,
    'gmm_bic': gmm_bics,
}
pd.DataFrame(model_selection_columns)

In [None]:
# Mean rating per movie (kept for reference alongside movie_data).
movie_avg = (
    ratings.groupby('movieId', as_index=False)['rating']
    .mean()
    .rename(columns={'rating': 'avg_rating'})
)

# movie_data is deliberately NOT rebuilt here. Recreating it from
# movie_avg + movies[['movieId','title']] would discard poster_url and the
# engineered columns, and could change the row set/order so that the rows no
# longer line up with X_adv_scaled / X_pca, which the clustering cells assign
# back into movie_data positionally.
if len(movie_data) != X_adv_scaled.shape[0]:
    raise ValueError(
        "movie_data and X_adv_scaled are out of sync; re-run the feature cells "
        "before clustering"
    )
print("Movies after merge:", movie_data.shape)
movie_data.head()

In [None]:
# Standardise the average rating on its own (one-column feature matrix) and
# report the resulting shape, mean and standard deviation.
X = movie_data[['avg_rating']].to_numpy()

scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

print("Feature shape:", X_scaled.shape)
scaled_mean, scaled_std = X_scaled.mean(), X_scaled.std()
print("Scaled mean ≈ ", scaled_mean, " | std ≈ ", scaled_std)


In [None]:
# Pick the k with the highest KMeans silhouette (NaN entries are skipped);
# fall back to 5 when no scores were collected.
if len(kmeans_sils) > 0:
    best_k_by_sil = k_range[int(np.nanargmax(kmeans_sils))]
else:
    best_k_by_sil = 5
k = best_k_by_sil
print("Choosing k =", k, " (best by KMeans silhouette)")

# Fit the final KMeans model and store each movie's hard cluster label.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(X_adv_scaled)
movie_data['kmeans_cluster'] = kmeans.labels_

# Undo the standardisation so the centroids read in original feature units.
centers = scaler_adv.inverse_transform(kmeans.cluster_centers_)
print("KMeans centers (in original feature scale):\n", centers)

In [None]:
# Fit the final Gaussian mixture with the chosen k, then store both the hard
# label and the per-component membership probability of every movie.
gmm = GaussianMixture(n_components=k, covariance_type='full', random_state=42, n_init=5)
gmm.fit(X_adv_scaled)
movie_data['gmm_cluster'] = gmm.predict(X_adv_scaled)
probs = gmm.predict_proba(X_adv_scaled)
for i, component_probs in enumerate(probs.T[:k]):
    movie_data[f'prob_cluster_{i}'] = component_probs

In [None]:
# (Re)attach the PCA coordinates so this cell works on its own.
movie_data['pca1'] = X_pca[:,0]
movie_data['pca2'] = X_pca[:,1]

# DataFrame.sample raises if asked for more rows than exist, so the sample
# size is capped at the table length (at most 2000 points are drawn).
n_plot = min(2000, len(movie_data))
fig, ax = plt.subplots(figsize=(10, 3))
sns.scatterplot(
    data=movie_data.sample(n_plot, random_state=1),
    x='pca1', y='pca2', hue='kmeans_cluster', palette='tab10', legend='brief', s=25, ax=ax,
)
ax.set_title("KMeans clusters (visualized on PCA)")
plt.show()

In [None]:
# One-dimensional view: every movie placed at its average rating, coloured by
# its KMeans label. The label column is 'kmeans_cluster'; no column named
# 'cluster' is ever created, so that lookup raised KeyError.
fig, ax = plt.subplots(figsize=(10, 3))
ax.scatter(movie_data['avg_rating'], np.zeros(len(movie_data)),
           c=movie_data['kmeans_cluster'], cmap='tab10', s=60, alpha=0.7)
# centers[:, 0] is the avg_rating coordinate (first entry of features_adv).
ax.scatter(centers[:, 0], np.zeros(len(centers)),
           c='red', marker='X', s=200, label='Centroids')

ax.set_title('Movie Clusters Based on Average Ratings')
ax.set_xlabel('Average Rating')
ax.set_yticks([])
ax.legend()
ax.grid(axis='x', linestyle='--', alpha=0.6)
plt.show()

In [None]:
# For each KMeans cluster (ascending label), list its ten best-rated movies.
for c, members in movie_data.groupby('kmeans_cluster'):
    print(f"\n Cluster {c} Movies:")
    best_rated = members.sort_values('avg_rating', ascending=False).head(10)
    print(best_rated[['title','avg_rating']].to_string(index=False))

In [None]:
# Persist the clustered table (without the index) and preview its first rows.
clusters_csv_path = "movie_clusters_kmeans_cleaned.csv"
movie_data.to_csv(clusters_csv_path, index=False)

for status_line in (
    " Cluster file created successfully!\n",
    "Here are the first 10 rows of the clustered movie data:\n",
):
    print(status_line)
display(movie_data.head(10))


In [None]:
print(" Movies with SIMILAR Ratings — Grouped by Cluster\n")

# The KMeans labels live in 'kmeans_cluster'; no column named 'cluster' is
# ever created, so that lookup raised KeyError.
for c in sorted(movie_data['kmeans_cluster'].unique()):
    print(f"\n Cluster {c} — Movies with Similar Ratings:")
    cluster_movies = movie_data.loc[movie_data['kmeans_cluster'] == c, ['title', 'avg_rating']]

    # Ascending sort: the ten lowest-rated members of the cluster are shown.
    cluster_movies = cluster_movies.sort_values('avg_rating')

    print(cluster_movies.head(10).to_string(index=False))


In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Draw one reproducible random movie from every KMeans cluster.
# - The label column is 'kmeans_cluster' ('cluster' does not exist).
# - GroupBy.sample replaces groupby(...).apply(lambda x: x.sample(...)), which
#   is deprecated for frames that include the grouping column. The specific
#   rows drawn for a given seed may differ from the apply-based version.
samples = (
    movie_data.groupby('kmeans_cluster')
    .sample(n=1, random_state=22)
    .reset_index(drop=True)
)
samples = samples[['title', 'avg_rating', 'kmeans_cluster']].sort_values('avg_rating')
print(samples.to_string(index=False))


In [None]:
# Interactive lookup: find a movie by (case-insensitive) substring and list
# other movies from the same KMeans cluster.
movie_name = input(" Enter a movie name: ").strip()

if movie_name:
    # regex=False: the user's text is matched literally, so titles containing
    # characters such as "(" or "*" cannot raise a regex error.
    matched = movie_data[
        movie_data['title'].str.contains(movie_name, case=False, na=False, regex=False)
    ]
else:
    # A blank query would match every title; treat it as "not found" instead.
    matched = movie_data.iloc[0:0]

if not matched.empty:
    movie_row = matched.iloc[0]
    # The KMeans labels live in 'kmeans_cluster'; 'cluster' does not exist.
    cluster_label = movie_row['kmeans_cluster']
    movie_title = movie_row['title']
    movie_rating = movie_row['avg_rating']
    # .get tolerates a movie_data frame that carries no poster_url column.
    poster = movie_row.get('poster_url')

    print(f"\n Movie Found: {movie_title}")
    print(f" Average Rating: {movie_rating:.2f}")
    print(f" Cluster: {cluster_label}")

    if pd.notna(poster):
        print(f" Poster URL: {poster}")
    else:
        print(" Poster URL: Not available")

    print("\n Movies with Similar Ratings:\n")

    # Only show the columns that are actually present on movie_data.
    shown_columns = [
        col for col in ('title', 'avg_rating', 'poster_url') if col in movie_data.columns
    ]
    same_cluster = movie_data['kmeans_cluster'] == cluster_label
    other_title = movie_data['title'] != movie_title
    similar_movies = movie_data.loc[same_cluster & other_title, shown_columns].sort_values('avg_rating')

    print(similar_movies.head(10).to_string(index=False))

else:
    print(" No movie found with that name. Please try again.")
