In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


In [None]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='imdb_top_1000.csv')


In [None]:
# Columns whose values are merged into one text field per row.
features_to_use = ['Runtime', 'IMDB_Rating', 'Meta_score']

# Render each selected column as text, then join every row's values with a
# single space so the row can be handled as one short "document".
# Missing values are stringified by astype(str) rather than dropped.
df['Combined_Features'] = (
    df[features_to_use]
    .astype(str)
    .apply(lambda row: ' '.join(row), axis=1)
)

In [None]:
def calculate_cosine_similarity(data):
    """Vectorise text documents with TF-IDF.

    Despite its name, this function does not compute pairwise cosine
    similarities: it returns the fitted TF-IDF document-term matrix itself.

    NOTE(review): the vectorizer is built with its default tokenizer settings,
    which (per the scikit-learn docs) keep only tokens of two or more
    alphanumeric characters — check that short numeric values such as "9.3"
    survive tokenization for the inputs used here.

    Args:
        data: Iterable of raw text documents, one entry per row.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform`` (one row per
        document), with English stop words removed.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    return vectorizer.fit_transform(data)

def find_optimal_k(data, max_k=10, random_state=42, n_init=10):
    """Compute the elbow-method curve: KMeans inertia for k = 1 .. max_k.

    Args:
        data: Feature matrix accepted by ``KMeans.fit`` (one row per sample).
        max_k: Largest number of clusters to try. It is capped at the number
            of rows in ``data`` because KMeans cannot fit more clusters than
            there are samples.
        random_state: Seed forwarded to every KMeans fit.
        n_init: Number of centroid initialisations per fit. It is passed
            explicitly because the scikit-learn default changed between
            releases; pinning it keeps the curve comparable across versions.

    Returns:
        list: ``inertia_`` (within-cluster sum of squares) of each fitted
        model, ordered by k starting at 1. Empty when ``max_k`` < 1 or
        ``data`` has no rows.
    """
    # Sparse matrices do not support len(), so prefer .shape when available.
    n_samples = data.shape[0] if hasattr(data, 'shape') else len(data)
    largest_k = min(max_k, n_samples)

    wcss = []
    for k in range(1, largest_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=random_state, n_init=n_init)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    return wcss

In [None]:
# Vectorise the combined feature strings, then collect the KMeans inertia for
# each candidate number of clusters.
cosine_similarities = calculate_cosine_similarity(df['Combined_Features'])
wcss_values = find_optimal_k(cosine_similarities)

# Elbow plot: inertia against k, with k starting at 1.
candidate_ks = range(1, len(wcss_values) + 1)
fig, ax = plt.subplots()
ax.plot(candidate_ks, wcss_values, marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.show()

In [None]:
# Number of clusters to keep.
# NOTE(review): presumably read off the elbow plot — confirm against the figure.
optimal_k = 5

# n_init is passed explicitly because scikit-learn's default changed between
# releases; pinning it keeps cluster assignments comparable across versions.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(cosine_similarities)


def _as_numeric(column):
    """Return ``column`` as numbers; text values yield their first number, else NaN."""
    if pd.api.types.is_numeric_dtype(column):
        return column  # already numeric: leave untouched
    first_number = column.astype(str).str.extract(r'(-?\d+(?:\.\d+)?)', expand=False)
    return pd.to_numeric(first_number, errors='coerce')


def _most_common(values):
    """Return the most frequent value, or None when there is no non-null value."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# Averaging fails on text columns.
# NOTE(review): the CSV may store Runtime as text such as "142 min" — confirm
# against the file. The coercion happens on a temporary copy, so the original
# columns of df are not modified.
numeric_columns = ['Runtime', 'IMDB_Rating', 'Meta_score']

cluster_characteristics = (
    df.assign(**{name: _as_numeric(df[name]) for name in numeric_columns})
    .groupby('Cluster')
    .agg({
        'Runtime': 'mean',
        'IMDB_Rating': 'mean',
        'Meta_score': 'mean',
        'Genre': _most_common,
    })
    .reset_index()
)

print(cluster_characteristics)
