In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import networkx as nx
import math as math
import time
import re

from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [0]:
def create_binary_columns(df, column_name, prefix):
    """Expand a comma-separated text column into one 0/1 column per distinct value.

    Each cell of ``df[column_name]`` is lower-cased, stripped of spaces and
    split on commas. For every distinct non-empty token a new integer column
    named ``f"{prefix}_{token}"`` is added, holding 1 where the row lists that
    exact token and 0 otherwise (missing cells are all 0). The source column is
    then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    column_name : str
        Name of the comma-separated text column to expand.
    prefix : str
        Prefix for the generated indicator column names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # Normalise once so "TV Dramas" and "tv dramas" map to the same token.
    normalized = df[column_name].str.lower().str.replace(' ', '', regex=False)

    # Tokenise every cell a single time. Missing cells yield an empty list and
    # empty tokens (from leading/trailing/double commas) are discarded, because
    # an empty token would otherwise produce a meaningless `prefix_` column.
    token_lists = [
        [tok for tok in (part.strip() for part in cell.split(',')) if tok]
        if isinstance(cell, str) else []
        for cell in normalized
    ]

    # dict.fromkeys keeps first-appearance order while removing duplicates.
    unique_values = list(dict.fromkeys(tok for toks in token_lists for tok in toks))
    token_sets = [set(toks) for toks in token_lists]

    for val in unique_values:
        # Exact token membership: a substring test on the raw cell would flag
        # "dramas" for a row that only lists "tvdramas".
        df[f'{prefix}_{val}'] = [int(val in toks) for toks in token_sets]

    df.drop(columns=[column_name], inplace=True)

    return df


In [0]:
# Load the raw dataset into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv("Data/Netflix_movies_and_tv_shows_clustering.csv")

In [0]:
# Show the first rows as a quick sanity check of the loaded frame.
df.head()

In [0]:
# Print the distinct values of a few raw columns before any transformation.
for heading, column in (
    ("\nType:\n ", 'type'),
    ("\nTitle:\n ", 'title'),
    ("\ndirector:\n ", 'director'),
    ("\nduration:\n ", 'duration'),
):
    print(heading, df[column].unique())



In [0]:
# Number of distinct raw cast strings (whole cells, not individual actors).
display(df['cast'].nunique())

# Normalise once: lower-case and remove spaces so name variants collapse.
cast_normalized = df['cast'].str.lower().str.replace(' ', '', regex=False)

# One entry per (title, actor) pair; empty tokens from stray commas are dropped
# because an empty string is not an actor.
all_actors = cast_normalized.dropna().str.split(',').explode().str.strip()
all_actors = all_actors[all_actors != '']
actor_counts = Counter(all_actors)
display(len(actor_counts))

# Threshold 1 keeps every actor; displayed only as a baseline for comparison.
min_freq = 1
common_actors = {actor for actor, count in actor_counts.items() if count >= min_freq}
display(len(common_actors))

# Only actors with at least `min_freq` appearances become features.
min_freq = 10
common_actors = {actor for actor, count in actor_counts.items() if count >= min_freq}
display(len(common_actors))

df['cast'] = cast_normalized

# Per-row set of actor tokens; rows without a cast get an empty set.
cast_sets = [
    {name.strip() for name in cell.split(',')} if isinstance(cell, str) else set()
    for cell in cast_normalized
]

# Exact token membership instead of a substring test on the raw cell, which
# would flag a short name whenever it occurs inside a longer one.
# sorted() makes the column order independent of set/hash ordering.
actor_flags = pd.DataFrame(
    {
        f'actor_{actor}': [int(actor in names) for names in cast_sets]
        for actor in sorted(common_actors)
    },
    index=df.index,
)

# Attach all indicator columns in a single concat (avoids fragmenting the frame
# with hundreds of single-column inserts). Dropping first keeps a re-run of
# this cell from creating duplicate columns.
df = pd.concat(
    [df.drop(columns=actor_flags.columns, errors='ignore'), actor_flags],
    axis=1,
)

In [0]:

# Normalise once: lower-case and remove spaces so country spellings collapse.
df['country'] = df['country'].str.lower().str.replace(' ', '', regex=False)

# Distinct country tokens in first-appearance order. Empty tokens (produced by
# leading/trailing commas) are dropped: an empty token would otherwise create a
# `listed_` column that is 1 for every non-null row.
country_values = df['country'].dropna().str.split(',').explode().str.strip()
df_listed_exploded = country_values[country_values != ''].unique()

print(f"Número de categorias únicas em country: {len(df_listed_exploded)}")
print(df_listed_exploded)

# Per-row set of country tokens; rows without a country get an empty set.
country_sets = [
    {part.strip() for part in cell.split(',')} if isinstance(cell, str) else set()
    for cell in df['country']
]

# Exact token membership: a substring test on the raw cell would flag
# "germany" for a row that only lists "westgermany".
# NOTE(review): the `listed_` prefix is also used for the genre columns built
# from `listed_in`; it is kept here so existing column names do not change.
country_flags = pd.DataFrame(
    {
        f'listed_{category}': [int(category in tokens) for tokens in country_sets]
        for category in df_listed_exploded
    },
    index=df.index,
)

# Single concat instead of one insert per column; dropping first keeps a re-run
# of this cell from creating duplicate columns.
df = pd.concat(
    [df.drop(columns=country_flags.columns, errors='ignore'), country_flags],
    axis=1,
)

In [0]:

# Normalise once: lower-case and remove spaces so genre spellings collapse.
df['listed_in'] = df['listed_in'].str.lower().str.replace(' ', '', regex=False)

# Distinct genre tokens in first-appearance order; empty tokens from stray
# commas are dropped so no always-1 `listed_` column is produced.
genre_values = df['listed_in'].dropna().str.split(',').explode().str.strip()
df_listed_exploded = genre_values[genre_values != ''].unique()

print(f"Número de categorias únicas em listed_in: {len(df_listed_exploded)}")
print(df_listed_exploded)

# Per-row set of genre tokens; rows without a genre list get an empty set.
genre_sets = [
    {part.strip() for part in cell.split(',')} if isinstance(cell, str) else set()
    for cell in df['listed_in']
]

# Exact token membership: a substring test on the raw cell would flag
# "dramas" for a row that only lists "tvdramas".
genre_flags = pd.DataFrame(
    {
        f'listed_{category}': [int(category in tokens) for tokens in genre_sets]
        for category in df_listed_exploded
    },
    index=df.index,
)

# Single concat instead of one insert per column; dropping first keeps a re-run
# of this cell from creating duplicate columns.
df = pd.concat(
    [df.drop(columns=genre_flags.columns, errors='ignore'), genre_flags],
    axis=1,
)

In [0]:
# Audience bucket -> the raw certification labels assigned to it.
ratings_by_audience = {
    'Adults': ['TV-MA', 'R', 'NR', 'NC-17', 'UR'],
    'Teens': ['PG-13'],
    'Young Adults': ['TV-14'],
    'Older Kids': ['TV-PG', 'TV-Y7', 'PG', 'TV-Y7-FV'],
    'Kids': ['TV-G', 'TV-Y', 'G'],
}

# Flatten the table into the label -> bucket lookup used by Series.map.
rating_map = {
    label: audience
    for audience, labels in ratings_by_audience.items()
    for label in labels
}

# Labels missing from the lookup (including NaN) fall into the 'Unknown' bucket.
df['rating_group'] = df['rating'].map(rating_map).fillna('Unknown')

# One indicator column per bucket, named `rating_<bucket>`.
df = pd.get_dummies(df, columns=['rating_group'], prefix='rating')

df.head()

In [0]:
# Columns that are not used as features from here on (several were already
# expanded into indicator columns in earlier cells).
columns_to_drop = [
    'show_id', 'date_added', 'description', 'cast',
    'director', 'listed_in', 'rating', 'country',
]
df = df.drop(columns=columns_to_drop)

# Encode the title type as 0/1; values outside the lookup become NaN.
type_codes = {'TV Show': 0, 'Movie': 1}
df['type'] = df['type'].map(type_codes)

# Remove the " min" suffix so the remaining duration text is a bare number.
df['duration'] = df['duration'].str.replace(' min', '', regex=False)

def convert_season_to_minutes(value, minutes_per_season=450):
    """Convert a duration entry to a number of minutes.

    Parameters
    ----------
    value : str, number or missing
        Either a season count such as ``"2 Seasons"`` or a bare minute count
        such as ``"90"``. Missing values (None / NaN) are accepted.
    minutes_per_season : int, default 450
        Minutes credited per season. The default keeps the original constant
        (annotated "10x45" in the original code, presumably ten 45-minute
        episodes - confirm).

    Returns
    -------
    int or None
        Minutes, or None when the value is missing or a "Season" entry
        contains no number.

    Raises
    ------
    ValueError
        If a non-season string cannot be parsed as an integer.
    """
    # Missing durations would otherwise fail on the `in` test below.
    if pd.isna(value):
        return None

    # Already numeric (e.g. the column was converted before): just coerce.
    if not isinstance(value, str):
        return int(value)

    if 'Season' in value:
        match = re.search(r'\d+', value)
        return int(match.group()) * minutes_per_season if match else None

    return int(value)

# Replace each duration entry with the value returned by convert_season_to_minutes.
df['duration'] = df['duration'].apply(convert_season_to_minutes)

In [0]:

# Bring every narrow integer column, and every boolean column, to int32.
# `bool` is included because pd.get_dummies returns boolean indicators in
# pandas 2.x (uint8 in older releases); leaving them as bool would give the
# feature frame mixed bool/int dtypes.
narrow_columns = df.select_dtypes(
    include=['bool', 'int8', 'int16', 'uint8', 'uint16']
).columns
df = df.astype({col: 'int32' for col in narrow_columns})


display(df)


In [0]:
# Plain-text dump of the frame.
# NOTE(review): the previous cell already shows the same frame via display(df).
print(df)

In [0]:
# Feature matrix: every column except the title. 'cluster' is excluded as well
# so that re-running this cell after the labels were stored on `df` does not
# feed the previous labels back in as a feature. Casting to float gives the
# estimators a homogeneous numeric array even if bool/int columns are mixed.
# NOTE(review): features are not scaled, so large-valued columns such as
# `duration` dominate the Euclidean distances - confirm this is intended.
X = df.drop(columns=['title', 'cluster'], errors='ignore').to_numpy(dtype=float)
titles = df['title']

# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# random_state makes the projection reproducible when PCA selects its
# randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis')

# Label every 100th point only, to keep the figure readable.
for i in range(0, len(titles), 100):
    ax.text(X_pca[i, 0], X_pca[i, 1], titles.iloc[i], fontsize=8)

ax.set_title("Clusters de filmes (com PCA 2D)")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
fig.colorbar(scatter, ax=ax, label='Cluster')
plt.show()


In [0]:
# Store the cluster label of every title on the frame.
df['cluster'] = clusters

# Iterate over the labels actually present (ascending) instead of a hard-coded
# range, so this cell stays correct if the number of clusters is changed.
for cluster_id, cluster_titles in df.groupby('cluster')['title']:
    print(f"Cluster {cluster_id}:")
    print(cluster_titles.head(10).tolist())  # first ten titles as a sample
    print()
