In [1]:
# Evolution des genres dans le temps
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt

In [2]:
# Load movies data
data_folder = '../data/MovieSummaries'
df_metadata = pd.read_csv(data_folder + '/movies_metadata_cleaned_genres.csv')
print(df_metadata.head(1)['Grouped_genres'])

0    ['Action/Adventure', 'Horror', 'Science Fictio...
Name: Grouped_genres, dtype: object


In [3]:
# Drop NAN in
#Print la ligne ou le genre = NAN
df_nan = df_metadata[df_metadata['Movie_release_date'] == 2013.0] 
df_nan = df_nan[['Wikipedia_movie_ID','Grouped_genres']]
df_nan.to_csv("2013.csv")


In [4]:
# By year the top genres

# Expand each genre in the 'Movie_genres' list so each genre has its own row
df_expanded = df_metadata.explode('Grouped_genres')

# Group by 'year' and 'Movie_genres' and count occurrences
genre_distribution = df_expanded.groupby(['Movie_release_date', 'Grouped_genres']).size().reset_index(name="Count")


genre_distribution.to_csv('yo.csv')
genre_distribution

Unnamed: 0,Movie_release_date,Grouped_genres,Count
0,1893.0,"['Short Film', 'Indie', 'Black-and-white', 'Si...",1
1,1894.0,"['Short Film', 'Indie', 'Black-and-white', 'Si...",2
2,1895.0,"['Short Film', 'Black-and-white', 'Documentary...",4
3,1895.0,"['Short Film', 'Comedy', 'Animation', 'Silent ...",1
4,1895.0,"['Short Film', 'Indie', 'Black-and-white', 'Si...",1
...,...,...,...
25303,2013.0,[nan],1
25304,2014.0,['Action/Adventure'],1
25305,2014.0,['Drama'],1
25306,2014.0,"['Fantasy', 'Drama']",1


In [None]:
# Plot the evolution of genres over the years
# Pivot the DataFrame to create a table with years as index and genres as columns
genre_pivot = genre_distribution.pivot(index='Movie_release_date', columns='Grouped_genres', values='Count').fillna(0)

# Plot each genre's evolution over time
plt.figure(figsize=(16, 10))  # Increase figure size
for genre in genre_pivot.columns:
    plt.plot(genre_pivot.index, genre_pivot[genre], label=genre)

# Adding labels and title
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Evolution of Movie Genres Over the Years')
plt.legend(title="Genres", bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust spacing to prevent layout issues
plt.subplots_adjust(left=0.1, right=0.85, top=0.9, bottom=0.1)

plt.show()

In [6]:
# Normalize genres per year
# Expand the genres so each genre has its own row
df_expanded = df.explode('Grouped_genres')

# Group by 'year' and 'Movie_genres' to get the count of each genre per year
genre_distribution = df_expanded.groupby(['Movie_release_date', 'Movie_genres']).size().unstack(fill_value=0)

# Calculate the total number of films per year
total_films_per_year = df.groupby('year').size()

# Normalize by dividing each genre count by the total number of films that year
genre_distribution_normalized = genre_distribution.div(total_films_per_year, axis=0)

# Plot the normalized evolution of genres over the years
plt.figure(figsize=(14, 8))
for genre in genre_distribution_normalized.columns:
    plt.plot(genre_distribution_normalized.index, genre_distribution_normalized[genre], label=genre)

plt.title('Normalized Evolution of Movie Genres Over the Years (by Total Films)')
plt.xlabel('Year')
plt.ylabel('Proportion of Total Films')
plt.legend(title='Genres', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

NameError: name 'df' is not defined

In [None]:
# Step 1: Replace NaNs in 'year' and create a 'decade' column
df = df.dropna(subset=['year'])  # Drop rows with NaN in 'year' if not needed
df['Decade'] = (df['year'] // 10 * 10).astype(int)

# Step 2: Explode the 'Movie_genres' column to have one genre per row
df_expanded = df.explode('Movie_genres')

# Step 3: Count occurrences of each genre by grouping by decade and genre
genre_counts = (
    df_expanded
    .groupby(['Decade', 'Movie_genres'])
    .size()
    .reset_index(name='Count')
)

# Step 4: Identify the top 5 genres per decade
top_genres_per_decade = (
    genre_counts
    .sort_values(['Decade', 'Count'], ascending=[True, False])
    .groupby('Decade')
    .head(5)
)

# Step 5: Pivot the data to get decades as rows, genres as columns, and counts as values
top_genres_pivot = top_genres_per_decade.pivot_table(
    index='Decade',
    columns='Movie_genres',
    values='Count',
    fill_value=0
)

# Normalize by the total number of occurrences per decade to get percentages
top_genres_normalized = top_genres_pivot.div(top_genres_pivot.sum(axis=1), axis=0) * 100

# Plot the data as a stacked bar chart
plt.figure(figsize=(14, 8))
top_genres_normalized.plot(kind='bar', stacked=True, colormap="tab20", width=0.8, ax=plt.gca())

plt.title('Top 5 Movie Genres by Percentage Over Decades')
plt.xlabel('Decade')
plt.ylabel('Percentage of Total Genres (%)')
plt.legend(title='Genres', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
# Step 1: Replace NaNs in 'year' and create a 'decade' column
df = df.dropna(subset=['year'])  # Drop rows with NaN in 'year' if not needed
df['Decade'] = (df['year'] // 10 * 10).astype(int)

# Step 2: Explode the 'Movie_genres' column to have one genre per row
df_expanded = df.explode('Movie_genres')

# Step 3: Count occurrences of each genre by grouping by decade and genre
genre_counts = (
    df_expanded
    .groupby(['Decade', 'Movie_genres'])
    .size()
    .reset_index(name='Count')
)

# Step 4: Identify the top 5 genres per decade
top_genres_per_decade = (
    genre_counts
    .sort_values(['Decade', 'Count'], ascending=[True, False])
    .groupby('Decade')
    .head(10)
)

# Step 5: Pivot the data to get decades as rows, genres as columns, and counts as values
top_genres_pivot = top_genres_per_decade.pivot_table(
    index='Decade',
    columns='Movie_genres',
    values='Count',
    fill_value=0
)

# Normalize by the total number of occurrences per decade to get percentages
top_genres_normalized = top_genres_pivot.div(top_genres_pivot.sum(axis=1), axis=0) * 100

# Plot the data as a stacked bar chart
plt.figure(figsize=(14, 8))
top_genres_normalized.plot(kind='bar', stacked=True, colormap="tab20", width=0.8, ax=plt.gca())

plt.title('Top 10 Movie Genres by Percentage Over Decades')
plt.xlabel('Decade')
plt.ylabel('Percentage of Total Genres (%)')
plt.legend(title='Genres', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
# Step 1: Replace NaNs in 'year' and create a 'decade' column
df = df.dropna(subset=['year'])  # Drop rows with NaN in 'year' if not needed
df['Decade'] = (df['year'] // 10 * 10).astype(int)

# Step 2: Explode the 'Movie_genres' column to have one genre per row
df_expanded = df.explode('Movie_genres')

# Step 3: Count occurrences of each genre by grouping by decade and genre
genre_counts = (
    df_expanded
    .groupby(['Decade', 'Movie_genres'])
    .size()
    .reset_index(name='Count')
)

# Step 4: Identify the top 5 genres per decade
top_genres_per_decade = (
    genre_counts
    .sort_values(['Decade', 'Count'], ascending=[True, False])
    .groupby('Decade')
    .head(15)
)

# Step 5: Pivot the data to get decades as rows, genres as columns, and counts as values
top_genres_pivot = top_genres_per_decade.pivot_table(
    index='Decade',
    columns='Movie_genres',
    values='Count',
    fill_value=0
)

# Normalize by the total number of occurrences per decade to get percentages
top_genres_normalized = top_genres_pivot.div(top_genres_pivot.sum(axis=1), axis=0) * 100

# Plot the data as a stacked bar chart
plt.figure(figsize=(14, 8))
top_genres_normalized.plot(kind='bar', stacked=True, colormap="tab20", width=0.8, ax=plt.gca())

plt.title('Top 15 Movie Genres by Percentage Over Decades')
plt.xlabel('Decade')
plt.ylabel('Percentage of Total Genres (%)')
plt.legend(title='Genres', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
# Explode the 'Movie_genres' column to have one genre per row
df_expanded = df.explode('Movie_genres').dropna()

# Get a list of all unique genres
genres = df_expanded['Movie_genres'].unique().tolist()

genres = sorted(genres)

# Print the list of all genres
print("List of all unique genres:")
print(genres)
print(len(genres))

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
# Define threshold for similarity
similarity_threshold = 80

# Dictionary to store grouped genres
grouped_genres = {}

# Loop through each genre and group similar genres
for genre in genres:
    # Find the best match in current groups
    found_group = False
    for key in grouped_genres.keys():
        if fuzz.token_set_ratio(genre, key) >= similarity_threshold:
            grouped_genres[key].append(genre)
            found_group = True
            break
    # If no similar group is found, create a new group
    if not found_group:
        grouped_genres[genre] = [genre]


grouped_df = pd.DataFrame([(key, ', '.join(values)) for key, values in grouped_genres.items()], 
                          columns=['Representative Genre', 'Grouped Genres'])

# Display the grouped genres
print("Grouped Genres:")


# Display the DataFrame
grouped_df

In [None]:
# Extracting the list of Representative Genres
representative_genres = grouped_df['Representative Genre'].tolist()

# Extracting the list of Grouped Genres
# Split each string in the 'Grouped Genres' column by commas and strip whitespace
grouped_genres_list = set()  # Use a set to avoid duplicates
for genres in grouped_df['Grouped Genres']:
    for genre in genres.split(','):
        grouped_genres_list.add(genre.strip())

# Converting the grouped_genres_list to a sorted list (optional)
grouped_genres_list = sorted(grouped_genres_list)

# Display results
print("Representative Genres:", representative_genres)
print("Grouped Genres:", grouped_genres_list)

In [None]:
if set(grouped_genres_list) == set(genres):
    print("Same list")
else:
    # Find the differences
    only_in_list1 = set(grouped_genres_list) - set(genres)
    only_in_list2 = set(genres) - set(grouped_genres_list)
    
    print("The two lists contain different strings.")
    print("Strings only in grouped :", only_in_list1)
    print("Strings only in genres:", only_in_list2)


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

# Load pre-trained SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Lightweight SBERT model

# Generate embeddings for each genre
genre_embeddings = model.encode(genres)

#Cluster embeddings using DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=1, metric='cosine')  # Adjust `eps` based on how closely you want to cluster
labels = dbscan.fit_predict(genre_embeddings)

# Organize results into a DataFrame
clustered_genres = pd.DataFrame({'Genre': genres, 'Cluster': labels})
grouped_clusters = clustered_genres.groupby('Cluster')['Genre'].apply(list)
print("Grouped Genres by Semantic Cluster:")
print(grouped_clusters)
