In [1]:
# Distribution of Literary Genres with their co-genres

import pandas as pd
import numpy as np
import plotly.express as px

# Stream the books CSV once and build two aggregates:
#   * aggregated_data - one (genre, votes) record per parsed genre entry
#   * co_occurrence   - co_occurrence[a][b] counts how often genres a and b
#                       were parsed from the same row
chunk_frames = []  # per-chunk (genre, votes) frames, concatenated once after the loop
co_occurrence = {}

# Read the CSV file in chunks
chunk_size = 10000  # Adjust this value based on your available memory
for chunk in pd.read_csv(
    "../new_data/books_corrected.csv",
    delimiter=",",
    low_memory=False,
    chunksize=chunk_size,
    usecols=['genre_and_votes'],  # the only column this cell reads; skips loading the rest
):
    # Normalise the column to strings (missing values become the literal 'nan',
    # which has no trailing vote count and is therefore skipped below) and
    # replace hyphens with spaces.
    # NOTE(review): replacing '-' also removes a minus sign in front of a vote
    # count, so an entry such as "X -1" is parsed as 1 vote - confirm the data
    # never contains negative counts.
    genre_strings = (
        chunk['genre_and_votes']
        .astype(str)
        .str.replace('-', ' ', regex=False)  # literal replacement on every pandas version
    )

    # Genres and votes extracted from this chunk
    genres_list = []
    votes_list = []

    # Each row looks like "<genre> <votes>,<genre> <votes>,..."
    for genre_string in genre_strings:
        genres_in_row = []
        for entry in genre_string.split(','):
            # The vote count is the last space-separated token of the entry
            parts = entry.rsplit(' ', 1)
            # isdecimal() (unlike isdigit()) only accepts characters int() can parse
            if len(parts) == 2 and parts[1].isdecimal():
                genre = parts[0].strip()
                votes = int(parts[1])
                if votes >= 1:
                    genres_in_row.append(genre)
                    genres_list.append(genre)
                    votes_list.append(votes)

        # Update the co-occurrence counts for every ordered pair of distinct
        # genres found on this row (so the resulting matrix is symmetric)
        for genre in genres_in_row:
            co_counts = co_occurrence.setdefault(genre, {})
            for co_genre in genres_in_row:
                if co_genre != genre:
                    co_counts[co_genre] = co_counts.get(co_genre, 0) + 1

    chunk_frames.append(pd.DataFrame({'genre': genres_list, 'votes': votes_list}))

# Concatenate once instead of growing a DataFrame inside the loop
if chunk_frames:
    aggregated_data = pd.concat(chunk_frames, ignore_index=True)
else:
    aggregated_data = pd.DataFrame(columns=['genre', 'votes'])

# Aggregate the final results: total votes per genre
final_aggregated_data = aggregated_data.groupby('genre')['votes'].sum().reset_index()

# Save the per-genre vote totals to a CSV file
final_aggregated_data.to_csv("../new_data/aggregated_genres_votes.csv", index=False)

# Create a DataFrame for the co-occurrence matrix (outer dict keys become the
# columns, inner keys the index; pairs never seen together are filled with 0)
co_occurrence_df = pd.DataFrame(co_occurrence).fillna(0).astype(int)

# Reshape to long format (one row per genre pair) and keep only the pairs
# that actually co-occur
co_occurrence_long = co_occurrence_df.reset_index().melt(
    id_vars='index', var_name='co_genre', value_name='count'
)
co_occurrence_long = co_occurrence_long[co_occurrence_long['count'] > 0]

# Create a stacked bar chart with plotly
fig = px.bar(
    co_occurrence_long,
    x='index',
    y='count',
    color='co_genre',
    title="Distribution of literary genres with their co-genres",
    labels={'index': 'Genre', 'count': 'Number of co-genres', 'co_genre': 'Co-genre'},
)

# Show the chart in the browser
fig.show(renderer="browser")

In [2]:
# Distribution of the 5 literary genres with the 5 most represented co-genres

import pandas as pd
import numpy as np
import plotly.express as px

# This cell is self-contained: it re-reads the CSV and rebuilds both aggregates
# before selecting the top genres, so it does not depend on any other cell.
#   * aggregated_data - one (genre, votes) record per parsed genre entry
#   * co_occurrence   - co_occurrence[a][b] counts how often genres a and b
#                       were parsed from the same row
chunk_frames = []  # per-chunk (genre, votes) frames, concatenated once after the loop
co_occurrence = {}

# Read the CSV file in chunks
chunk_size = 10000  # Adjust this value based on your available memory
for chunk in pd.read_csv(
    "../new_data/books_corrected.csv",
    delimiter=",",
    low_memory=False,
    chunksize=chunk_size,
    usecols=['genre_and_votes'],  # the only column this cell reads; skips loading the rest
):
    # Normalise the column to strings (missing values become the literal 'nan',
    # which has no trailing vote count and is therefore skipped below) and
    # replace hyphens with spaces.
    # NOTE(review): replacing '-' also removes a minus sign in front of a vote
    # count, so an entry such as "X -1" is parsed as 1 vote - confirm the data
    # never contains negative counts.
    genre_strings = (
        chunk['genre_and_votes']
        .astype(str)
        .str.replace('-', ' ', regex=False)  # literal replacement on every pandas version
    )

    # Genres and votes extracted from this chunk
    genres_list = []
    votes_list = []

    # Each row looks like "<genre> <votes>,<genre> <votes>,..."
    for genre_string in genre_strings:
        genres_in_row = []
        for entry in genre_string.split(','):
            # The vote count is the last space-separated token of the entry
            parts = entry.rsplit(' ', 1)
            # isdecimal() (unlike isdigit()) only accepts characters int() can parse
            if len(parts) == 2 and parts[1].isdecimal():
                genre = parts[0].strip()
                votes = int(parts[1])
                if votes >= 1:
                    genres_in_row.append(genre)
                    genres_list.append(genre)
                    votes_list.append(votes)

        # Update the co-occurrence counts for every ordered pair of distinct
        # genres found on this row (so the resulting matrix is symmetric)
        for genre in genres_in_row:
            co_counts = co_occurrence.setdefault(genre, {})
            for co_genre in genres_in_row:
                if co_genre != genre:
                    co_counts[co_genre] = co_counts.get(co_genre, 0) + 1

    chunk_frames.append(pd.DataFrame({'genre': genres_list, 'votes': votes_list}))

# Concatenate once instead of growing a DataFrame inside the loop
if chunk_frames:
    aggregated_data = pd.concat(chunk_frames, ignore_index=True)
else:
    aggregated_data = pd.DataFrame(columns=['genre', 'votes'])

# Aggregate the final results: total votes per genre
final_aggregated_data = aggregated_data.groupby('genre')['votes'].sum().reset_index()

# Save the per-genre vote totals to a CSV file
final_aggregated_data.to_csv("../new_data/aggregated_genres_votes.csv", index=False)

# Create a DataFrame for the co-occurrence matrix (outer dict keys become the
# columns, inner keys the index; pairs never seen together are filled with 0)
co_occurrence_df = pd.DataFrame(co_occurrence).fillna(0).astype(int)

# Filter the 5 genres with the highest total co-occurrence count
top_genres = co_occurrence_df.sum(axis=1).nlargest(5).index

# Filter the co-occurrence matrix to include only those genres
top_co_occurrence_df = co_occurrence_df.loc[top_genres, :]

# For each genre, select the 5 most represented co-genres and collect them as
# long-format rows (co_genre, count, genre)
top_frames = []
for genre in top_genres:
    top_frames.append(
        top_co_occurrence_df.loc[genre]
        .nlargest(5)
        .rename_axis('co_genre')
        .reset_index(name='count')
        .assign(genre=genre)
    )

# Concatenate once; fall back to an empty, correctly-labelled frame when no
# genre was found
if top_frames:
    top_co_occurrence_long = pd.concat(top_frames, ignore_index=True)
else:
    top_co_occurrence_long = pd.DataFrame(columns=['co_genre', 'count', 'genre'])

# Create a stacked bar chart with plotly
fig = px.bar(
    top_co_occurrence_long,
    x='genre',
    y='count',
    color='co_genre',
    title="Distribution of the 5 literary genres with the 5 most represented co-genres",
    labels={'genre': 'Genre', 'count': 'Number of co-genres', 'co_genre': 'Co-genre'},
)

# Show the chart in the browser
fig.show(renderer="browser")