In [4]:
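# Aggregate votes per literary genre from books_corrected.csv and plot the distribution of all genres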

import pandas as pd
import numpy as np
import plotly.express as px

# Sum the votes per literary genre across the whole CSV (read in chunks),
# save the totals to "aggregated_genres_votes.csv" and plot them as a bar chart.

# Rows read per chunk; lower this value if memory is tight
chunk_size = 10000

# Per-chunk genre totals. They are collected in a list and concatenated once
# after the loop, instead of re-concatenating a growing DataFrame on every
# iteration (which copies all previous rows each time).
chunk_totals = []

for chunk in pd.read_csv(
    "books_corrected.csv", delimiter=",", low_memory=False, chunksize=chunk_size
):
    # Normalise every cell to a string and replace hyphens with spaces
    genre_strings = chunk['genre_and_votes'].astype(str).str.replace('-', ' ', regex=False)

    # Genres and votes extracted from this chunk
    genres_list = []
    votes_list = []

    # Each cell is a comma-separated list of entries; an entry is kept only
    # when its last space-separated token is a vote count.
    for genre_string in genre_strings:
        for entry in genre_string.split(','):
            genre, separator, count = entry.rpartition(' ')
            # isdecimal() (not isdigit()) so that characters such as
            # superscript digits, which int() rejects, are skipped instead of
            # raising ValueError and aborting the whole run.
            if not separator or not count.isdecimal():
                continue
            votes = int(count)
            if votes >= 1:
                genres_list.append(genre.strip())
                votes_list.append(votes)

    if not genres_list:
        # Nothing parseable in this chunk; skip it so no empty frame is concatenated
        continue

    # Aggregate the results of the chunk
    temp_df = pd.DataFrame({'genre': genres_list, 'votes': votes_list})
    chunk_totals.append(temp_df.groupby('genre')['votes'].sum().reset_index())

# Combine the per-chunk totals; fall back to a typed empty frame so the
# aggregation below still works when no genre was found at all.
if chunk_totals:
    aggregated_data = pd.concat(chunk_totals, ignore_index=True)
else:
    aggregated_data = pd.DataFrame({
        'genre': pd.Series(dtype='object'),
        'votes': pd.Series(dtype='int64'),
    })

# Aggregate the final results (a genre can appear in several chunks)
final_aggregated_data = aggregated_data.groupby('genre')['votes'].sum().reset_index()

# Save the aggregated totals to a CSV file
final_aggregated_data.to_csv("aggregated_genres_votes.csv", index=False)

# Plot from the in-memory totals rather than re-reading the file just written:
# the round trip is redundant and read_csv would turn genre labels such as
# "NA" or "null" into NaN.
df = final_aggregated_data

# Create an interactive plot with plotly
fig = px.bar(
    df,
    x='genre',
    y='votes',
    title="Distribution of Literary Genres",
    labels={'votes': 'Number of Votes', 'genre': 'Literary Genres'},
)

# Show the plot in the browser
fig.show(renderer="browser")

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

# Sum the votes per literary genre across the whole CSV (read in chunks),
# keep the 5 genres with the most votes, save them to "top_5_genres_votes.csv"
# and plot them as a bar chart.

# Rows read per chunk; lower this value if memory is tight
chunk_size = 10000

# Per-chunk genre totals. They are collected in a list and concatenated once
# after the loop, instead of re-concatenating a growing DataFrame on every
# iteration (which copies all previous rows each time).
chunk_totals = []

for chunk in pd.read_csv(
    "books_corrected.csv", delimiter=",", low_memory=False, chunksize=chunk_size
):
    # Normalise every cell to a string and replace hyphens with spaces
    genre_strings = chunk['genre_and_votes'].astype(str).str.replace('-', ' ', regex=False)

    # Genres and votes extracted from this chunk
    genres_list = []
    votes_list = []

    # Each cell is a comma-separated list of entries; an entry is kept only
    # when its last space-separated token is a vote count.
    for genre_string in genre_strings:
        for entry in genre_string.split(','):
            genre, separator, count = entry.rpartition(' ')
            # isdecimal() (not isdigit()) so that characters such as
            # superscript digits, which int() rejects, are skipped instead of
            # raising ValueError and aborting the whole run.
            if not separator or not count.isdecimal():
                continue
            votes = int(count)
            if votes >= 1:
                genres_list.append(genre.strip())
                votes_list.append(votes)

    if not genres_list:
        # Nothing parseable in this chunk; skip it so no empty frame is concatenated
        continue

    # Aggregate the results of the chunk
    temp_df = pd.DataFrame({'genre': genres_list, 'votes': votes_list})
    chunk_totals.append(temp_df.groupby('genre')['votes'].sum().reset_index())

# Combine the per-chunk totals; fall back to a typed empty frame so the
# aggregation and nlargest() below still work when no genre was found at all
# (nlargest refuses object-dtype columns).
if chunk_totals:
    aggregated_data = pd.concat(chunk_totals, ignore_index=True)
else:
    aggregated_data = pd.DataFrame({
        'genre': pd.Series(dtype='object'),
        'votes': pd.Series(dtype='int64'),
    })

# Aggregate the final results (a genre can appear in several chunks)
final_aggregated_data = aggregated_data.groupby('genre')['votes'].sum().reset_index()

# Filter the top 5 genres with the most votes
top_5_genres = final_aggregated_data.nlargest(5, 'votes')

# Save the top 5 genres to a CSV file
top_5_genres.to_csv("top_5_genres_votes.csv", index=False)

# Plot from the in-memory result rather than re-reading the file just written:
# the round trip is redundant and read_csv would turn genre labels such as
# "NA" or "null" into NaN. The index is reset to match the fresh 0..n-1 index
# a re-read would have produced.
df = top_5_genres.reset_index(drop=True)

# Create an interactive plot with plotly
fig = px.bar(
    df,
    x='genre',
    y='votes',
    title="Top 5 Literary Genres by Votes",
    labels={'votes': 'Number of Votes', 'genre': 'Literary Genres'},
)

# Show the plot in the browser
fig.show(renderer="browser")