In [86]:
import pandas as pd
import numpy as np 
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

LOADING DATASET

In [87]:
# Read the workbook into a DataFrame, using its first column as the row index.
df = pd.read_excel(
    io='Books Distribution Expenses.xlsx',
    index_col=0,
)

INSPECTING

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Report the frame's dimensions: df.shape[0] is the row count, df.shape[1] the column count.
print(f' This dataset contains {df.shape[0]} rows/records and {df.shape[1]} columns/features')

In [None]:
# Summarise the columns (dtypes and non-null counts) before any cleaning is applied.
df.info()

PREPROCESSING AND CLEANING

In [91]:
# Remove every record whose 'category' is missing; the frame is modified in place.
df.dropna(axis='index', subset=['category'], inplace=True)

In [None]:
# Convert 'published_date' to datetimes. With errors='coerce', values that
# cannot be parsed become NaT instead of raising.
df['published_date'] = pd.to_datetime(
    df['published_date'],
    errors='coerce',
)
print(df['published_date'].head())

DESCRIPTIVE STATS

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Object-typed columns, minus 'title', are treated as the categorical variables.
cat_vars = (
    df.select_dtypes(include=['object'])
    .columns
    .drop('title')
)
# Columns stored as float64 or int64 are treated as the numerical variables.
num_vars = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
)

print(f' Categorical Variables: {cat_vars}')
print(f' Numerical Variables: {num_vars}')

DATA VIZ

In [95]:
def plot_bar(df, column, top_n=20):
    """Show an interactive bar chart of the most frequent values in a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column whose values are counted.
    top_n : int, default 20
        Number of most frequent values to display. The default keeps the
        limit that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``; nothing is returned.
    """
    # Count each distinct value and keep only the `top_n` largest counts.
    top_modes = df[column].value_counts().nlargest(top_n).reset_index()
    # Name the two columns explicitly rather than relying on whatever labels
    # reset_index() assigns.
    top_modes.columns = [column, 'count']
    fig = px.bar(top_modes, x=column, y='count', title=f'Distribution of {column}', text='count')
    # Print each bar's count just above the bar.
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    fig.update_layout(xaxis_title=column, yaxis_title='Frequency')
    fig.show()


In [None]:
# Draw one frequency bar chart per selected column.
var_to_plot = ['publisher', 'category', 'distribution_expense']
for column_name in var_to_plot:
    plot_bar(df, column_name)

MAIN TOPICS

In [97]:
from wordcloud import WordCloud

def plot_wordcloud(df, column):
    """Draw, save and display a word cloud built from one column's text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column supplying the text. Missing values are skipped
        and the remaining values are converted to ``str``.

    Returns
    -------
    None
        The image is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If the column holds no text at all (only missing values or
        whitespace), since there is nothing to draw.

    Notes
    -----
    The figure is also written to ``wordcloud_<column>.png`` (a relative
    path) before it is shown.
    """
    # Join the cell values once into a single text. The earlier version split
    # every cell into a rows x max-words table, flattened it, and then appended
    # a second, comma-joined copy of the very same tokens, which only counted
    # every word twice.
    combined_words = ' '.join(df[column].dropna().astype(str))
    # Fail early with a message that names the column instead of letting the
    # word-cloud generator fail on empty input.
    if not combined_words.strip():
        raise ValueError(f"Column '{column}' contains no text to build a word cloud from")
    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(combined_words)
    # Plot the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud)
    plt.axis('off')
    # Save before show(): the figure must still be current when it is written.
    plt.savefig(f'wordcloud_{column}.png')
    plt.show()

In [None]:
# Word cloud of the words found in the 'title' column.
plot_wordcloud(df, 'title')


In [None]:
# Word cloud of the words found in the 'subtitle' column.
plot_wordcloud(df, 'subtitle')

In [None]:
# Word cloud of the words found in the 'publisher' column.
plot_wordcloud(df, 'publisher')

In [None]:
# Word cloud of the words found in the 'authors' column.
plot_wordcloud(df,'authors')

In [None]:
# Bar chart of the most frequent values in 'published_year'.
# NOTE(review): no visible cell creates 'published_year' — presumably it is a
# column of the spreadsheet itself; confirm, otherwise derive it from
# 'published_date' before this call.
plot_bar(df, 'published_year')