In [None]:
import pandas as pd
import json
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns

In [None]:
# Path to the raw metadata file in JSON Lines format (one JSON object per line).
# Kept relative, matching the path passed to pd.read_json further down, so the
# notebook does not depend on one user's home directory.
literal_path = "meta_Software.jsonl"

# Parse the file line by line as a sanity check that every record is valid JSON.
# `sft_ware` holds the most recently parsed record (None if the file is empty).
sft_ware = None
with open(literal_path, encoding="utf-8") as j:
    for line in j:
        if line.strip():  # json.loads raises on blank lines (e.g. a trailing newline)
            sft_ware = json.loads(line)

### load the dataset

In [None]:
# Read the JSON Lines file (lines=True: one JSON record per line) into a DataFrame.
df = pd.read_json('meta_Software.jsonl', lines=True) 
# Show the first row as a quick preview of the columns.
df.head(1)

### Getting to know more about the data

In [None]:
# (number of rows, number of columns) of the raw frame
df.shape 

In [None]:
# Column names, dtypes and non-null counts — used to spot columns with missing data
df.info()

## Data Cleaning

### Drop rows where 'average_rating' is missing

In [None]:
# Keep only rows that have an average rating; rows with NaN in that column are removed.
df = df.dropna(subset=['average_rating'])

### Cross-checking that no missing values remain in 'average_rating' after dropping them

In [None]:
# Number of missing 'average_rating' values left (0 if the drop above worked)
print(df['average_rating'].isnull().sum())

### Drop rows with empty category in the 'main_category' column

I decided to drop rows with an empty main category instead of filling them with "Unknown": products that do not belong to any category do not help my research in any way, so it is better to drop them.

In [None]:
# Remove rows whose 'main_category' is missing.
df = df.dropna(subset=['main_category'])

### Cross-checking that no missing values remain in the 'main_category' column after dropping empty rows

In [None]:
# Number of missing 'main_category' values left (0 if the drop above worked)
print(df['main_category'].isnull().sum())

### Fill in missing rating numbers with 0

In [None]:
# Rebind df to an independent copy before assigning to its columns below
# (presumably to avoid pandas' SettingWithCopyWarning after the row filtering — confirm).
df = df.copy()

In [None]:
# Replace missing rating counts with 0.
df['rating_number'] = df['rating_number'].fillna(0)

### Cross-checking that no missing values remain in 'rating_number' after replacing missing values with 0

In [None]:
# Number of missing 'rating_number' values left (0 after the fill above)
print(df['rating_number'].isnull().sum())

### Drop columns 'bought_together', 'subtitle', 'author', 'price', 'store'. They all have missing data and are not useful for my analysis

In [None]:
# Columns that are not needed for the rating analysis.
# A list (rather than a set) gives the labels a fixed, readable order.
columns_to_drop = ['bought_together', 'subtitle', 'author', 'price', 'store']

# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

# Preview the first row to confirm the columns were removed.
df.head(1)

### cross checking the data to see if it is clean

In [None]:
# Missing-value count for every remaining column
df.isnull().sum()

In [None]:
# Keep only the rows whose category has more than 10 ratings in total
# (summed over every row of that category).
def _has_enough_ratings(category_rows):
    """Return True when the group's rating numbers add up to more than 10."""
    return category_rows['rating_number'].sum() > 10


filtered_data = df.groupby('main_category').filter(_has_enough_ratings)

In [None]:
# Number of rows per main category in the cleaned (unfiltered) frame, most frequent first
main_category_counts = df['main_category'].value_counts()
# Show the five largest categories.
main_category_counts.head()

### count the number of main category

### check out content in each category

In [None]:
# Display the 'main_category' column to inspect the category value of each row
df['main_category']

Rating number is the total number of ratings that a product has received, while average rating is the mean score that customers have given to a product, calculated from individual customer reviews. Usually, it is on a scale of 0 to 5 stars.

### laying foundation and drawing the bar chart

In [None]:
# Total number of ratings received by each category, largest first.
# `.sum()` adds up the per-row `rating_number` values; `.size()` would only count
# how many rows each category has, which is not what the chart's title and
# axis label ("Total Ratings") describe.
category_ratings = (
    filtered_data.groupby('main_category')['rating_number']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# Plotting with Plotly: one bar per category, shaded by its total
fig = px.bar(category_ratings, x='main_category', y='rating_number', 
             title="Total Ratings by Category",
             labels={'main_category': 'Category', 'rating_number': 'Total Ratings'},
             color='rating_number',
             color_continuous_scale='Blues')

# Order bars from largest to smallest total and tilt the category labels
fig.update_layout(xaxis={'categoryorder': 'total descending'}, xaxis_tickangle=-45, font=dict(
        family='Roboto, sans-serif', size=14))

fig.show()

The above bar chart shows that the "Appstore for Android" category has more ratings than all the other categories. Treating the number of ratings as a rough proxy for sales, we can also infer that more products were sold in the "Appstore for Android" category than in any other category.

### setting up and drawing the Box plot 

In [None]:
# Amazon-inspired palette: one fixed colour per category name
custom_colors = {
    "Appstore for Android": "#FF9900",
    "Software": "#146EB4",
    "Gift Cards": "#404040",
    "Computers": "#9EC8E3",
    "Home Audio & Theatre": "#232F3E",
    "Books": "#00A368",
    "AMAZON FASHION": "#CCCCCC",
    "Toys & Games": "#6A1B9A",
}

# Arguments shared by both box plots: same data, same x axis, same colouring
box_common = dict(
    data_frame=filtered_data,
    x="main_category",
    color="main_category",
    color_discrete_map=custom_colors,
)

# fig1: distribution of the average rating within each category
fig1 = px.box(
    y="average_rating",
    title="Category by Average Rating",
    labels={"main_category": "Category", "average_rating": "Average Rating"},
    **box_common,
)

# fig2: distribution of the rating number within each category
fig2 = px.box(
    y="rating_number",
    title="Category by Rating Number",
    labels={"main_category": "Category", "rating_number": "Rating Number"},
    **box_common,
)

# fig2's y axis is drawn on a log scale
fig2.update_yaxes(type="log", title="Rating Number (Log Scale)")

# Same font for both figures
box_font = dict(family='Roboto, sans-serif', size=14)
for box_fig in (fig1, fig2):
    box_fig.update_layout(font=box_font)

# Render both figures
fig1.show()
fig2.show()

Fig1 above shows that gift cards have the highest average rating, with a median rating of 4.45 and a lowest rating of 4.1, while fig2 shows that the "Appstore for Android" category has the highest rating numbers.

### Grouped histogram showing Count of Category by Average Rating and Count of Category by Rating Number

Grouping average rating into low, medium and high order

In [None]:
# Bin edges and labels for the average rating.
# With include_lowest=True (and pd.cut's default right-closed intervals) the groups are
# [0, 2] -> 'Low', (2, 4] -> 'Medium', (4, 5] -> 'High'.
bins = [0, 2, 4, 5]
labels = ['Low', 'Medium', 'High']
# Adds a categorical 'rating_order' column to filtered_data (modifies the frame in place).
filtered_data['rating_order'] = pd.cut(filtered_data['average_rating'], bins=bins, labels=labels, include_lowest=True)

Grouping rating number into few, moderate, high and very high

In [None]:
# Bin edges and labels for rating number:
# [0, 10] -> 'Few', (10, 50] -> 'Moderate', (50, 100] -> 'High', above 100 -> 'Very High'.
# The top bin is open-ended (infinity) rather than ending at df['rating_number'].max():
# every value above 100 still lands in 'Very High', but the edges stay strictly
# increasing even if the largest rating number is 100 or less (pd.cut raises
# ValueError when the edges are not strictly increasing).
bins = [0, 10, 50, 100, float('inf')]
labels = ['Few', 'Moderate', 'High', 'Very High']  # grouped based on range
# Adds a categorical 'rating_number_group' column to filtered_data (modifies the frame in place).
filtered_data['rating_number_group'] = pd.cut(filtered_data['rating_number'], bins=bins, labels=labels, include_lowest=True)

Give each label order colors and draw the histogram

In [None]:
# Amazon-inspired colours, keyed by the group labels used for colouring
amazon_colors = {
    'Low': '#FF9900',           # Orange
    'Medium': '#146EB4',        # Blue
    'High': '#00A368',          # Green
    'Few': '#FF9900',           # Orange
    'Moderate': '#146EB4',      # Blue
    'High (Group)': '#00A368',  # Green
    'Very High': '#404040',     # Dark Gray
}

# Arguments shared by both histograms: rows counted per category, bars side by side
hist_common = dict(
    data_frame=filtered_data,
    x="main_category",
    barmode='group',
    color_discrete_map=amazon_colors,
)

# fig3: rows per category, split by average-rating group
fig3 = px.histogram(
    title="Count of Category by Average Rating",
    labels={"main_category": "Category", "count": "Number of Ratings"},
    color="rating_order",
    **hist_common,
)

# fig4: rows per category, split by rating-number group
fig4 = px.histogram(
    title="Count of Category by Rating Number",
    labels={"main_category": "Category", "count": "Number of Ratings"},
    color="rating_number_group",
    **hist_common,
)

for hist_fig in (fig3, fig4):
    # Bar text-label placement and size.
    # NOTE(review): no text is requested for these traces, so this may have no visible effect — confirm.
    hist_fig.update_traces(textposition='outside', textfont_size=12)
    # Font used across the figure
    hist_fig.update_layout(font=dict(family='Roboto, sans-serif', size=14))

# Render both figures
fig3.show()
fig4.show()

The histogram in Fig3 above shows that "Appstore for Android" has higher counts in the average-rating groups than the other categories: 56.891k in the medium rating order, 8,438 in the high rating order, and 3,325 in the low rating order.

Fig4 shows that "Appstore for Android" is more popular than the other categories on the Amazon platform and also has the most low ratings, with a count of 29.355k. It also has a larger number of reviews than any other category.

### Drawing a stacked bar

Counting the number of times each category occurs

In [None]:
# Number of rows per category in filtered_data, most frequent first.
# This rebinds `main_category_counts`, which earlier held the counts for the unfiltered df.
main_category_counts = filtered_data['main_category'].value_counts()
main_category_counts.head()

Group the data by 'main_category' and 'rating_order'. Create a table that shows the count of rating for each category

In [None]:
# Count rows for each (main_category, rating_order) pair, then move rating_order into
# columns: one row per category, one column per rating level, 0 where a pair has no rows.
# observed=True keeps only the rating levels that actually occur in the data.
main_category_counts = filtered_data.groupby(['main_category', 'rating_order'], observed=True).size().unstack(fill_value=0)
main_category_counts

Reorders the main_category_counts according to the rating_order

In [None]:
# Desired left-to-right column order for the rating levels
rating_order = ['Low', 'Medium', 'High']
# reindex (instead of plain [...] selection) orders the columns and also adds any
# level that is absent from the table as an all-zero column; plain selection
# would raise KeyError for a level that never occurs in the data.
main_category_counts = main_category_counts.reindex(columns=rating_order, fill_value=0)
main_category_counts.head()

In [None]:
# Per-category total = sum over the rating-level columns only, so re-running this
# cell does not fold an already existing 'Total' column into the new total.
main_category_counts['Total'] = main_category_counts[rating_order].sum(axis=1)

# Largest categories first
main_category_counts_sorted = main_category_counts.sort_values(by='Total', ascending=False)

# NOTE(review): despite the "top10" name, head() keeps only the first 5 rows — confirm intent.
main_category_counts_sorted_top10 = main_category_counts_sorted.head()
main_category_counts_sorted_top10.head()

In [None]:
# Category labels (the table's index) and the matching per-category totals of the top rows.
# total_rating feeds the hover text of the stacked bar chart built below.
category_names=main_category_counts_sorted_top10.index
total_rating=main_category_counts_sorted_top10['Total']

### Information contained in the hover box. When you hover over the different rating levels, you can see the information contained in each bar

In [None]:
# (rating level, bar colour, hover label) for each stacked segment, bottom to top.
# NOTE(review): these colours differ from the Low/Medium/High mapping in `amazon_colors`
# used by the other charts — confirm that is intended.
rating_info = [
    ('Low', '#00A368', 'Low'),        # Green
    ('Medium', '#FF9900', 'Medium'),  # Orange
    ('High', '#146EB4', 'High')       # Blue
]

traces = []

# Build one bar trace per rating level
for rating_name, color, hover_label in rating_info:
    # Hover line shared by every segment: "<level>: <count>".
    # The bold tag is closed with </b> (the previous "<b/>" was not a valid closing tag).
    level_line = f"<br><b>{hover_label}:</b> %{{y}}<extra></extra>"

    if rating_name == 'High':
        # The 'High' segment additionally shows the category total. Concatenating with the
        # `total_rating` Series yields one template string per bar.
        hovertemplate = "<b>Total Ratings:</b>" + total_rating.astype(str) + level_line
        line = dict(color='black', width=1.5)  # outline only this segment
    else:
        hovertemplate = level_line
        line = dict()

    traces.append(go.Bar(
        x=main_category_counts_sorted_top10.index,
        y=main_category_counts_sorted_top10[rating_name],
        name=rating_name,
        hovertemplate=hovertemplate,
        marker=dict(
            color=color,
            opacity=1,
            line=line
        )
    ))
        

In [None]:
# Layout settings for the stacked bar chart
fig5_layout = dict(
    barmode='stack',                 # stack the rating-level segments on top of each other
    title="User Engagement by Category",
    xaxis_title="Category",
    yaxis_title="Number of Ratings",
    xaxis_tickangle=-45,
    hovermode="x unified",           # one hover box per category listing all segments
    hoverlabel=dict(bgcolor="white", font_size=12, font_family="Roboto"),
    font=dict(family='Roboto, sans-serif', size=14),
)

# Assemble the figure from the per-level bar traces
stacked_bar_fig5 = go.Figure(data=traces)

# update_layout returns the figure, so as the cell's last expression it is also displayed
stacked_bar_fig5.update_layout(**fig5_layout)

The stacked bar chart in fig5 above shows the number of ratings in each order (high, medium and low) for each category.

### Draw the heatmap for the ratings by category

In [None]:
# Amazon-inspired colours for the three rating orders
amazon_colors = dict(Low='#FF9900', Medium='#146EB4', High='#00A368')

# Colour scale handed to the heatmap, ordered Low -> Medium -> High
discrete_color_scale = [amazon_colors[level] for level in ('Low', 'Medium', 'High')]

# One row per category: mean of average_rating and sum of rating_number
heatmap_data = (
    filtered_data
    .groupby('main_category')
    .agg(
        avg_rating=('average_rating', 'mean'),
        num_ratings=('rating_number', 'sum'),
    )
    .reset_index()
)

# Assign each category's mean rating to a Low / Medium / High order
bins = [0, 2, 4, 5]
labels = ['Low', 'Medium', 'High']
heatmap_data['rating_order'] = pd.cut(
    heatmap_data['avg_rating'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Categories as rows, rating orders as columns, number of ratings as cell values
# (rounded to 2 decimals)
heatmap_matrix = heatmap_data.pivot_table(
    index='main_category',
    columns='rating_order',
    values='num_ratings',
    fill_value=0,
    observed=False,
).round(2)

# Draw the heatmap
fig6 = px.imshow(
    heatmap_matrix,
    labels=dict(x="Average Rating", y="Category", color="Number of Ratings"),
    title="Heatmap of Ratings by Category",
    color_continuous_scale=discrete_color_scale,
    text_auto=True,
)

# Hover box: category, rating order and number of ratings on separate lines
heatmap_hover = "<br>".join([
    "<b>Category:</b> %{y}",
    "<b>Rating Order:</b> %{x}",
    "<b>Number of Ratings:</b> %{z}",
]) + "<extra></extra>"

fig6.update_traces(
    texttemplate='%{z}',   # print each cell's value inside the cell
    textfont_size=14,      # font size of the in-cell text
    hovertemplate=heatmap_hover,
)

fig6.update_layout(
    xaxis_title="Average Rating",
    yaxis_title="Main Category",
    font=dict(family='Roboto, sans-serif', size=16),  # global font
    height=800,
    width=1200,
)

fig6.show()


The above heatmap shows that "Appstore for Android" leads in the medium range, which indicates that the category is popular but not highly rated, while a category such as "Gift Cards" consists of highly rated products.


In [None]:
# Amazon-inspired palette
orange, blue, green, dark_gray = '#FF9900', '#146EB4', '#00A368', '#404040'

# Colour for every group label used below
amazon_colors = {
    'Low': orange,
    'Medium': blue,
    'High': green,
    'Few': orange,
    'Moderate': blue,
    'High (Group)': green,
    'Very High': dark_gray,
}

# Row counts per (category, group) pair, one table per grouping column
category_ratings = (
    filtered_data
    .groupby(['main_category', 'rating_order'], observed=False)
    .size()
    .reset_index(name='count')
)
category_rating_numbers = (
    filtered_data
    .groupby(['main_category', 'rating_number_group'], observed=False)
    .size()
    .reset_index(name='count')
)

# Arguments shared by both grouped bar charts
bar_common = dict(
    x="main_category",
    y="count",
    barmode='group',
    text='count',                      # print each bar's count as its label
    color_discrete_map=amazon_colors,
)

# fig7: counts per category split by average-rating group
fig7 = px.bar(
    category_ratings,
    title="Count of Category by Average Rating",
    labels={"main_category": "Category", "count": "Number of Ratings"},
    color="rating_order",
    **bar_common,
)

# fig8: counts per category split by rating-number group
fig8 = px.bar(
    category_rating_numbers,
    title="Count of Category by Rating Number",
    labels={"main_category": "Category", "count": "Number of Ratings"},
    color="rating_number_group",
    **bar_common,
)

for grouped_fig in (fig7, fig8):
    # Put the count labels outside the bars
    grouped_fig.update_traces(textposition='outside', textfont_size=12)
    # Font used across the figure
    grouped_fig.update_layout(font=dict(family='Roboto, sans-serif', size=14))

# Render both figures
fig7.show()
fig8.show()

### Project Milestone 3 Reflection
In Milestone 3, I created a new Jupyter Notebook and added more graphs to better explain my data. 
Since I’m working with Amazon’s data, I incorporated their brand colors throughout the project: Orange (#FF9900), Blue (#146EB4), Green (#00A368), and Dark Gray (#404040). However, since Amazon’s fonts are not publicly available, I used Roboto as an alternative. I set the height of all the graphs to 500 but it made the file too large to be uploaded on github. After removing the height, the file was still big so I saw online that I can save my file without running the output. I hope that works.

I’m still deciding whether to include all the graphs in the final project or only select a few.

In [None]:
# Save fig1 (box plot of average rating) to fig1.html, relative to the working directory
fig1.write_html("fig1.html")

In [None]:
# Save fig2 (box plot of rating number) to fig2.html, relative to the working directory
fig2.write_html("fig2.html")

In [None]:
# Save fig6 (heatmap) to fig6.html, relative to the working directory
fig6.write_html("fig6.html")

In [None]:
# Save fig7 (grouped bars by average-rating group) to fig7.html, relative to the working directory
fig7.write_html("fig7.html")

In [None]:
# Save fig8 (grouped bars by rating-number group) to fig8.html, relative to the working directory
fig8.write_html("fig8.html")