**data visualization for sales analysis**

**installing necessary libraries**

In [None]:
!pip install pandas matplotlib numpy

**IMPORTING LIBRARIES**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

**EDA 1. LOADING EXPLORING STRUCTURE **

In [None]:
# Load the raw sales data from the CSV file in the working directory.
# (pandas is already imported as `pd` in the imports cell, so it is not
# re-imported here.)
df = pd.read_csv("Sales_Data.csv")

# Preview only the first rows; printing the whole frame floods the output.
print("Data Head:")
print(df.head())

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
print("\nData Info:")
df.info()

# Summary statistics of the numeric columns.
print("\nData Summary:")
print(df.describe())

# Count of missing values per column.
print("\nMissing Values:")
print(df.isnull().sum())

**DATA CLEANING AND MANIPULATION**

In [None]:

# Optional: drop rows with missing values if that suits the analysis better.
# df = df.dropna()

# Parse 'Order Date' into datetimes. With errors='coerce', values that cannot
# be parsed become NaT instead of raising.
df['Order Date'] = pd.to_datetime(df['Order Date'], errors='coerce')

# Split the order date into its calendar components.
df['year'] = df['Order Date'].dt.year
df['month'] = df['Order Date'].dt.month
df['day'] = df['Order Date'].dt.day

# Derived metric: revenue per row = quantity ordered x unit price.
# NOTE(review): assumes 'Quantity Ordered' and 'Price Each' are numeric
# columns - confirm against the CSV.
df['total_revenue'] = df['Quantity Ordered'] * df['Price Each']

# Check the cleaned data structure.
print("\nCleaned Data Head:")
print(df.head())

# Save the cleaned data to a new CSV file (optional).
# The file name previously contained a stray ')' before the extension.
df.to_csv('cleaned_sales_data.csv', index=False)

**Visualization **

In [None]:



# Sort the data by date so the frame is in chronological order.
df = df.sort_values(by='Order Date')

# Plot a small random sample instead of every row to keep the chart readable.
# - min(...) avoids the ValueError sample() raises when the frame has fewer
#   than 50 rows.
# - random_state pins the sample so a fresh kernel reproduces the same figure.
# - sample() returns rows in shuffled order, so the sample is re-sorted by
#   date; otherwise the connecting line jumps back and forth in time.
sample_df = (
    df.sample(n=min(50, len(df)), random_state=42)
    .sort_values(by='Order Date')
)

# Revenue of the sampled orders over time.
plt.figure(figsize=(10, 6))
plt.plot(
    sample_df['Order Date'],
    sample_df['total_revenue'],
    color='r',
    marker='o',
    linestyle='--',
    label='Total Revenue',  # gives plt.legend() an entry to display
)

# Axis labels and title.
plt.xlabel('Date')
plt.ylabel('Total Revenue')
plt.title('Visualising Sales Trend: Total Revenue Over Time')

# Rotate the date ticks and add a grid for readability.
plt.xticks(rotation=45)
plt.grid(True)

# Show the legend for the labelled line.
plt.legend()

# Display the plot.
plt.tight_layout()
plt.show()

**comparing stores performance country wise**

In [None]:
# Total revenue per city, ordered from highest to lowest.
# Relies on the 'total_revenue' column computed in the cleaning step.
store_performance = (
    df.groupby('City')['total_revenue']
    .sum()
    .reset_index()
    .sort_values(by='total_revenue', ascending=False)
)

# Bar chart: one bar per city.
plt.figure(figsize=(10, 6))
plt.bar(
    store_performance['City'],
    store_performance['total_revenue'],
    color='c',
)

# Title and axis labels.
plt.title('Store Performance: Total Revenue by Store')
plt.xlabel('City')
plt.ylabel('Total Revenue')

# Tilt the city names and draw horizontal grid lines only.
plt.xticks(rotation=45)
plt.grid(axis='y')

# Render the figure.
plt.tight_layout()
plt.show()

**Product Sales Analysis **

In [None]:
# Sum 'total_revenue' per product (the column is computed in the cleaning step).
product_sales = (
    df.groupby('Product')['total_revenue']
    .sum()
    .reset_index()
)

# Pie chart: each slice is one product's share of total revenue,
# annotated with its percentage to one decimal place.
plt.figure(figsize=(8, 8))
plt.pie(
    product_sales['total_revenue'],
    labels=product_sales['Product'],
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Product Sales Analysis: Total Revenue Distribution by Product')

# Render the figure.
plt.tight_layout()
plt.show()

**EXPLORING SALES DISTRIBUTION **

In [None]:
# One histogram figure per column: (column name, axis label, bar colour).
histogram_specs = [
    ('total_revenue', 'Total Revenue', 'g'),
    ('Quantity Ordered', 'Quantity Ordered', 'b'),
]

for column, axis_label, bar_color in histogram_specs:
    # Each distribution gets its own 10x6 figure with 20 bins.
    plt.figure(figsize=(10, 6))
    plt.hist(df[column], bins=20, color=bar_color, edgecolor='black')

    # Axis labels and title for this histogram.
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.title('Distribution of ' + axis_label)

    plt.tight_layout()

# A single show() call displays both open figures.
plt.show()

SUBPLOTS FOR COMBINED ANALYSIS

In [None]:

# Combined 2x2 dashboard: trend line, city bar chart, product pie, revenue histogram.
fig, axs = plt.subplots(2, 2, figsize=(12, 10))  # 2 rows, 2 columns

# Subplot 1: Total Revenue Over Time (line plot) on a small random sample.
# - min(...) avoids the ValueError sample() raises for frames under 50 rows.
# - random_state makes the sample, and therefore the figure, reproducible.
# - sample() shuffles rows, so the sample is sorted by date before plotting;
#   otherwise the line jumps back and forth in time.
sample_df = (
    df.sample(n=min(50, len(df)), random_state=42)
    .sort_values(by='Order Date')
)
axs[0, 0].plot(sample_df['Order Date'], sample_df['total_revenue'], color='b', marker='o', ls='-')
axs[0, 0].set_title('Total Revenue Over Time')
axs[0, 0].set_xlabel('Date')
axs[0, 0].set_ylabel('Total Revenue')
axs[0, 0].tick_params(axis='x', rotation=45)

# Subplot 2: City Performance (bar chart), cities ordered by revenue, highest first.
store_performance = df.groupby('City')['total_revenue'].sum().reset_index()
store_performance = store_performance.sort_values(by='total_revenue', ascending=False)
axs[0, 1].bar(store_performance['City'], store_performance['total_revenue'], color='g')
axs[0, 1].set_title('Store Performance: Total Revenue by Store')
axs[0, 1].set_xlabel('City')
axs[0, 1].set_ylabel('Total Revenue')
axs[0, 1].tick_params(axis='x', rotation=45)

# Subplot 3: Product Sales Distribution (pie chart), share of revenue per product.
product_sales = df.groupby('Product')['total_revenue'].sum().reset_index()
axs[1, 0].pie(product_sales['total_revenue'], labels=product_sales['Product'], autopct='%1.1f%%', startangle=140)
axs[1, 0].set_title('Product Sales Distribution')

# Subplot 4: Total Revenue Distribution (histogram).
axs[1, 1].hist(df['total_revenue'], bins=20, color='r', edgecolor='black')
axs[1, 1].set_title('Distribution of Total Revenue')
axs[1, 1].set_xlabel('Total Revenue')
axs[1, 1].set_ylabel('Frequency')

# Adjust layout to prevent overlap between the panels.
plt.tight_layout()

# Show the combined figure with all subplots.
plt.show()

Conclusions