# Importing Libraries

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Reading the file

In [None]:
# Load the raw export; cells containing "*" are read as missing (NaN).
df = pd.read_csv("data/Minitab.csv", na_values="*")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# Removing redundant columns

In [None]:

# Remove the 'invoice_no' column in place, then preview the first rows.
# `columns=` already selects the column axis, so no `axis` argument is needed.
df.drop(columns=['invoice_no'], inplace=True)
print(df.head(5))

# Removing null values

In [None]:

# Report how many values are missing in each column before dropping anything.
print(df.isnull().sum())
# Drop every row that has at least one missing value.
df.dropna(inplace=True)
# info() prints its own report and returns None, so it is called directly
# rather than wrapped in print(), which would emit a stray "None" line.
df.info()

# Removing duplicates

In [None]:

# Count rows that repeat an earlier row, remove them in place, then recount
# to confirm that no duplicates remain.
duplicate_mask = df.duplicated()
print(duplicate_mask.sum())
df.drop_duplicates(inplace=True)
print(df.duplicated().sum())

# Fixing date format

In [None]:

# Convert the invoice dates to datetime values so they sort chronologically.
# NOTE(review): no explicit format is passed, so parsing relies on pandas'
# inference — confirm the day/month order used in the raw data.
df['invoice_date'] = pd.to_datetime(df['invoice_date'])
# Order rows from earliest to latest invoice and renumber the index from 0.
df = df.sort_values(by='invoice_date').reset_index(drop=True)
print(df.head())

# Data transformations

In [None]:

# Show how many rows carry each gender label before recoding.
print(df['gender'].value_counts())
# Shorten the labels to single-letter codes. replace() leaves any value that
# is not a key of the mapping untouched, whereas Series.map() would turn such
# values into NaN and would blank the whole column if this cell ran twice.
gender_codes = {'Male': 'M', 'Female': 'F'}
df['gender'] = df['gender'].replace(gender_codes)
print(df.head())

# Calculate the profit

In [None]:

# Profit per row: (selling price - cost price) per unit, times the quantity.
df['profit'] = (
    df['selling_price_per_unit']
    .sub(df['cost_price_per_unit'])
    .mul(df['quantity'])
)
print(df.head())

# Overall profit summed across every row.
Total_profit = df['profit'].sum()
print(Total_profit)

# Save the cleaned table, including the new profit column, without the index.
df.to_csv("data/Cleaned.csv", index=False)


# Bar graph of Profit vs category

In [None]:

# Sum the profit within each category.
pro_vs_cate = df.groupby('category')['profit'].sum()
print(pro_vs_cate)

# Draw one bar per category, then label the chart.
pro_vs_cate.plot.bar()
plt.xlabel('Categories')
plt.ylabel('Profit')
plt.title('Profit for each category')
plt.show()

# Histogram of Age distribution

In [None]:

# Histogram of the 'age' column, split into 50 equal-width bins.
plt.hist(df['age'], bins=50)
# Title spelling corrected ("costomers" -> "customers").
plt.title('Age Distribution of customers')
plt.xlabel('Age range')
plt.ylabel('Count')
plt.show()

# Pie chart of Payment methods

In [None]:

# Count rows per payment method; value_counts() orders by descending count.
payment_groups = df['payment_method'].value_counts()
print(payment_groups)
# Wedges are drawn in the order of payment_groups, each labelled with its own
# index entry and annotated with its percentage share.
plt.pie(payment_groups, labels=payment_groups.index, autopct='%1.1f%%')
# Build the legend from the wedge labels themselves. Passing
# df['payment_method'].unique() would list methods in order of first
# appearance, which need not match the wedge order and can attach the wrong
# name to a wedge.
plt.legend()
plt.show()

# Line chart of profit over time

In [None]:

# Total profit per calendar month of the invoice date.
profit_by_month = df.groupby(df['invoice_date'].dt.to_period('M'))['profit'].sum()
profit_by_month.plot(kind='line')
plt.xticks(rotation=45)
plt.title('Profit over months')
plt.xlabel('Months')
plt.ylabel('profit')
# Fit the layout only after the title, axis labels and rotated ticks exist, so
# their extents are included when the margins are computed.
plt.tight_layout()
plt.show()