In [None]:
import pandas as pd

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Read the two input tables in one pass: purchases first, survey second.
purchases, survey = (
    pd.read_csv(csv_path)
    for csv_path in ('data/amazon-purchases.csv', 'data/survey.csv')
)

In [None]:
# Attach the survey columns to every purchase row. The left join keeps
# purchases that have no matching survey row (their survey columns become NaN).
# `validate='many_to_one'` makes pandas raise a MergeError if
# 'Survey ResponseID' is not unique in `survey`; without it, a duplicated
# response would silently duplicate purchase rows and inflate every count
# computed from `dat`.
dat = pd.merge(
    purchases,
    survey,
    on='Survey ResponseID',
    how='left',
    validate='many_to_one',
)

In [None]:
# Inspect the column labels of the merged purchases + survey frame.
dat.columns

In [None]:
# Count the missing values in each column of the merged frame.
dat.isna().sum()

In [None]:
# Summary statistics for `purchases`. describe() is called with no arguments,
# so pandas chooses the columns itself (only the numeric ones when the frame
# mixes dtypes).
print("\nDescriptive statistics for numerical columns:")
purchases.describe()

In [None]:
# Cardinality check: number of distinct values in each column of `purchases`
# (nunique() leaves missing values out of the count by default).
print("Number of unique values in each column:")
purchases.nunique()

In [None]:
# Number of purchase rows per 'Category' value, most frequent first.
# value_counts() drops NaN by default, so rows without a category are not shown.
print("Frequency counts for 'Category' column:")
purchases['Category'].value_counts()

In [None]:
# Ten most frequent titles: value_counts() already orders by count (descending),
# so the first ten positions are the top ten.
print("Frequency counts for 'Title' column (Top 10 Titles):")
purchases['Title'].value_counts().iloc[:10]

In [None]:
# Parse the order dates in place so the `.dt` accessor can be used below
# (and by the later cells that read this column).
purchases['Order Date'] = pd.to_datetime(purchases['Order Date'])

print("Number of purchases per month:")
# Bucket every order into its calendar month and count the rows per bucket.
(
    purchases
    .groupby(purchases['Order Date'].dt.to_period('M'))
    .size()
)

In [None]:
# Line total for each purchase row: unit price multiplied by quantity.
purchases['Total Purchase Amount'] = purchases['Purchase Price Per Unit'].mul(purchases['Quantity'])

print("Total revenue by category:")
# Sum the line totals per category and list the largest categories first.
(
    purchases
    .groupby('Category')['Total Purchase Amount']
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Number of purchase rows per shipping state, most frequent first.
# value_counts() drops NaN by default, so rows without a state are not shown.
print("Number of purchases by state:")
purchases['Shipping Address State'].value_counts()

In [None]:
print("Average purchase amount per category:")
# Mean line total per category, highest average first. Relies on the
# 'Total Purchase Amount' column added to `purchases` by an earlier cell.
(
    purchases
    .groupby('Category')['Total Purchase Amount']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Derive the two helper columns the analysis below reads. The same assignments
# appear in earlier cells; repeating them gives the same values, so this cell
# also works when run directly after loading the data.
purchases['Order Date'] = pd.to_datetime(purchases['Order Date'])
purchases['Total Purchase Amount'] = purchases['Purchase Price Per Unit'].mul(purchases['Quantity'])

print("\nCorrelation between numerical variables:")
# Pairwise correlation (DataFrame.corr() defaults) of unit price, quantity and
# their product. The total is computed from the other two columns, so its
# correlations with them are expected to be non-trivial by construction.
correlation_matrix = purchases[
    ['Purchase Price Per Unit', 'Quantity', 'Total Purchase Amount']
].corr()
print(correlation_matrix)

# Same matrix as an annotated heatmap; the title is set on the Axes that
# seaborn returns.
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
).set_title('Correlation Heatmap of Numerical Variables')
plt.show()

# One point per purchase row: unit price on x, quantity on y.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=purchases,
    x='Purchase Price Per Unit',
    y='Quantity',
).set(
    title='Relationship between Purchase Price and Quantity',
    xlabel='Purchase Price Per Unit',
    ylabel='Quantity Purchased',
)
plt.show()

# Summed line totals for the ten highest-grossing categories, largest first.
category_sales = (
    purchases
    .groupby('Category')['Total Purchase Amount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(12, 8))
sns.barplot(x=category_sales.index, y=category_sales.values)
plt.title('Total Sales by Top 10 Categories')
# Rotated labels are centre-anchored by default, which leaves long category
# names straddling neighbouring bars. Anchoring them on the right makes each
# label end directly under the bar it describes.
plt.xticks(rotation=45, ha='right')
plt.ylabel('Total Purchase Amount')
plt.show()

# Summed line totals for the ten shipping states with the highest spend.
state_sales = (
    purchases
    .groupby('Shipping Address State')['Total Purchase Amount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(15, 8))
sns.barplot(x=state_sales.index, y=state_sales.values).set(
    title='Total Sales by Top 10 States',
    ylabel='Total Purchase Amount',
)
plt.show()

# Mean unit price per category, most expensive categories first.
category_pivot = (
    purchases
    .pivot_table(index='Category', values='Purchase Price Per Unit', aggfunc='mean')
    .sort_values(by='Purchase Price Per Unit', ascending=False)
)
print("\nAverage Price Per Unit by Category (Top 10):")
print(category_pivot.head(10))

# Mean quantity per purchase row per category, largest averages first.
quantity_pivot = (
    purchases
    .pivot_table(index='Category', values='Quantity', aggfunc='mean')
    .sort_values(by='Quantity', ascending=False)
)
print("\nAverage Quantity Purchased by Category (Top 10):")
print(quantity_pivot.head(10))

# Sum of line totals per calendar month of the order date.
monthly_sales = (
    purchases
    .groupby(purchases['Order Date'].dt.to_period('M'))['Total Purchase Amount']
    .sum()
)

# pandas draws onto the current figure and returns its Axes, which is then
# labelled in a single call (replacing the x label pandas derives from the index).
plt.figure(figsize=(12, 6))
monthly_sales.plot(kind='line').set(
    title='Monthly Sales Over Time',
    xlabel='Month',
    ylabel='Total Purchase Amount',
)
plt.show()