In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re

In [None]:
# Global plot appearance for the notebook: load matplotlib's 'seaborn-v0_8'
# style sheet, then make seaborn's "husl" palette the default color palette.
# The palette call is kept second so it runs after the style sheet is applied.
plt.style.use(['seaborn-v0_8'])
sns.set_palette(palette="husl")

In [None]:
# Read 'cleaned_ebay_deals.csv' (path is relative to the working directory)
# into `df`, the frame the analysis cells below operate on.
df = pd.read_csv(filepath_or_buffer='cleaned_ebay_deals.csv')

In [None]:
# Prepare the time columns in one chain:
#   1. parse 'timestamp' into pandas datetimes,
#   2. order the rows chronologically,
#   3. derive 'hour' (hour-of-day component of each timestamp).
df = (
    df.assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
      .sort_values('timestamp')
      .assign(hour=lambda frame: frame['timestamp'].dt.hour)
)

In [None]:
# Bar chart of how many deals fall into each hour of the day.
#
# groupby('hour').size() only contains hours that actually occur in the data.
# Because a bar chart places its bars by position, a missing hour would simply
# disappear and its neighbours would look contiguous. Reindexing onto 0-23
# keeps every hour on the axis, with a zero-height bar where nothing was found.
plt.figure(figsize=(12, 6))
deals_per_hour = df.groupby('hour').size().reindex(range(24), fill_value=0)
deals_per_hour.plot(kind='bar')
plt.title('Number of Deals Scraped per Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Deals')
plt.tight_layout()
plt.show()

In [None]:
# Price overview: four panels on a single 2x2 figure.
#
# The figure is created, filled and shown in ONE cell on purpose. With the
# Jupyter inline backend, open figures are rendered and closed when the cell
# that created them finishes; drawing on `axes` from later cells would leave
# an empty grid under the creating cell and nothing for plt.show() to display.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top-left: histogram of 'price' (missing values dropped).
axes[0, 0].hist(df['price'].dropna(), bins=50, alpha=0.7)
axes[0, 0].set_title('Distribution of Product Prices')
axes[0, 0].set_xlabel('Price ($)')
axes[0, 0].set_ylabel('Frequency')

# Top-right: boxplot of the same 'price' values.
axes[0, 1].boxplot(df['price'].dropna())
axes[0, 1].set_title('Boxplot of Product Prices')
axes[0, 1].set_ylabel('Price ($)')

# Bottom-left: 'original_price' against 'price'. The dashed red line is
# y = x, so points below it have a price lower than their original price.
max_original_price = df['original_price'].max()
axes[1, 0].scatter(df['original_price'], df['price'], alpha=0.6)
axes[1, 0].plot([0, max_original_price], [0, max_original_price], 'r--')
axes[1, 0].set_title('Original Price vs Discounted Price')
axes[1, 0].set_xlabel('Original Price ($)')
axes[1, 0].set_ylabel('Discounted Price ($)')

# Bottom-right: histogram of 'discount_percentage' (missing values dropped).
axes[1, 1].hist(df['discount_percentage'].dropna(), bins=30, alpha=0.7)
axes[1, 1].set_title('Distribution of Discount Percentage')
axes[1, 1].set_xlabel('Discount Percentage')
axes[1, 1].set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the ten most frequent values in the 'shipping' column.
shipping_counts = df['shipping'].value_counts().head(10)

plt.figure(figsize=(12, 6))
# Series.plot draws on the current axes and returns them, so the labels can
# be set on the returned object instead of through the pyplot state machine.
shipping_ax = shipping_counts.plot.bar()
shipping_ax.set_title('Top 10 Shipping Options')
shipping_ax.set_xlabel('Shipping Type')
shipping_ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Terms to look for in the 'title' column.
keywords = [
    'apple', 'samsung', 'laptop', 'iphone', 'tablet',
    'gimbal', 'watch', 'headphone', 'camera', 'phone',
]

# Number of titles containing each term. Matching ignores case and treats a
# missing title as "no match". A title can count towards several terms
# (e.g. one containing 'iphone' also contains 'phone').
keyword_counts = {
    term: df['title'].str.contains(term, case=False, na=False).sum()
    for term in keywords
}

# One bar per term, in the order the terms are listed above.
keyword_ax = plt.figure(figsize=(12, 6)).add_subplot()
keyword_ax.bar(keyword_counts.keys(), keyword_counts.values())
keyword_ax.set_title('Keyword Frequency in Product Titles')
keyword_ax.set_xlabel('Keywords')
keyword_ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Absolute discount per row: 'original_price' minus 'price', stored on df as
# the new column 'price_difference'.
df['price_difference'] = df['original_price'].sub(df['price'])

# Histogram of the differences; rows where the difference is missing are
# left out of the plot.
price_gaps = df['price_difference'].dropna()
difference_ax = plt.figure(figsize=(12, 6)).add_subplot()
difference_ax.hist(price_gaps, bins=50, alpha=0.7)
difference_ax.set_title('Distribution of Price Differences (Absolute Discount)')
difference_ax.set_xlabel('Price Difference ($)')
difference_ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# The five rows with the highest 'discount_percentage', restricted to the
# columns worth reading in a text report, printed without the index.
report_columns = ['title', 'price', 'original_price', 'discount_percentage']
top_discounts = df[report_columns].nlargest(5, 'discount_percentage')
print(
    "Top 5 Deals by Discount Percentage:",
    top_discounts.to_string(index=False),
    sep="\n",
)