In [31]:
import pandas as pd

In [32]:
# Homework Assignment 1: Analyzing Sales Data
# Load data
sales_df = pd.read_csv('sales_data.csv')

# Convert 'Date' to datetime
sales_df['Date'] = pd.to_datetime(sales_df['Date'])

# Optional: Print first few rows
print(sales_df.head())

        Date       Product     Category  Quantity  Price
0 2023-01-01        Laptop  Electronics        10    800
1 2023-01-01       T-Shirt     Clothing         5     20
2 2023-01-02    Smartphone  Electronics         8    400
3 2023-01-02  Coffee Maker         Home        12     50
4 2023-01-03         Jeans     Clothing        15     30


In [33]:
# 1. Group by Category and compute aggregates
category_stats = sales_df.groupby('Category').agg(
    Total_Quantity_Sold=('Quantity', 'sum'),
    Average_Price_Per_Unit=('Price', 'mean'),
    Max_Quantity_Single_Transaction=('Quantity', 'max')
).reset_index()

print("Category Statistics:\n", category_stats)


Category Statistics:
       Category  Total_Quantity_Sold  Average_Price_Per_Unit  \
0     Clothing                  157               31.176471   
1  Electronics                  183              276.764706   
2         Home                  144               55.000000   

   Max_Quantity_Single_Transaction  
0                               15  
1                               15  
2                               14  


In [34]:
# 2. Top-Selling Product in Each Category
top_products = (
    sales_df.groupby(['Category', 'Product'])['Quantity']
    .sum()
    .reset_index()
    .sort_values(['Category', 'Quantity'], ascending=[True, False])
    .drop_duplicates('Category')
    .rename(columns={'Quantity': 'Total_Quantity_Sold'})
)

print("\nTop-Selling Products by Category:\n", top_products)



Top-Selling Products by Category:
        Category          Product  Total_Quantity_Sold
8      Clothing            Jeans                   15
27  Electronics         Smart TV                   15
46         Home  Pressure Cooker                   14


In [40]:
# 3. Date with Highest Total Sales
sales_df['Total_Sale'] = sales_df['Quantity'] * sales_df['Price']

highest_sales_date = (
    sales_df.groupby('Date')['Total_Sale']
    .sum()
    .reset_index()
    .sort_values('Total_Sale', ascending=False)
    .head(5)
)

print("\nDate with Highest Sales:\n", highest_sales_date)



Date with Highest Sales:
         Date  Total_Sale
6 2023-01-07       15150
0 2023-01-01        8100
9 2023-01-10        5880
4 2023-01-05        3800
1 2023-01-02        3800


In [14]:
# Homework Assignment 2: Examining Customer Orders
import pandas as pd

In [15]:
# Load the dataset
orders_df = pd.read_csv("customer_orders.csv")

In [17]:
# 1. Filter customers with at least 20 orders
customer_order_counts = orders_df.groupby("CustomerID").size()
customers_20_plus_orders = customer_order_counts[customer_order_counts >= 20].index
filtered_customers_20_orders = orders_df[orders_df["CustomerID"].isin(customers_20_plus_orders)]
filtered_customers_20_orders

Unnamed: 0,OrderID,CustomerID,Product,Quantity,Price
0,1,101,Laptop,2,800
1,2,102,Headphones,1,150
2,3,103,Smartphone,3,400
3,4,101,External Hard Drive,2,80
4,5,102,Backpack,1,40
...,...,...,...,...,...
94,95,102,Fitness Tracker,1,60
95,96,103,Blender,1,60
96,97,104,Sport Shoes,2,40
98,99,101,Wireless Earbuds,2,120


In [18]:
# Task 2: Customers who ordered products with average price per unit > $120
avg_price_per_customer = orders_df.groupby("CustomerID")["Price"].mean()
high_avg_price_customers = avg_price_per_customer[avg_price_per_customer > 120].index
filtered_high_avg_price_customers = orders_df[orders_df["CustomerID"].isin(high_avg_price_customers)]
filtered_high_avg_price_customers

Unnamed: 0,OrderID,CustomerID,Product,Quantity,Price
1,2,102,Headphones,1,150
4,5,102,Backpack,1,40
5,6,104,Tablet,2,300
8,9,102,Smart TV,1,1000
9,10,104,Coffee Maker,1,50
12,13,104,Kitchen Scale,1,20
14,15,102,Blender,2,60
16,17,104,Cookware Set,1,60
19,20,102,Summer Dress,2,30
21,22,104,Comforter Set,1,45


In [19]:
# Task 3: Total quantity and total price per product, filtering total quantity >= 5
product_totals = orders_df.groupby("Product").agg(
    Total_Quantity=("Quantity", "sum"),
    Total_Price=("Price", "sum")
)
filtered_products = product_totals[product_totals["Total_Quantity"] >= 5]
filtered_products

Unnamed: 0_level_0,Total_Quantity,Total_Price
Product,Unnamed: 1_level_1,Unnamed: 2_level_1
Cargo Pants,6,120
Dress Shirt,5,50
Formal Shirt,6,105
Smartphone,5,800
Sport Shoes,5,120
Sunglasses,5,60
Wireless Earbuds,6,360


In [20]:
# Homework Assignment 3: Population Salary Analysis
import pandas as pd
import sqlite3
import re

In [2]:
# 1. Load Population Data from SQLite
conn = sqlite3.connect('population.db')

# Load entire population table
population_df = pd.read_sql_query("SELECT * FROM population", conn)

# Close connection
conn.close()

# Preview
print(population_df.head())


   id first_name  last_name                          email  gender   salary  \
0   1      Armin    Coltart           acoltart0@abc.net.au    Male   368693   
1   2        Mia  Tuddenham        mtuddenham1@addthis.com  Female   154398   
2   3   Kirsteni   Brafield         kbrafield2@arizona.edu  Female  1230304   
3   4     Phylis    Furlong  pfurlong3@merriam-webster.com  Female  1567795   
4   5     Wandis  Loveredge       wloveredge4@hatena.ne.jp  Female  1136950   

                  state  
0  District of Columbia  
1               Florida  
2               Georgia  
3            California  
4               Alabama  


In [None]:
# 2. Load Salary Bands from Excel
# Load Excel file with salary bands
salary_bands = pd.read_excel('population_salary_analysis.xlsx')

# Parse the salary bands into min and max numeric values
def parse_band(band):
    band = band.replace(",", "").replace("$", "")
    if "till" in band.lower():
        upper = int(re.search(r"\d+", band).group())
        return (0, upper)
    elif "-" in band:
        parts = re.findall(r"\d+", band)
        return (int(parts[0]), int(parts[1]))
    else:
        return (None, None)

salary_bands[['MinSalary', 'MaxSalary']] = salary_bands['Salary Band'].apply(lambda x: pd.Series(parse_band(x)))

# Match each salary to a band
def assign_band(salary):
    for _, row in salary_bands.iterrows():
        if row['MinSalary'] <= salary <= row['MaxSalary']:
            return row['Salary Band']
    return 'Unknown'

population_df['salary_band'] = population_df['salary'].apply(assign_band)

# Group and calculate required statistics
global_stats = population_df.groupby('salary_band').agg(
    PopulationCount=('salary', 'count'),
    AverageSalary=('salary', 'mean'),
    MedianSalary=('salary', 'median')
)

# Percentage of population
total_population = global_stats['PopulationCount'].sum()
global_stats['PopulationPercent'] = (global_stats['PopulationCount'] / total_population) * 100

print("=== Global Statistics by Salary Band ===")
print(global_stats.round(2))


=== Global Statistics by Salary Band ===
                         PopulationCount  AverageSalary  MedianSalary  \
salary_band                                                             
$1,000,001 - $1,200,000             1227     1098524.07     1097765.0   
$1,200,001 - $1,400,000             1131     1300684.67     1300430.0   
$1,400,001 - $1,600,000             1132     1499605.60     1500623.0   
$1,600,001 - $1,800,000             1120     1698519.17     1697481.5   
$200,001 - $400,000                 1170      299558.07      299882.0   
$400,001 - $600,000                 1234      499163.98      497925.5   
$600,001 - $800,000                 1156      699680.87      701317.0   
$800,001 - $1,000,000               1175      901152.28      899845.0   
Unknown                             1155     1902891.84     1906451.0   
till $200,000                       1151       99283.99       98800.0   

                         PopulationPercent  
salary_band                          

In [41]:
# 3. Statistics by State and Salary Band
# Group by State and Salary Band
state_stats = population_df.groupby(['state', 'salary_band']).agg(
    PopulationCount=('salary', 'count'),
    AverageSalary=('salary', 'mean'),
    MedianSalary=('salary', 'median')
)

# Compute population percentages within each state
state_totals = population_df.groupby('state')['salary'].count()
state_stats['PopulationPercent'] = state_stats.index.get_level_values(0).map(state_totals)
state_stats['PopulationPercent'] = (state_stats['PopulationCount'] / state_stats['PopulationPercent']) * 100

print("\n=== Statistics by State and Salary Band ===")
print(state_stats.round(2))


=== Statistics by State and Salary Band ===
                                   PopulationCount  AverageSalary  \
state     salary_band                                               
Alabama   $1,000,001 - $1,200,000               32     1111522.31   
          $1,200,001 - $1,400,000               24     1295760.67   
          $1,400,001 - $1,600,000               30     1513844.87   
          $1,600,001 - $1,800,000               17     1697464.12   
          $200,001 - $400,000                   25      289234.72   
...                                            ...            ...   
Wisconsin till $200,000                         23       92388.78   
Wyoming   $1,000,001 - $1,200,000                1     1187976.00   
          $1,200,001 - $1,400,000                2     1327830.50   
          $800,001 - $1,000,000                  1      970366.00   
          till $200,000                          1       27728.00   

                                   MedianSalary  Populati