In [1]:
import pandas as pd

In [2]:
# Part 1: Data Loading and Exploration
# Read all CSV files into Pandas DataFrames (read_csv)
customer_df = pd.read_csv('data/wk8-customers.csv', encoding='utf-8')
products_df = pd.read_csv('data/wk8-products.csv', encoding='utf-8')


In [None]:
# Display the first few rows of each DataFrame (.head)
customer_df.head()
products_df.head()


In [None]:
# 2. Exploratory Analysis

# Obtain summary statistics for numerical columns (.describe)
customer_df.describe()
products_df.describe()


In [None]:
# Check for missing values in all dataframes

# Display the data types of each column (.info)
customer_df.info()
products_df.info()


In [3]:
# Part 2: Data Cleaning

# 1. Handling Missing Values

# Create copies of original DataFrames for cleaning
clean_customer_df = customer_df.copy()
clean_products_df = products_df.copy()


In [None]:
# For numerical columns with missing values, replace with column mean (fillna with mean)
# clean_products_df.info()
p_columns = ['price', 'weight', 'cost']
clean_products_df['price'] = clean_products_df['price'].abs()
clean_products_df = clean_products_df.fillna(clean_products_df[p_columns].mean())
clean_products_df



In [17]:
# For categorical columns with missing values, replace with most frequent value (fillna with mode)
#clean_customer_df.info()
#c_columns = ['product_name', 'category', 'subcategory', 'brand']
clean_customer_df = clean_customer_df.fillna(clean_customer_df[['phone', 'email', 'zip_code', 'registration_date']].mode())
clean_customer_df
#customer_df

Unnamed: 0,customer_id,first_name,last_name,email,phone,address,city,state,zip_code,registration_date
0,1,James,Smith,james.smith@gmail.com,555-123-4567,123 Main St,New York,NY,10001,2021-03-15
1,2,Mary,Johnson,blewis23@gmail.com,212.555.6789,456 Park Ave,New York,NY,10022,2020-11-02
2,3,John,Williams,jwilliams@yahoo.com,(555) 987-6543,789 Broadway,Los Angeles,California,90001-1234,2021-05-20
3,4,Patricia,Brown,pbrown@hotmail.com,5551234567,321 Elm St,Chicago,IL,60601,
4,5,Robert,Jones,rjones23@gmail.com,555-987-3456,555 Pine St,Houston,TX,77002,2020-09-12
5,6,Jennifer,Miller,jmiller@outlook.com,212.555.6789,898 Cedar Dr,Phoenix,AZ,85001,2021-02-28
6,7,Michael,Davis,mdavis@gmail.com,555.432.7654,742 Maple Ave,Philadelphia,PA,19103,2021-01-17
7,8,Linda,Garcia,lgarcia@yahoo.com,555 876 2345,953 Oak Ln,San Antonio,Texas,78205,2020-10-05
8,9,William,Rodriguez,wrodriguez@hotmail.com,5558889999,159 Washington Blvd,San Diego,CA,92101,2021-06-30
9,10,Elizabeth,Wilson,ewilson@gmail.com,(555) 333-2211,753 Lincoln Rd,Dallas,TX,75201,2020-08-22


In [20]:
# 2. Removing Duplicates

# Check for duplicates in customers (duplicated and sum)
clean_customer_df.duplicated().sum()


0


In [22]:
# Check for duplicates in products (duplicated and sum)
clean_products_df.duplicated().sum()

0

In [28]:
# 3. Basic Information Retrieval
clean_products_df.info()
# FILTER products dataframe to get a dataframe of products with price over $500 AND weigh less than 1 pound
clean_products_df[(clean_products_df['price'] > 500) & (clean_products_df['weight'] < 1)]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    30 non-null     int64  
 1   product_name  30 non-null     object 
 2   category      30 non-null     object 
 3   subcategory   30 non-null     object 
 4   brand         30 non-null     object 
 5   price         30 non-null     float64
 6   cost          30 non-null     float64
 7   weight        28 non-null     float64
dtypes: float64(3), int64(1), object(4)
memory usage: 2.0+ KB


Unnamed: 0,product_id,product_name,category,subcategory,brand,price,cost,weight
0,1,Apple iPhone 13,Electronics,Smartphones,Apple,899.99,649.99,0.45
1,2,Samsung Galaxy S21,electronics,Smartphones,Samsung,799.99,539.99,0.5


In [4]:
# FILTER products to get a dataframe of products that are either in the Beauty OR Grocery category
clean_products_df[(clean_products_df['category'] == 'Beauty') | (clean_products_df['category'] == 'Grocery')]

Unnamed: 0,product_id,product_name,category,subcategory,brand,price,cost,weight
13,14,L'Oreal Revitalift Cream,Beauty,Skincare,L'Oreal,19.99,8.0,0.3
16,17,Hershey's Chocolate Assortment,Grocery,Snacks,Hershey's,15.99,7.0,1.0


In [6]:
# find average cost of products by category (groupby)
clean_products_df.groupby('category')['price'].mean()

category
Apparel                 39.990000
Beauty                  19.990000
Books                   49.990000
Clothing                50.323333
ELECTRONICS            244.990000
Electronics            285.994000
Food                     4.990000
Grocery                 15.990000
HOME & KITCHEN         599.990000
Home & Kitchen         324.990000
Home and Kitchen       179.990000
Sports & Outdoors      224.990000
Sports and Outdoors    149.990000
Toys                   144.990000
beauty                   9.990000
clothing                59.990000
electronics            464.990000
home & kitchen          99.990000
sports & outdoors      129.990000
Name: price, dtype: float64

In [9]:
# How many unique products are in the product catalog? (nunique)
clean_products_df['product_name'].nunique()

30

In [11]:
# What are the top 5 most expensive products? (sort and head)
clean_products_df.sort_values('price', ascending=False)[['product_name', 'price']].head(5)

Unnamed: 0,product_name,price
3,Dell XPS 13,1299.99
0,Apple iPhone 13,899.99
1,Samsung Galaxy S21,799.99
19,Dyson V11 Vacuum,599.99
26,Sony PlayStation 5,499.99


In [19]:
# What is the number of customers from each state? (value count)

state_abbrv = {
    'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL',
    'Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT',
    'Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA',
    'Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY',
}

customer_df['state'] = customer_df['state'].replace(state_abbrv)
cust_by_state = customer_df.groupby('state').size()
print(cust_by_state)

state
AZ    1
CA    4
CO    1
FL    1
IL    1
IN    1
KY    1
MA    1
MD    2
MI    1
MO    1
NC    1
NM    1
NV    1
NY    2
OH    1
OR    1
PA    1
TN    1
TX    5
WA    1
WI    1
dtype: int64
