In [None]:
import pandas as pd


Part 1: Data Loading and Exploration

In [2]:
#1. Loading Data: Import all CSV files into Pandas DataFrames and display the first few rows of each.
customer_df = pd.read_csv('data/wk8-customers.csv')
products_df = pd.read_csv('data/wk8-products.csv')
stores_df = pd.read_csv('data/wk8-stores.csv')
sales_df = pd.read_csv('data/wk8-sales.csv')
inventory_df = pd.read_csv('data/wk8-inventory.csv')

In [None]:
#2. Exploratory Analysis: Use descriptive statistics to get a feel for the data
 # - 2a. Obtain summary statistics of numerical columns in the products, sales, and 
 #       inventory dataframes
products_df.info()
sales_df.info()
inventory_df.info()


In [None]:
# - 2b Obtain summary statistics of numerical columns in the products, sales, 
#      and inventory dataframes
products_df.describe()
sales_df.describe()
inventory_df.describe()

In [None]:
# - 2c Check for missing values in all dataframes
customer_df.isnull().count()
products_df.isnull().count()
stores_df.isnull().count()
sales_df.isnull().count()
inventory_df.isnull().count()


In [None]:
# - 2d Display the data types of each column in all dataframes
customer_df.dtypes
products_df.dtypes
stores_df.dtypes
sales_df.dtypes
inventory_df.dtypes

In [None]:
products_df.info()

In [None]:
#3. Basic Information Retrieval:
 # - 3a. How many unique products are in the product catalog?

num_of_unique_products = len(products_df['product_name'].unique())

print(f"There are {num_of_unique_products} uniquely named products in the product catalog")


In [None]:
#3b. What are the top 5 most expensive products?
most_expensive_products = products_df.sort_values('price', ascending=False)[['product_name', 'price']].reset_index()
print(f"Here are the 5 most expensive products in the product catalog:\n{most_expensive_products.head(5)}")
print(products_df.head(5))

In [None]:
#3c. Which store has the largest floor space?
largest_Floor_space = stores_df.loc[stores_df['size_sqft'].idxmax()]
print(f"{largest_Floor_space['store_name']} has the largest floor space with {largest_Floor_space['size_sqft']} square feet")


In [None]:
#3c. What is the distribution of customers by state?

state_abbrv = {
    'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL',
    'Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT',
    'Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA',
    'Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY',
}

customer_df['state'] = customer_df['state'].replace(state_abbrv)
cust_by_state = customer_df.groupby('state').size()


# #this is a classic example of view vs copy. customer_df_new & customer_df point to the same object in memory so if you change one you change both. 
# #if you don't want to do that the use .copy(). Use a view when your reading or analyzing data (via alias)
# #customer_df_new = customer_df
# customer_df_new['state'] = customer_df_new['state'].replace(state_abbrv)
# customer_df_new['state'].values

Part 2: Data Cleaning

In [3]:
# 1. Handling Missing Values:
# 1a. Identify all missing values in each dataset
def missing_values(df):

    #find all rows that have at least 1 null value
    null_rows = df[df.isnull().any(axis=1)]# this is an example of boolean indexing. It basically just gets a subset of data from a datframe based on the true/false values in a series.

    #find what columns are missing in each row
    null_columns = null_rows.apply(lambda row: row[row.isnull()].index.tolist(), axis=1)

    return null_columns

def print_missing_values(df):
    for i, v in df.items(): 
        print(f"Row {i} is missing a value in column(s): {', '.join([item for item in v])}")
    print("\n") 

print("Customer table missing values:")
customer_values = missing_values(customer_df)
print_missing_values(customer_values)

print("Inventory table missing values:")
inventory_values = missing_values(customer_df)
print_missing_values(inventory_values)

print("Products table missing values:")
product_values = missing_values(customer_df)
print_missing_values(product_values)

print("Sales table missing values:")
sales_values = missing_values(customer_df)
print_missing_values(sales_values)

print("Stores table missing values:")
stores_values = missing_values(customer_df)
print_missing_values(stores_values)


Customer table missing values:
Row 1 is missing a value in column(s): email
Row 3 is missing a value in column(s): registration_date
Row 5 is missing a value in column(s): phone
Row 11 is missing a value in column(s): email, zip_code
Row 13 is missing a value in column(s): phone
Row 15 is missing a value in column(s): registration_date
Row 24 is missing a value in column(s): registration_date
Row 29 is missing a value in column(s): email


Inventory table missing values:
Row 1 is missing a value in column(s): email
Row 3 is missing a value in column(s): registration_date
Row 5 is missing a value in column(s): phone
Row 11 is missing a value in column(s): email, zip_code
Row 13 is missing a value in column(s): phone
Row 15 is missing a value in column(s): registration_date
Row 24 is missing a value in column(s): registration_date
Row 29 is missing a value in column(s): email


Products table missing values:
Row 1 is missing a value in column(s): email
Row 3 is missing a value in column(

In [15]:
# 1b. For numerical columns with missing values, replace them with the column mean
inventory_df[['inventory_id', 'product_id']].isnull().sum()

inventory_id    0
product_id      0
dtype: int64

In [None]:
# 1c. For categorical columns with missing values, replace them with the most frequent value


In [None]:
# 1d For date columns with missing values, use forward fill or backward fill as appropriate

In [None]:
# 2. Removing Duplicates:
# 2a. Check for and remove any duplicate entries in the customers and products dataframes

In [None]:
# 2b. Explain your approach for identifying duplicates