# Importing Libraries

In [5]:
# Import libraries
import pandas as pd
import numpy as np
import os

# Importing Databases

In [7]:
# Root folder of the project; every data file is addressed relative to it
path = r'/Users/douniaelyoussoufi/Achievement 4 '
# Read orders.csv and products.csv from "02 Data/Original Data" in one pass.
# index_col=False stops pandas from using the first CSV column as the row index.
df_ords, df_prods = (
    pd.read_csv(os.path.join(path, '02 Data', 'Original Data', file_name), index_col=False)
    for file_name in ('orders.csv', 'products.csv')
)

# Step 1: Exercise 4.5

# Missing Values

In [8]:
# Count the missing values in each column of the products DataFrame
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

The only column with missing values is the "product_name" column, and it’s missing 16 values.

In [15]:
# Keep only the product rows whose 'product_name' is missing
# (the boolean mask from isna() is used directly; no comparison with True is needed)
df_nan = df_prods.loc[df_prods['product_name'].isna()]
# Show the subset as the cell output
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [29]:
# Addressing these missing values
# Record the (rows, columns) size of df_prods before any rows are removed,
# so it can be compared with the size of the cleaned DataFrame afterwards
df_prods.shape

(49693, 5)

In [27]:
# Build a new DataFrame that keeps only the rows with a 'product_name';
# df_prods itself is left untouched
df_prods_clean = df_prods[df_prods['product_name'].notna()]
# Size of the cleaned DataFrame, for comparison with df_prods.shape
df_prods_clean.shape

(49677, 5)

The new DataFrame has exactly 16 fewer rows than the original DataFrame, corresponding to the number of missing values.

# Duplicates

In [42]:
# Collect fully duplicated rows of the cleaned products DataFrame.
# keep='first' (the default) marks every repeat after the first occurrence.
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]
# Show the duplicated rows as the cell output
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [35]:
# Addressing duplicates
# Keep the first occurrence of each row and drop every later repeat
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')
# Size after de-duplication, for comparison with df_prods_clean.shape
df_prods_clean_no_dups.shape

(49672, 5)

The five duplicates have been successfully deleted.

# Tidying Up and Exporting Changes

In [40]:
# Rename the DataFrame
# Plain assignment binds a second name to the same object; no copy of the data is made
df_products_checked = df_prods_clean_no_dups

In [44]:
# Write the checked products table to "02 Data/Prepared Data" as products_checked.csv
# index=True (the pandas default) also writes the row index as the first, unnamed column
df_products_checked.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=True,
)

# Step 2

In [50]:
# Run the describe() function on the df_ords dataframe
# The result is kept in summary_stats and displayed as the cell output
summary_stats = df_ords.describe()
summary_stats

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


Analysis of `df_ords` Summary Statistics:
1. `order_id` and `user_id`:
   - The `min` and `max` values for both columns appear valid as unique identifiers. No issues here.
2. `order_number`:
   - The range is 1–100 (see the `min` and `max` rows above), which seems reasonable for tracking the number of orders placed by a user. No concerns here.
3. `order_dow` (Day of the Week):
   - The range is 0–6, which matches a typical 7-day week. No concerns here.
4. `order_hour_of_day`:
   - The range is 0–23, which aligns with valid hours of the day. No concerns here.
5. `days_since_prior_order`:
   - The range is 0–30, which seems reasonable for tracking time between orders.

Summary of Issues to Investigate:
- `days_since_prior_order`:
  - Verify the interpretation of `0` (e.g., same-day orders or missing data).
  - Confirm if the `30`-day maximum is a true cap or a data entry limitation.

# Step 3

In [74]:
# Check for mixed-type data in df_ords
for col in df_ords.columns:
    # Identify rows where the data type is inconsistent using map() instead of applymap() to avoid deprecation warning in pandas
    weird = (df_ords[col].map(type) != type(df_ords[col].iloc[0])).any()
    # If mixed types are found, print the column name
    if weird:
        print(f"Mixed data types found in column: {col}")

Mixed data types found in column: order_id
Mixed data types found in column: user_id
Mixed data types found in column: order_number
Mixed data types found in column: order_dow
Mixed data types found in column: order_hour_of_day
Mixed data types found in column: days_since_prior_order


# Step 4

In [97]:
# Change the data types of each column to the appropriate type
df_ords['order_id'] = df_ords['order_id'].astype('str')  # order_id should be a string
df_ords['user_id'] = df_ords['user_id'].astype('str')  # user_id should be a string
df_ords['order_number'] = df_ords['order_number'].astype('str')
df_ords['order_dow'] = df_ords['order_dow'].astype('str')
df_ords['order_hour_of_day'] = df_ords['order_hour_of_day'].astype('str')
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].astype('str')
df_ords.dtypes

order_id                  object
user_id                   object
eval_set                  object
order_number              object
order_dow                 object
order_hour_of_day         object
days_since_prior_order    object
dtype: object

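The columns flagged by the mixed-type check were converted to strings, which pandas stores with the `object` dtype.
The check still reported mixed-type data when numeric dtypes (such as `int` or `float`) were used, so `object` was chosen to handle this inconsistency.
Caveat: casting a numeric column to `str` prevents numeric operations on it and can turn missing values (`NaN`) into the literal text `'nan'`, so it is generally only appropriate for identifier columns such as `order_id` and `user_id`.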

In [111]:
# Check for mixed-type data in df_ords
# Flag that records whether any column turned out to be mixed
mixed_found = False
for col in df_ords.columns:
    # Series.map(type) (used instead of applymap() to avoid the pandas deprecation
    # warning) gives the Python type of every value in the column. The types are
    # compared with each other rather than with type(df_ords[col].iloc[0]): for
    # NumPy-backed numeric columns iloc[0] is a NumPy scalar (e.g. numpy.int64)
    # while map() sees plain Python ints/floats, so that comparison would flag
    # every numeric column even when it is perfectly uniform.
    weird = df_ords[col].map(type).nunique() > 1
    # If mixed types are found, print the column name
    if weird:
        print(f"Mixed data types found in column: {col}")
        mixed_found = True  # Remember that at least one column is mixed
# If no mixed types are found, print a message
if not mixed_found:
    print("No mixed type data found")

No mixed type data found


# Step 5

In [55]:
# Count the missing values in each column of the orders DataFrame
df_ords.isna().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [63]:
# Keep only the order rows whose 'days_since_prior_order' is missing
# (the boolean mask from isna() is used directly; no comparison with True is needed)
df_nan2 = df_ords.loc[df_ords['days_since_prior_order'].isna()]
# Show the subset as the cell output
df_nan2

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
11,2168274,2,prior,1,2,11,
26,1374495,3,prior,1,1,14,
39,3343014,4,prior,1,6,11,
45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,
3420934,3189322,206206,prior,1,3,18,
3421002,2166133,206207,prior,1,6,19,
3421019,2227043,206208,prior,1,1,15,


There are 206,209 missing values (`NaN`) in the `days_since_prior_order` column, as identified in the `df_ords` DataFrame.
   - These missing values correspond to rows where `order_number` is `1`. This indicates that these orders are likely the **first orders placed by users**.

Explanation:
   - For the first order placed by a user (`order_number = 1`), there is no prior order to calculate the days since the last purchase. As a result, it is logical for the `days_since_prior_order` column to be `NaN` for these rows.

# Step 6

In [124]:
# Addressing the missing values in subset
# Replace NaN with 0 in 'days_since_prior_order' column.
# fillna() returns a new Series and leaves its input unchanged, so the result has to
# be stored; assign() builds a new DataFrame with the filled column, which also avoids
# writing into a slice of df_ords (SettingWithCopyWarning).
df_nan2 = df_nan2.assign(days_since_prior_order=df_nan2['days_since_prior_order'].fillna(0))
df_nan2

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,0.0
11,2168274,2,prior,1,2,11,0.0
26,1374495,3,prior,1,1,14,0.0
39,3343014,4,prior,1,6,11,0.0
45,2717275,5,prior,1,3,12,0.0
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,0.0
3420934,3189322,206206,prior,1,3,18,0.0
3421002,2166133,206207,prior,1,6,19,0.0
3421019,2227043,206208,prior,1,1,15,0.0


The missing values in the days_since_prior_order column represent users' first orders, where there is no prior order to compare. Replacing NaN with 0 signifies that these are first orders with no previous purchase, making the data more suitable for analysis and modeling. Alternatively, you could leave the NaN values as-is, as they provide meaningful information.

In [None]:
# Build df_ords_clean as a new DataFrame in which the missing
# 'days_since_prior_order' values are replaced with 0; df_ords itself is not modified
df_ords_clean = df_ords.assign(
    days_since_prior_order=df_ords['days_since_prior_order'].fillna(0)
)

# Step 7

In [135]:
# Collect fully duplicated rows of the cleaned orders DataFrame.
# keep='first' (the default) marks every repeat after the first occurrence.
df_dups1 = df_ords_clean.loc[df_ords_clean.duplicated(keep='first')]
# Show the duplicated rows as the cell output
df_dups1

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


Upon analysing the dataset, no duplicate rows were found in the `df_ords_clean` DataFrame. Each row is unique when all of the available columns are compared together.

### Explanation
Duplicates typically occur when identical rows are unintentionally repeated, which can be caused by errors during data entry, merging, or importing. Since no duplicates were identified in this dataset, it suggests that the data is already clean in this regard. However, it is always a good practice to periodically check for duplicates, especially when merging or appending datasets.

# Step 8
No action is required regarding duplicates in this case.

# Step 9

In [139]:
# Write the checked orders table to "02 Data/Prepared Data" as orders_checked.csv
# index=True (the pandas default) also writes the row index as the first, unnamed column
df_ords_clean.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index=True,
)