## EDA: TRAIN DATA

In [7]:
# Import the data-manipulation libraries (pandas for tables, NumPy for numerics)
import pandas as pd
import numpy as np

In [8]:
# Absolute, machine-specific location of the raw training CSV.
# NOTE(review): this path only resolves on the original author's machine; edit the constant to run elsewhere.
TRAIN_CSV_PATH = r"C:\Users\User\Desktop\Messy-Food-Waste-Prediction\data\raw_data\train.csv"
data2 = pd.read_csv(TRAIN_CSV_PATH)

In [9]:
# Preview the first rows of the training data exactly as loaded from the CSV.
data2.head()

Unnamed: 0,ID,date,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg
0,0,2022-12-19,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy,28.946465
1,1,2023-11-21,244,15,10.317872,64.430475,1,0,42.311779,,MeAt,51.549053
2,4,2022-02-01,148,16,27.7143,69.046113,1,0,41.184305,Beginner,MeAt,53.008323
3,5,2023-03-19,157,19,19.173902,46.292823,6,0,41.543492,Beginner,MeAt,48.621527
4,6,2022-07-18,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,MEAT,44.156984


In [10]:
# Remove the `ID` and `date` columns; no visible cell below reads them.
# Rebinding with errors="ignore" (instead of inplace=True) lets this cell be re-run
# without a KeyError once the columns are already gone.
data2 = data2.drop(columns=["ID", "date"], errors="ignore")

In [15]:
# Preview the frame after dropping `ID` and `date`.
# NOTE(review): this cell's execution count (15) is higher than the two cells that follow it
# (12 and 14), and the recorded output already shows their normalised values
# ("meat", "Intermediate"). A fresh top-to-bottom run would show the raw spellings here.
data2.head()

Unnamed: 0,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg
0,196,13,27.887273,45.362854,0,0,7.740587,Intermediate,dairy,28.946465
1,244,15,10.317872,64.430475,1,0,42.311779,,meat,51.549053
2,148,16,27.7143,69.046113,1,0,41.184305,Beginner,meat,53.008323
3,157,19,19.173902,46.292823,6,0,41.543492,Beginner,meat,48.621527
4,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,meat,44.156984


In [12]:
def lower_case_columns(data2, cols):
    """Return a copy of ``data2`` with the given columns lower-cased.

    Values are converted to ``str`` before lower-casing so that non-string
    entries do not raise. Missing values are kept as missing instead of being
    turned into the literal string ``"nan"``.

    Parameters
    ----------
    data2 : pandas.DataFrame
        Frame holding the columns to normalise. It is not modified.
    cols : iterable of str
        Names of the columns to lower-case.

    Returns
    -------
    pandas.DataFrame
        A new frame; use the return value rather than relying on side effects.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``data2``.
    """
    result = data2.copy()
    for col in cols:
        original = result[col]
        lowered = original.astype(str).str.lower()
        # astype(str) renders NaN/None as "nan"/"none"; mask those positions
        # back to missing so later isna()/dropna() calls still see them.
        result[col] = lowered.where(original.notna())
    return result

# Collapse the mixed spellings of the waste categories (e.g. "MeAt", "MEAT") into lower case.
data2 = lower_case_columns(data2, cols=["waste_category"])


In [14]:
# Fold the lower-case spelling into the title-case label so both count as one level.
data2["staff_experience"] = data2["staff_experience"].replace({"intermediate": "Intermediate"})

In [16]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   meals_served      911 non-null    int64  
 1   kitchen_staff     911 non-null    int64  
 2   temperature_C     911 non-null    float64
 3   humidity_percent  911 non-null    float64
 4   day_of_week       911 non-null    int64  
 5   special_event     911 non-null    int64  
 6   past_waste_kg     911 non-null    float64
 7   staff_experience  747 non-null    object 
 8   waste_category    911 non-null    object 
 9   food_waste_kg     911 non-null    float64
dtypes: float64(4), int64(4), object(2)
memory usage: 71.3+ KB


In [21]:
# Store the label-like columns as pandas categoricals.
cols = ["waste_category", "staff_experience", "special_event"]
data2 = data2.astype({name: "category" for name in cols})

In [22]:
# Re-display the head after the categorical conversion; the dtype change does not alter the printed values.
data2.head()

Unnamed: 0,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg
0,196,13,27.887273,45.362854,0,0,7.740587,Intermediate,dairy,28.946465
1,244,15,10.317872,64.430475,1,0,42.311779,,meat,51.549053
2,148,16,27.7143,69.046113,1,0,41.184305,Beginner,meat,53.008323
3,157,19,19.173902,46.292823,6,0,41.543492,Beginner,meat,48.621527
4,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,meat,44.156984


In [26]:
def _outlier_bounds(series, method, threshold):
    """Return the ``(lower, upper)`` fence for ``series``, or ``None`` if ``method`` is unknown."""
    if method == "IQR":
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = q3 - q1
        return q1 - threshold * spread, q3 + threshold * spread
    if method == "Z-score":
        center = series.mean()
        spread = series.std()
        return center - threshold * spread, center + threshold * spread
    return None


def handle_outlier(data2, columns, method="IQR", threshold=1.5, strategy="replace"):
    """Clip or drop outliers in the given numeric columns and return the result.

    The input frame is never modified: both strategies work on a copy, so the
    caller must use the returned frame.

    Parameters
    ----------
    data2 : pandas.DataFrame
        Frame containing the columns to process.
    columns : iterable of str
        Numeric columns to process, handled one after another. With
        ``strategy="remove"`` the bounds of later columns are computed on the
        rows that survived the earlier ones.
    method : {"IQR", "Z-score"}, default "IQR"
        ``"IQR"`` uses ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR``.
        ``"Z-score"`` uses ``mean -/+ threshold * std``.
        Any other value prints ``"Invalid method"`` and skips the column.
    threshold : float, default 1.5
        Multiplier applied to the IQR or the standard deviation.
    strategy : {"replace", "remove"}, default "replace"
        ``"replace"`` clips values to the bounds. ``"remove"`` drops rows whose
        value lies outside the bounds. Any other value prints a message and
        leaves the column untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the outliers handled.
    """
    cleaned = data2.copy()
    for col in columns:
        bounds = _outlier_bounds(cleaned[col], method, threshold)
        if bounds is None:
            print("Invalid method")
            continue
        lower_bound, upper_bound = bounds

        if strategy == "replace":
            # Clip values outside the fence to the nearest bound.
            cleaned[col] = cleaned[col].clip(lower_bound, upper_bound)
        elif strategy == "remove":
            values = cleaned[col]
            # A missing value is not an outlier: comparisons against NaN are
            # False, so keep NaN rows explicitly instead of dropping them.
            keep = values.between(lower_bound, upper_bound) | values.isna()
            cleaned = cleaned[keep]
        else:
            print("Invalid strategy. Choose 'replace' or 'remove'.")

    return cleaned

In [27]:
# Clip temperature readings to an IQR fence; the 0.75 multiplier is tighter than the function's 1.5 default.
columns_to_process = ["temperature_C"]
data2 = handle_outlier(
    data2,
    columns_to_process,
    method="IQR",
    threshold=0.75,
    strategy="replace",
)

In [29]:
# Count the missing values in each column.
data2.isna().sum()

meals_served          0
kitchen_staff         0
temperature_C         0
humidity_percent      0
day_of_week           0
special_event         0
past_waste_kg         0
staff_experience    164
waste_category        0
food_waste_kg         0
dtype: int64

In [32]:
# Discard rows that are exact duplicates across every remaining column, keeping the first occurrence.
data2 = data2.drop_duplicates()

In [33]:
# Keep only the rows that have a recorded staff experience level.
data2 = data2.dropna(axis="index", how="any", subset=["staff_experience"])

In [34]:
# Bind the cleaned frame to `x`. This is an alias, not a copy: `x` and `data2` are the same object.
x = data2

In [35]:
# Preview the cleaned rows; the original index labels are retained, so gaps mark rows that were dropped.
x.head()

Unnamed: 0,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg
0,196,13,27.887273,45.362854,0,0,7.740587,Intermediate,dairy,28.946465
2,148,16,27.7143,69.046113,1,0,41.184305,Beginner,meat,53.008323
3,157,19,19.173902,46.292823,6,0,41.543492,Beginner,meat,48.621527
4,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,meat,44.156984
5,241,18,16.863506,79.285919,3,0,11.834878,Intermediate,dairy,27.39367


In [36]:
# Write the cleaned frame to the current working directory, omitting the index column.
x.to_csv(path_or_buf="cleaned_food_waste_data(EDA2).csv", index=False)