## EDA1: TRAIN DATA



In [None]:
# importing data manipulation library
import pandas as pd
import numpy as np


In [None]:
# load the raw training set into a DataFrame
# NOTE(review): this is an absolute path on one user's Windows machine, so the
# notebook will not run elsewhere as-is — consider a path relative to the repo.
data =pd.read_csv(r"C:\Users\User\Desktop\Messy-Food-Waste-Prediction\data\raw_data\train.csv")

In [None]:
data.head()

## Exploratory Data Analysis

In [None]:
# checking the information about the data
data.info()

In [None]:
# checking out the number of rows and columns in the data

data.shape

In [None]:
# listing out the columns in the data set
data.columns

In [None]:
# remove the ID and date columns in place; they are not needed for the analysis
data.drop(["ID", "date"], axis="columns", inplace=True)

In [None]:
data.columns

In [None]:
# report each column that has at least one missing value, with its count
missing_per_column = data.isnull().sum()
for column, n_missing in missing_per_column[missing_per_column > 0].items():
    print(f"columns: {column} | missing values: {n_missing}")


In [None]:
# count rows that exactly repeat an earlier row and report the outcome
duplicate_rows = data.duplicated().sum()
print(
    f"duplicate row found: {duplicate_rows}"
    if duplicate_rows > 0
    else "no duplicate rows found"
)

In [None]:
# show the dtype pandas inferred for the staff_experience column
print(data["staff_experience"].dtype)

In [None]:
# frequency of every staff_experience label; print only labels seen more than once
duplicate_values = data["staff_experience"].value_counts()
print(duplicate_values.loc[duplicate_values.gt(1)])


In [None]:
# unify the lower-case spelling with the capitalised label
data["staff_experience"] = data["staff_experience"].replace({"intermediate": "Intermediate"})

In [None]:
# unify the upper-case spelling with the capitalised label
data["staff_experience"] = data["staff_experience"].replace("EXPERT", "Expert")

In [None]:
data.columns

In [None]:
# let's deal with the object-typed columns:
# staff_experience and waste_category
data["staff_experience"].isna().value_counts()


In [None]:
# keep only rows that have a staff_experience value; the missing ones are a
# very small share of the data, so they are dropped rather than imputed
data = data[data["staff_experience"].notna()]


In [None]:
data["waste_category"].isna().sum()


In [None]:
data["waste_category"].value_counts()

In [None]:
# cast every waste_category value to a Python string (the following cells use
# the .str accessor on this column)
# NOTE(review): astype(str) also turns missing values into the literal string
# "nan", after which isna() no longer reports them — confirm the column has no
# NaN at this point, or handle them before casting.
data["waste_category"] = data["waste_category"].astype(str)

In [None]:
data["waste_category"].value_counts()

In [None]:
# lower-case every label so spellings that differ only in capitalisation
# collapse into a single value
data["waste_category"] = data["waste_category"].str.lower()

In [None]:
data["waste_category"].value_counts()

In [None]:
# NOTE(review): waste_category was already cast with astype(str) in an earlier
# cell and .str.lower() returns strings, so this second cast looks redundant —
# confirm before removing.
data["waste_category"] = data["waste_category"].astype(str)

In [None]:
data["staff_experience"].isna().sum()


In [None]:
data.info()

In [None]:
data["special_event"].value_counts()

In [None]:
# store the three label columns with the pandas "category" dtype
cols = ["special_event", "staff_experience", "waste_category"]
data = data.astype(dict.fromkeys(cols, "category"))


In [None]:
data.info()

In [None]:
def handle_outlier(data, columns, method="IQR", threshold=1.5, strategy="replace"):
    """Clip or drop outliers in the given columns and return the cleaned frame.

    For each column, in the order given, a lower and an upper bound are
    computed and values outside ``[lower, upper]`` are treated as outliers:

    * ``method="IQR"``: ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR``
    * ``method="Z-score"``: ``mean - threshold * std`` and
      ``mean + threshold * std``

    Columns are processed one after another, so with ``strategy="remove"``
    the bounds of later columns are computed on the rows that survived the
    earlier ones.

    Args:
        data: DataFrame to clean. It is never modified; the function works on
            a copy.
        columns: Iterable of column names to process.
        method: ``"IQR"`` or ``"Z-score"``.
        threshold: Multiplier applied to the IQR or the standard deviation.
        strategy: ``"replace"`` clips outliers to the nearest bound;
            ``"remove"`` drops the rows that hold them. Rows whose value is
            NaN fail both comparisons and are therefore dropped as well.

    Returns:
        A new DataFrame with the outliers handled. If ``method`` or
        ``strategy`` is not recognised, a message is printed once and an
        unmodified copy of ``data`` is returned.
    """
    # Validate once up front: both settings are the same for every column, so
    # checking inside the loop would only repeat the same message per column.
    if method not in ("IQR", "Z-score"):
        print("Invalid method")
        return data.copy()
    if strategy not in ("replace", "remove"):
        print("Invalid strategy. Choose 'replace' or 'remove'.")
        return data.copy()

    # Work on a copy so that "replace" does not overwrite the caller's frame
    # (the "remove" strategy already returned a new object).
    result = data.copy()

    for col in columns:
        values = result[col]

        if method == "IQR":
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            spread = q3 - q1
            lower_bound = q1 - threshold * spread
            upper_bound = q3 + threshold * spread
        else:  # "Z-score"
            mean = values.mean()
            std = values.std()
            lower_bound = mean - threshold * std
            upper_bound = mean + threshold * std

        if strategy == "replace":
            # Replace outliers with the boundary values (clipping).
            result[col] = values.clip(lower=lower_bound, upper=upper_bound)
        else:  # "remove"
            # Keep only the rows whose value lies inside the bounds.
            result = result[(values >= lower_bound) & (values <= upper_bound)]

    return result


In [None]:
# clip the numeric columns to an IQR fence of 0.75 x IQR, which is tighter
# than the function's default multiplier of 1.5
columns_to_process = ["temperature_C", "meals_served", "food_waste_kg"]
data = handle_outlier(
    data,
    columns=columns_to_process,
    method="IQR",
    threshold=0.75,
    strategy="replace",
)


In [None]:
# remove exact duplicate rows in place, keeping the first occurrence of each
data.drop_duplicates(inplace=True)

In [None]:
data

In [None]:
# 


## Feature Engineering and Encoding

In [None]:
# experience_order = {"Beginner": 0, "Intermediate": 1, "Expert": 2, "Unknown": -1}
# data["staff_experience"] = data["staff_experience"].replace(experience_order)


In [None]:
# data = pd.get_dummies(data, columns=["waste_category", "special_event"], drop_first=True)


In [None]:
# 
data.columns

In [None]:
# write the cleaned frame to the current working directory; index=False keeps
# the row index out of the file
data.to_csv("cleaned_food_waste_data.csv", index=False)
