## Milestone 1: Load, clean, and normalize the PriceRunner product data

In [1]:
import os

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Load the dataset
# The CSV location can be overridden with the PRICERUNNER_CSV environment
# variable so the notebook is not tied to one machine's folder layout; the
# original local path remains the default.
url = os.environ.get(
    "PRICERUNNER_CSV",
    "C:/Users/divya/Downloads/product+classification+and+clustering/pricerunner_aggregate.csv",
)
product_df = pd.read_csv(url)
# NOTE: several column names in this file carry a leading space
# (e.g. ' Cluster ID', ' Category ID'); later cells reference those exact names.

# Examine the dataset to find fundamental information and missing values.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly -- wrapping it in print() appends a stray "None" line to the output.
print("Dataset Info:")
product_df.info()
print("\nMissing Values in Each Column:")
print(product_df.isnull().sum())
print("\nSummary Statistics:")
print(product_df.describe())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35311 entries, 0 to 35310
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Product ID       35311 non-null  int64 
 1   Product Title    35311 non-null  object
 2    Merchant ID     35311 non-null  int64 
 3    Cluster ID      35311 non-null  int64 
 4    Cluster Label   35311 non-null  object
 5    Category ID     35311 non-null  int64 
 6    Category Label  35311 non-null  object
dtypes: int64(4), object(3)
memory usage: 1.9+ MB
None

Missing Values in Each Column:
Product ID         0
Product Title      0
 Merchant ID       0
 Cluster ID        0
 Cluster Label     0
 Category ID       0
 Category Label    0
dtype: int64

Summary Statistics:
         Product ID   Merchant ID    Cluster ID   Category ID
count  35311.000000  35311.000000  35311.000000  35311.000000
mean   26150.800176    120.501883  30110.687633   2618.142930
std    13498.191220

In [7]:
# Deal with Missing Values
# Fill in the missing values: use the mode for categories and the median for numeric columns.
numeric_cols = product_df.select_dtypes(include=[np.number]).columns
categorical_cols = product_df.select_dtypes(include=['object']).columns


def _fill_with_mode(column):
    """Return `column` with missing entries replaced by its most frequent value.

    A column that has no non-missing values has an empty mode; it is returned
    unchanged instead of raising when the first mode is looked up.
    """
    modes = column.mode()
    if modes.empty:
        return column
    return column.fillna(modes.iloc[0])


# Numeric columns: per-column median. Categorical columns: per-column mode.
product_df[numeric_cols] = product_df[numeric_cols].fillna(product_df[numeric_cols].median())
product_df[categorical_cols] = product_df[categorical_cols].apply(_fill_with_mode)

In [8]:
# Rescale the selected numeric columns with min-max scaling.
# 'Cluster ID' and 'Category ID' are used here on the assumption that they are
# numeric features; substitute the real feature names as necessary.
features_to_normalize = [' Cluster ID', ' Category ID']

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(product_df[features_to_normalize])

# Write the scaled values back over the original columns.
product_df[features_to_normalize] = scaled_values

In [9]:
# Eliminate Outliers
# Keep only the rows whose absolute z-score stays below the threshold in every
# normalized feature.
Z_SCORE_THRESHOLD = 3
feature_z_scores = np.abs(stats.zscore(product_df[features_to_normalize]))
# A column with zero spread yields NaN z-scores. "not (z >= threshold)" keeps
# those rows, whereas "z < threshold" is False for NaN and would discard every
# row. For non-NaN z-scores the two tests are equivalent.
is_inlier = ~(feature_z_scores >= Z_SCORE_THRESHOLD)
product_df = product_df[is_inlier.all(axis=1)]

# Last Verification: Show the data once it has been cleaned and look for any missing values.
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
print("\nCleaned Dataset Info:")
product_df.info()
print("\nRemaining Missing Values:")
print(product_df.isnull().sum())



Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35311 entries, 0 to 35310
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Product ID       35311 non-null  int64  
 1   Product Title    35311 non-null  object 
 2    Merchant ID     35311 non-null  int64  
 3    Cluster ID      35311 non-null  float64
 4    Cluster Label   35311 non-null  object 
 5    Category ID     35311 non-null  float64
 6    Category Label  35311 non-null  object 
dtypes: float64(2), int64(2), object(3)
memory usage: 1.9+ MB
None

Remaining Missing Values:
Product ID         0
Product Title      0
 Merchant ID       0
 Cluster ID        0
 Cluster Label     0
 Category ID       0
 Category Label    0
dtype: int64
