In [21]:
import pandas as pd
import numpy as np

In [22]:
"""Pandas offers several methods for handling missing values, primarily represented as NaN (Not a Number)."""
# 1. Identifying Missing Values
# Before anything can be cleaned, find out where the DataFrame has gaps.

# The path is relative, so the CSV is looked up in the kernel's working directory.
DATASET_PATH = 'retail_product_dataset.csv'
data = pd.read_csv(DATASET_PATH)

print("This is the dataset of retail product customer reviews")

# Preview the first ten rows; NaN cells mark missing (null) values.
data.head(10)

This is the dataset of retail product customer reviews


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548.0,1.870322,,0.0
1,,3045.0,4.757798,,38.0
2,,4004.0,,In Stock,0.0
3,,4808.0,1.492085,,33.0
4,,1817.0,,Out of Stock,23.0
5,,3522.0,,,
6,C,667.0,3.668341,In Stock,41.0
7,A,7125.0,4.983998,Out of Stock,7.0
8,A,2777.0,2.678384,In Stock,6.0
9,,463.0,4.626187,,3.0


In [23]:
# Distinct values in Category; a nan entry in the result means the column has missing values.
data['Category'].unique()

array([nan, 'C', 'A', 'B', 'D'], dtype=object)

In [24]:
# Distinct values in Stock; a nan entry in the result means the column has missing values.
data['Stock'].unique()

array([nan, 'In Stock', 'Out of Stock'], dtype=object)

In [25]:
# (rows, columns) of the loaded frame: the baseline to compare against after cleaning.
data.shape

(4362, 5)

In [26]:
# info() lists the non-null count of every column; any count below the total number
# of entries means that column has missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4362 entries, 0 to 4361
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  1614 non-null   object 
 1   Price     4188 non-null   float64
 2   Rating    2312 non-null   float64
 3   Stock     3010 non-null   object 
 4   Discount  3970 non-null   float64
dtypes: float64(3), object(2)
memory usage: 170.5+ KB


In [27]:
# Finding missing values with isna(): it flags every missing cell, and sum() then
# counts the flags per column.
data.isna().sum()

Unnamed: 0,0
Category,2748
Price,174
Rating,2050
Stock,1352
Discount,392


In [28]:
"""Dropping Missing Values:"""
# You can remove rows or columns containing missing values using dropna().
# With its default axis it drops every row that has at least one NaN and returns a
# new frame; `data` itself is left untouched.
data_dropped_rows = data.dropna()
print("\nDataFrame after dropping rows with any missing values:")
# A notebook cell renders only its last expression, so a bare head() followed by
# .shape never showed the preview. Print the shape and end the cell with head().
print(data_dropped_rows.shape)
data_dropped_rows.head()


DataFrame after dropping rows with any missing values:


(540, 5)

In [29]:
# axis='columns' (the same as axis=1) drops every column that contains at least one NaN.
# If every column has a missing value, only the index is left.
data_dropped_cols = data.dropna(axis='columns')
print("\nDataFrame after dropping columns with any missing values:")
data_dropped_cols.head()


DataFrame after dropping columns with any missing values:


0
1
2
3
4


In [30]:
# dropna() returned new frames above, so `data` itself still holds its NaN values.
data.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548.0,1.870322,,0.0
1,,3045.0,4.757798,,38.0
2,,4004.0,,In Stock,0.0
3,,4808.0,1.492085,,33.0
4,,1817.0,,Out of Stock,23.0


In [31]:
""""Filling Missing Values (Imputation):"""
# fillna() replaces missing values with a fixed value or a computed one (mean, median, ...).
# Here every NaN, in every column, becomes 0. That includes the text columns, which
# then hold a numeric 0 next to their string labels.
filled_zero = data.fillna(value=0)
print("\nDataFrame after filling NaN with 0:")
filled_zero.head()


DataFrame after filling NaN with 0:


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,0,5548.0,1.870322,0,0.0
1,0,3045.0,4.757798,0,38.0
2,0,4004.0,0.0,In Stock,0.0
3,0,4808.0,1.492085,0,33.0
4,0,1817.0,0.0,Out of Stock,23.0


In [15]:
# fillna(0) was assigned to a separate frame, so `data` itself still holds its NaN values.
data.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548.0,1.870322,,0.0
1,,3045.0,4.757798,,38.0
2,,4004.0,,In Stock,0.0
3,,4808.0,1.492085,,33.0
4,,1817.0,,Out of Stock,23.0


In [16]:
# Column labels of the frame, used below to impute one column at a time.
data.columns

Index(['Category', 'Price', 'Rating', 'Stock', 'Discount'], dtype='object')

In [40]:
# Mean of the Price column, kept in mean_Price for the imputation step that follows.
# Series.mean() skips NaN by default, so missing prices do not affect the result.
mean_Price = data['Price'].mean()
mean_Price

np.float64(5016.970630372492)

In [41]:
# Fill NaN in a specific column with its mean.
# Numerical columns are typically filled with the mean; categorical columns with the mode.
# This overwrites data['Price'] in place. Work on a data.copy() instead if the
# original values are still needed.
# NOTE(review): fillna() only replaces NaN, yet the saved output shows the same Price
# in every row. Presumably the cell ran against stale kernel state; re-run from a
# fresh kernel to confirm.
price_filled = data['Price'].fillna(value=mean_Price)
data['Price'] = price_filled
data.head(20)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5016.97063,1.870322,,0.0
1,,5016.97063,4.757798,,38.0
2,,5016.97063,,In Stock,0.0
3,,5016.97063,1.492085,,33.0
4,,5016.97063,,Out of Stock,23.0
5,,5016.97063,,,
6,C,5016.97063,3.668341,In Stock,41.0
7,A,5016.97063,4.983998,Out of Stock,7.0
8,A,5016.97063,2.678384,In Stock,6.0
9,,5016.97063,4.626187,,3.0


In [42]:
# Re-count missing values per column after the Price fill (isnull() is an alias of isna()).
data.isnull().sum()

Unnamed: 0,0
Category,2748
Price,0
Rating,2050
Stock,1352
Discount,392


In [36]:
# Most frequent Category value(s). mode() returns a Series because several values
# can tie for first place.
mode_Category = data['Category'].mode()
mode_Category

Unnamed: 0,Category
0,C


In [43]:
# Numerical columns are filled with the mean; categorical columns with the mode.
# mode() returns a Series, so [0] picks its first entry.
# This overwrites data['Category'] in place. Work on a data.copy() instead if the
# original values are still needed.
top_category = data['Category'].mode()[0]
data['Category'] = data['Category'].fillna(top_category)
data.head(20)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,C,5016.97063,1.870322,,0.0
1,C,5016.97063,4.757798,,38.0
2,C,5016.97063,,In Stock,0.0
3,C,5016.97063,1.492085,,33.0
4,C,5016.97063,,Out of Stock,23.0
5,C,5016.97063,,,
6,C,5016.97063,3.668341,In Stock,41.0
7,A,5016.97063,4.983998,Out of Stock,7.0
8,A,5016.97063,2.678384,In Stock,6.0
9,C,5016.97063,4.626187,,3.0


In [44]:
# Re-count missing values per column after the Category fill.
data.isnull().sum()

Unnamed: 0,0
Category,0
Price,0
Rating,2050
Stock,1352
Discount,392


In [45]:
# Impute the numerical Rating column with its mean (mean() ignores NaN by default).
# The mean is printed so the fill value is visible next to the preview.
rating = data['Rating']
mean_Rating = rating.mean()
print(mean_Rating)
data['Rating'] = rating.fillna(mean_Rating)
data.head()

3.0382925191086185


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,C,5016.97063,1.870322,,0.0
1,C,5016.97063,4.757798,,38.0
2,C,5016.97063,3.038293,In Stock,0.0
3,C,5016.97063,1.492085,,33.0
4,C,5016.97063,3.038293,Out of Stock,23.0


In [46]:
# Re-count missing values per column after the Rating fill.
data.isnull().sum()

Unnamed: 0,0
Category,0
Price,0
Rating,0
Stock,1352
Discount,392


In [47]:
# Numerical columns are filled with the mean; categorical columns with the mode.
# Stock is categorical, so its NaN cells get the most frequent label.
# mode() returns a Series, so [0] picks its first entry.
top_stock = data['Stock'].mode()[0]
data['Stock'] = data['Stock'].fillna(top_stock)
data.head(20)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,C,5016.97063,1.870322,In Stock,0.0
1,C,5016.97063,4.757798,In Stock,38.0
2,C,5016.97063,3.038293,In Stock,0.0
3,C,5016.97063,1.492085,In Stock,33.0
4,C,5016.97063,3.038293,Out of Stock,23.0
5,C,5016.97063,3.038293,In Stock,
6,C,5016.97063,3.668341,In Stock,41.0
7,A,5016.97063,4.983998,Out of Stock,7.0
8,A,5016.97063,2.678384,In Stock,6.0
9,C,5016.97063,4.626187,In Stock,3.0


In [48]:
# Fill NaN in the Discount column with its median (not the mean used for Price and
# Rating above). The median is printed so the fill value is visible next to the preview.
discount = data['Discount']
Median_Discount = discount.median()
print(Median_Discount)
data['Discount'] = discount.fillna(Median_Discount)
data.head(20)

25.0


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,C,5016.97063,1.870322,In Stock,0.0
1,C,5016.97063,4.757798,In Stock,38.0
2,C,5016.97063,3.038293,In Stock,0.0
3,C,5016.97063,1.492085,In Stock,33.0
4,C,5016.97063,3.038293,Out of Stock,23.0
5,C,5016.97063,3.038293,In Stock,25.0
6,C,5016.97063,3.668341,In Stock,41.0
7,A,5016.97063,4.983998,Out of Stock,7.0
8,A,5016.97063,2.678384,In Stock,6.0
9,C,5016.97063,4.626187,In Stock,3.0


In [49]:
"""By default, drop_duplicates() removes rows that are exact duplicates across all columns.
 It keeps the first occurrence of a duplicate set and discards the rest."""
# keep='first' is the default, spelled out here: the first row of each duplicate
# group survives. The result is a new frame; `data` is not modified.
data_cleaned = data.drop_duplicates(keep='first')
print("\nDataFrame after dropping all exact duplicates:")
# count() reports the non-null entries per column of the de-duplicated frame.
data_cleaned.count()


DataFrame after dropping all exact duplicates:


Unnamed: 0,0
Category,2651
Price,2651
Rating,2651
Stock,2651
Discount,2651


In [50]:
# drop_duplicates() was assigned to data_cleaned, so `data` keeps its original row count.
data.shape

(4362, 5)