In [24]:
import pandas as pd
import numpy as np

In [25]:
"""Pandas offers several methods for handling missing values, primarily represented as NaN (Not a Number)."""
#1. Identifying Missing Values:
#The first step is to identify where missing values exist in your DataFrame or Series.

data = pd.read_csv('retail_product_dataset.csv')

print("This is the dataset of retail product customer reviews")

data.head(10) #The dataset contains NaN that means it has null values

This is the dataset of retail product customer reviews


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548.0,1.870322,,0.0
1,,3045.0,4.757798,,38.0
2,,4004.0,,In Stock,0.0
3,,4808.0,1.492085,,33.0
4,,1817.0,,Out of Stock,23.0
5,,3522.0,,,
6,C,667.0,3.668341,In Stock,41.0
7,A,7125.0,4.983998,Out of Stock,7.0
8,A,2777.0,2.678384,In Stock,6.0
9,,463.0,4.626187,,3.0


In [26]:
"""Replacing a single value"""
# this will replace "NaN" with "C"
data['Category'] = data['Category'].replace(np.nan, 'C')
data['Category'].head(20)

Unnamed: 0,Category
0,C
1,C
2,C
3,C
4,C
5,C
6,C
7,A
8,A
9,C


In [27]:
"""replace multiple columns values to one"""
# this will replace "In Stock" and "Out of Stock" with "Stock Limit"
data['Stock'] = data['Stock'].replace(to_replace=["In Stock", "Out of Stock"], value="Stock Limit")
data['Stock'].head(20)

Unnamed: 0,Stock
0,
1,
2,Stock Limit
3,
4,Stock Limit
5,
6,Stock Limit
7,Stock Limit
8,Stock Limit
9,


In [28]:
data['Stock'] = data['Stock'].replace(np.nan,"Within Stock")
data['Stock'].head(20)

Unnamed: 0,Stock
0,Within Stock
1,Within Stock
2,Stock Limit
3,Within Stock
4,Stock Limit
5,Within Stock
6,Stock Limit
7,Stock Limit
8,Stock Limit
9,Within Stock


In [29]:
data['Stock'] = data['Stock'].replace(['Within Stock','Stock Limit'],['In Stock','Out of Limit'])
data['Stock'].head(20)

Unnamed: 0,Stock
0,In Stock
1,In Stock
2,Out of Limit
3,In Stock
4,Out of Limit
5,In Stock
6,Out of Limit
7,Out of Limit
8,Out of Limit
9,In Stock


In [30]:
data.head(5)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,C,5548.0,1.870322,In Stock,0.0
1,C,3045.0,4.757798,In Stock,38.0
2,C,4004.0,,Out of Limit,0.0
3,C,4808.0,1.492085,In Stock,33.0
4,C,1817.0,,Out of Limit,23.0


In [31]:
# this will replace "NaN" with "C"
data['Discount'] = data['Discount'].replace(0 , 3.5)
data['Discount'] = data['Discount'].replace(np.nan , 10.0)
data['Discount'].head(20)

Unnamed: 0,Discount
0,3.5
1,38.0
2,3.5
3,33.0
4,23.0
5,10.0
6,41.0
7,7.0
8,6.0
9,3.0


In [32]:
data['Rating'] = data['Rating'].replace(np.nan, 4.0)
data['Rating'].head(10)

Unnamed: 0,Rating
0,1.870322
1,4.757798
2,4.0
3,1.492085
4,4.0
5,4.0
6,3.668341
7,4.983998
8,2.678384
9,4.626187


In [33]:
"""Using .astype() for single or multiple columns:"""
# Convert 'Rating' column to integer
data['Rating'] = data['Rating'].astype(int)
data['Rating'].head(10)

Unnamed: 0,Rating
0,1
1,4
2,4
3,1
4,4
5,4
6,3
7,4
8,2
9,4


In [34]:
data['Discount'] = data['Discount'].astype(int)
data['Discount'].head(10)

Unnamed: 0,Discount
0,3
1,38
2,3
3,33
4,23
5,10
6,41
7,7
8,6
9,3


In [35]:
data_multi = data.astype({'Discount': int, 'Rating': int})
data_multi.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,C,5548.0,1,In Stock,3
1,C,3045.0,4,In Stock,38
2,C,4004.0,4,Out of Limit,3
3,C,4808.0,1,In Stock,33
4,C,1817.0,4,Out of Limit,23


In [37]:
#pd.to_numeric(), pd.to_datetime(), pd.to_timedelta()
#converting to numeric, datetime, and timedelta types, respectively, and offer more control over error handling.
data['Stock'] = pd.to_numeric(data['Stock'] , errors='coerce')
data['Stock'].head()

Unnamed: 0,Stock
0,
1,
2,
3,
4,


In [38]:
#str methods
#str.lower() - Convert all strings to lowercase
lower_case_data = data['Category'].str.lower()
lower_case_data.head()

Unnamed: 0,Category
0,c
1,c
2,c
3,c
4,c


In [39]:
#str.upper() - Convert all strings to uppercase
upper_case_data = data['Category'].str.upper()
upper_case_data.head()

Unnamed: 0,Category
0,C
1,C
2,C
3,C
4,C


In [16]:
data.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,C,5548.0,1,,3
1,C,3045.0,4,,38
2,C,4004.0,4,,3
3,C,4808.0,1,,33
4,C,1817.0,4,,23


In [None]:
#str.strip() - Remove leading/trailing whitespace
stripped_data = data['Rating'].str.strip()
stripped_data.head()

In [42]:
#str.contains() - Check if a substring is present (returns boolean Series)
contains_stock = data['Category'].str.contains('C')
contains_stock.head(20)

Unnamed: 0,Category
0,True
1,True
2,True
3,True
4,True
5,True
6,True
7,False
8,False
9,True


In [43]:
replaced_data = data['Category'].str.replace('D' , 'A')
replaced_data.head(50)

Unnamed: 0,Category
0,C
1,C
2,C
3,C
4,C
5,C
6,C
7,A
8,A
9,C
