# Data Preprocessing

This notebook prepares a small retail sales dataset (15 sample orders defined inline) for analysis.
It covers data validation, data-quality checks, and basic feature engineering
so that the dataset is ready for exploratory and aggregation-based analysis.


In [1]:
import pandas as pd

# Sample retail orders, written one tuple per order so that each record can be
# read across all of its fields at once:
# (ProductCategory, Region, UnitsSold, UnitPrice, CustomerRating)
order_rows = [
    ('Electronics', 'East',  5, 200, 'High'),
    ('Furniture',   'West',  2, 450, 'Medium'),
    ('Clothing',    'East',  7,  50, 'Low'),
    ('Electronics', 'North', 3, 220, 'High'),
    ('Electronics', 'South', 8,  60, 'Low'),
    ('Furniture',   'East',  4, 430, 'Medium'),
    ('Electronics', 'South', 6, 210, 'High'),
    ('Clothing',    'East',  5,  55, 'Low'),
    ('Furniture',   'North', 3, 400, 'Medium'),
    ('Electronics', 'West',  7, 230, 'High'),
    ('Clothing',    'South', 2,  65, 'Low'),
    ('Furniture',   'East',  4, 420, 'Medium'),
    ('Electronics', 'North', 6, 205, 'Medium'),
    ('Clothing',    'West',  5,  70, 'Low'),
    ('Furniture',   'South', 3, 410, 'Medium'),
]

# Transpose the rows into one list per column.
categories, regions, units_sold, unit_prices, ratings = (
    list(column) for column in zip(*order_rows)
)

# Column-oriented mapping the frame is built from; OrderID numbers the orders
# sequentially starting at 1.
data = {
    'OrderID': range(1, len(order_rows) + 1),
    'ProductCategory': categories,
    'Region': regions,
    'UnitsSold': units_sold,
    'UnitPrice': unit_prices,
    'CustomerRating': ratings,
}

df = pd.DataFrame(data)

# Preview the first five orders.
df.head()


Unnamed: 0,OrderID,ProductCategory,Region,UnitsSold,UnitPrice,CustomerRating
0,1,Electronics,East,5,200,High
1,2,Furniture,West,2,450,Medium
2,3,Clothing,East,7,50,Low
3,4,Electronics,North,3,220,High
4,5,Electronics,South,8,60,Low


In [2]:
# Dataset structure and types: print the index range, each column's
# non-null count and dtype, and the frame's memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   OrderID          15 non-null     int64 
 1   ProductCategory  15 non-null     object
 2   Region           15 non-null     object
 3   UnitsSold        15 non-null     int64 
 4   UnitPrice        15 non-null     int64 
 5   CustomerRating   15 non-null     object
dtypes: int64(3), object(3)
memory usage: 852.0+ bytes


In [3]:
# Missing values check: count the null entries in each column.
df.isna().sum()


Unnamed: 0,0
OrderID,0
ProductCategory,0
Region,0
UnitsSold,0
UnitPrice,0
CustomerRating,0


In [4]:
# Duplicate records check.
# First confirm that OrderID is unique: the full-row comparison below only
# flags rows that match in every column, so it would not catch two rows that
# share an OrderID but differ in any other field.
assert df['OrderID'].is_unique, "OrderID values must be unique"

# Number of rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()


np.int64(0)

In [5]:
# Feature engineering: add a Revenue column to support business-focused analysis.
# Revenue for an order is the number of units sold times the price per unit.
df['Revenue'] = df['UnitsSold'].mul(df['UnitPrice'])

# Preview the first five orders with the new column.
df.head()



Unnamed: 0,OrderID,ProductCategory,Region,UnitsSold,UnitPrice,CustomerRating,Revenue
0,1,Electronics,East,5,200,High,1000
1,2,Furniture,West,2,450,Medium,900
2,3,Clothing,East,7,50,Low,350
3,4,Electronics,North,3,220,High,660
4,5,Electronics,South,8,60,Low,480


In [6]:
# Categorical value validation: list every distinct ProductCategory label with
# its number of orders, so misspelled or unexpected labels stand out.
category_counts = df['ProductCategory'].value_counts()
category_counts


Unnamed: 0_level_0,count
ProductCategory,Unnamed: 1_level_1
Electronics,6
Furniture,5
Clothing,4


In [7]:
# List every distinct Region label with its number of orders.
region_counts = df['Region'].value_counts()
region_counts


Unnamed: 0_level_0,count
Region,Unnamed: 1_level_1
East,5
South,4
West,3
North,3


In [8]:
# List every distinct CustomerRating label with its number of orders.
rating_counts = df['CustomerRating'].value_counts()
rating_counts


Unnamed: 0_level_0,count
CustomerRating,Unnamed: 1_level_1
Medium,6
Low,5
High,4


In [9]:
# Final dataset summary: descriptive statistics for every column, numeric and
# categorical alike (statistics that do not apply to a column's type are NaN).
# The dataset is considered clean, well-structured, and ready for exploratory
# and aggregation-based analysis.
# NOTE(review): order 5 is Electronics at UnitPrice 60, while the other
# Electronics orders are priced 200-230 -- confirm that value is intended.
summary_stats = df.describe(include='all')
summary_stats


Unnamed: 0,OrderID,ProductCategory,Region,UnitsSold,UnitPrice,CustomerRating,Revenue
count,15.0,15,15,15.0,15.0,15,15.0
unique,,3,4,,,3,
top,,Electronics,East,,,Medium,
freq,,6,5,,,6,
mean,8.0,,,4.666667,231.666667,,938.333333
std,4.472136,,,1.877181,154.118075,,536.708443
min,1.0,,,2.0,50.0,,130.0
25%,4.5,,,3.0,67.5,,415.0
50%,8.0,,,5.0,210.0,,1000.0
75%,11.5,,,6.0,405.0,,1245.0
