# Import libraries

In [39]:
import numpy as np
import pandas as pd

In [40]:
# Configure how much of a DataFrame pandas prints when it is displayed.

# number of rows shown when a long frame is truncated in the output
pd.set_option('display.min_rows', 10) 
# show at most 20 columns before the output is truncated
pd.set_option('display.max_columns', 20)

# Loading Data Into a Pandas DataFrame

Reading CSV file

In [41]:
# read the online store customer data CSV into a DataFrame,
# then preview its first 3 rows to confirm it loaded

df = pd.read_csv('online_store_customer_data.csv')
df.head(3)

Unnamed: 0,Transaction_date,Transaction_ID,Gender,Age,Marital_status,State_names,Segment,Employees_status,Payment_method,Referal,Amount_spent
0,1/1/2019,151200,Female,19.0,Single,Kansas,Basic,Unemployment,Other,1.0,2051.36
1,1/1/2019,151201,Male,49.0,Single,Illinois,Basic,self-employed,Card,0.0,544.04
2,1/1/2019,151202,Male,63.0,Married,New Mexico,Basic,workers,PayPal,1.0,1572.6


# Data Preprocessing

Data preprocessing is the process of turning raw data into clean data. This is the most crucial part of data science. In this section, we will first explore the data, then remove unwanted columns, remove duplicates, handle missing data, etc. After this step, we will have clean data derived from the raw data.

### Data Exploration

In [42]:
# display the first 3 rows
df.head(3)

Unnamed: 0,Transaction_date,Transaction_ID,Gender,Age,Marital_status,State_names,Segment,Employees_status,Payment_method,Referal,Amount_spent
0,1/1/2019,151200,Female,19.0,Single,Kansas,Basic,Unemployment,Other,1.0,2051.36
1,1/1/2019,151201,Male,49.0,Single,Illinois,Basic,self-employed,Card,0.0,544.04
2,1/1/2019,151202,Male,63.0,Married,New Mexico,Basic,workers,PayPal,1.0,1572.6


In [43]:
# display the last 6 rows
df.tail(6)

Unnamed: 0,Transaction_date,Transaction_ID,Gender,Age,Marital_status,State_names,Segment,Employees_status,Payment_method,Referal,Amount_spent
2506,4/30/2021,153694,Male,34.0,Single,Florida,Missing,Employees,Other,1.0,286.82
2507,5/1/2021,153695,Female,57.0,Single,South Carolina,Platinum,self-employed,Card,0.0,150.1
2508,5/1/2021,153696,Female,36.0,Married,Hawaii,Silver,self-employed,PayPal,1.0,708.88
2509,5/1/2021,153697,Male,22.0,Single,South Carolina,Basic,workers,PayPal,1.0,2030.07
2510,5/1/2021,153698,,44.0,Single,New York,Basic,Employees,PayPal,0.0,1909.77
2511,5/1/2021,153699,Male,48.0,Single,California,Silver,workers,PayPal,1.0,1073.15


In [44]:
# Display 6 randomly sampled rows.
# NOTE: no random_state is passed, so a different set of rows is drawn on every run.
df.sample(6)

Unnamed: 0,Transaction_date,Transaction_ID,Gender,Age,Marital_status,State_names,Segment,Employees_status,Payment_method,Referal,Amount_spent
1605,6/25/2020,152793,Female,38.0,Single,Maryland,Basic,Employees,Card,1.0,
1403,4/11/2020,152591,Female,,Married,Pennsylvania,Basic,workers,PayPal,1.0,962.34
882,10/15/2019,152070,Male,51.0,Married,Alabama,Platinum,self-employed,Other,1.0,1367.77
1992,10/27/2020,153180,Male,42.0,Married,Delaware,Missing,self-employed,Card,1.0,190.93
1811,8/30/2020,152999,Male,71.0,Married,Wyoming,Basic,Unemployment,PayPal,0.0,1647.3
1235,2/6/2020,152423,Male,51.0,Married,Ohio,Gold,workers,Other,0.0,295.67


In [45]:
# summarize the frame: index range, non-null count and dtype of each column, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512 entries, 0 to 2511
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction_date  2512 non-null   object 
 1   Transaction_ID    2512 non-null   int64  
 2   Gender            2484 non-null   object 
 3   Age               2470 non-null   float64
 4   Marital_status    2512 non-null   object 
 5   State_names       2512 non-null   object 
 6   Segment           2512 non-null   object 
 7   Employees_status  2486 non-null   object 
 8   Payment_method    2512 non-null   object 
 9   Referal           2357 non-null   float64
 10  Amount_spent      2270 non-null   float64
dtypes: float64(3), int64(1), object(7)
memory usage: 216.0+ KB


In [46]:
# display the data type of each column
df.dtypes

Transaction_date     object
Transaction_ID        int64
Gender               object
Age                 float64
Marital_status       object
State_names          object
Segment              object
Employees_status     object
Payment_method       object
Referal             float64
Amount_spent        float64
dtype: object

In [47]:
# count how many columns there are of each data type
df.dtypes.value_counts()

object     7
float64    3
int64      1
dtype: int64

In [48]:
# dimensions of the frame as (number of rows, number of columns)
df.shape

(2512, 11)

In [49]:
# list the column labels
df.columns

Index(['Transaction_date', 'Transaction_ID', 'Gender', 'Age', 'Marital_status',
       'State_names', 'Segment', 'Employees_status', 'Payment_method',
       'Referal', 'Amount_spent'],
      dtype='object')

In [50]:
# display the first 3 rows of the Age column
df['Age'].head(3)

0    19.0
1    49.0
2    63.0
Name: Age, dtype: float64

In [51]:
# display the first 4 rows of the Age, Transaction_date and Gender columns
df[['Age', 'Transaction_date', 'Gender']].head(4)

Unnamed: 0,Age,Transaction_date,Gender
0,19.0,1/1/2019,Female
1,49.0,1/1/2019,Male
2,63.0,1/1/2019,Male
3,18.0,1/1/2019,


In [52]:
# Row slicing by position. Only the last expression in the cell (df[-2:]) is
# rendered as output; the first two slices are evaluated and discarded.

# 3rd to 7th rows (positions 2 through 6; the stop position 7 is excluded)
df[2:7]

# first 15 rows (positions 0 through 14)
df[:15]

# last two rows
df[-2:]

Unnamed: 0,Transaction_date,Transaction_ID,Gender,Age,Marital_status,State_names,Segment,Employees_status,Payment_method,Referal,Amount_spent
2510,5/1/2021,153698,,44.0,Single,New York,Basic,Employees,PayPal,0.0,1909.77
2511,5/1/2021,153699,Male,48.0,Single,California,Silver,workers,PayPal,1.0,1073.15
