# Data transformation

### In this notebook,

- converting string data to categories
- converting continuous variables to categorical features

In [2]:
import pandas as pd

In [3]:
df1=pd.read_csv("Dataset/employees.csv")
df1

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.170,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,11/23/2014,6:09 AM,132483,16.655,False,Distribution
996,Phillip,Male,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,Male,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,Male,4/20/2013,4:45 PM,60500,11.985,False,Business Development


In [4]:
df1.shape

(1000, 8)

In [5]:
df1.head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
5,Dennis,Male,4/18/1987,1:35 AM,115163,10.125,False,Legal
6,Ruby,Female,8/17/1987,4:20 PM,65476,10.012,True,Product
7,,Female,7/20/2015,10:43 AM,45906,11.598,,Finance
8,Angela,Female,11/22/2005,6:29 AM,95570,18.523,True,Engineering
9,Frances,Female,8/8/2002,6:51 AM,139852,7.524,True,Business Development


In [6]:
df1.Gender.value_counts()

Gender
Female    431
Male      424
Name: count, dtype: int64

In [7]:
df1.Gender.isna().sum() # Counting the number of NA values in the "gender" column

145

In [8]:
pd.Categorical(df1.Gender) # converting "gender" columns into categorical variables 

['Male', 'Male', 'Female', 'Male', 'Male', ..., NaN, 'Male', 'Male', 'Male', 'Male']
Length: 1000
Categories (2, object): ['Female', 'Male']

In [9]:
df1['Gender2']=pd.Categorical(df1.Gender)
df1.head() 

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team,Gender2
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing,Male
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,,Male
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance,Female
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance,Male
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services,Male


In [10]:
df1.dtypes # Although the 'Gender' and 'Gender 2' columns may seem identical, 'Gender 2' is a categorical feature.
# Advantages of categorical features: 1. Reduce memory usage ; 2. Facilitate comparison

First Name             object
Gender                 object
Start Date             object
Last Login Time        object
Salary                  int64
Bonus %               float64
Senior Management      object
Team                   object
Gender2              category
dtype: object

In [25]:
df1.memory_usage() # Categorical features consume less memory space

Index                 128
First Name           8000
Gender               8000
Start Date           8000
Last Login Time      8000
Salary               8000
Bonus %              8000
Senior Management    8000
Team                 8000
Gender2              1124
dtype: int64

In [11]:
import random

In [12]:
pd.Series(random.choices(['very low','low', 'medium', 'high','very high'], k=1000)) 
# Creating 1000 random values because our DataFrame has 1000 values

0            low
1       very low
2      very high
3           high
4           high
         ...    
995       medium
996       medium
997         high
998       medium
999     very low
Length: 1000, dtype: object

In [41]:
df1['Score']=pd.Series(random.choices(['very low','low', 'medium', 'high','very hight'], k=1000))
df1.head()  # adding the column "score"on the dataframe

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team,Gender2,Score
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing,Male,medium
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,,Male,low
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance,Female,medium
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance,Male,high
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services,Male,medium


In [42]:
df1.dtypes

First Name             object
Gender                 object
Start Date             object
Last Login Time        object
Salary                  int64
Bonus %               float64
Senior Management      object
Team                   object
Gender2              category
Score                  object
dtype: object

In [43]:
pd.Categorical(df1.Score) # here the order of labels  is not respected.
                        # by default, categorical data are sorted by alphabetical order

['medium', 'low', 'medium', 'high', 'medium', ..., 'low', 'very hight', 'very low', 'low', 'high']
Length: 1000
Categories (5, object): ['high', 'low', 'medium', 'very hight', 'very low']

In [47]:
pd.Categorical(df1.Score, categories=['very low','low', 'medium', 'high','very high'], ordered=True)
 # passing a the value into order

['medium', 'low', 'medium', 'high', 'medium', ..., 'low', NaN, 'very low', 'low', 'high']
Length: 1000
Categories (5, object): ['very low' < 'low' < 'medium' < 'high' < 'very high']

In [48]:
df1['Score2']=pd.Categorical(df1.Score, categories=['very low','low', 'medium', 'high','very high'], ordered=True)
df1.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team,Gender2,Score,Score2
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing,Male,medium,medium
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,,Male,low,low
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance,Female,medium,medium
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance,Male,high,high
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services,Male,medium,medium


In [51]:
df1.dtypes

First Name             object
Gender                 object
Start Date             object
Last Login Time        object
Salary                  int64
Bonus %               float64
Senior Management      object
Team                   object
Gender2              category
Score                  object
Score2               category
dtype: object

In [52]:
sum(df1.Score2>'medium')

176

In [53]:
df1.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team,Gender2,Score,Score2
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing,Male,medium,medium
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,,Male,low,low
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance,Female,medium,medium
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance,Male,high,high
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services,Male,medium,medium


In [54]:
df1.shape

(1000, 11)

In [55]:
df1.Salary.max()

149908

In [56]:
df1.Salary.min()

35013

In [58]:
# changing salary columns into categories
pd.cut(df1.Salary, bins=[35000,50000,75000,100000,125000,150000],
      labels=['very low','low', 'medium', 'high','very high'])

0         medium
1            low
2      very high
3      very high
4           high
         ...    
995    very high
996     very low
997       medium
998          low
999    very high
Name: Salary, Length: 1000, dtype: category
Categories (5, object): ['very low' < 'low' < 'medium' < 'high' < 'very high']

In [59]:
df1['Range_salary']=pd.cut(df1.Salary, bins=[35000,50000,75000,100000,125000,150000],
      labels=['very low','low', 'medium', 'high','very high'])

In [60]:
df1.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team,Gender2,Score,Score2,Range_salary
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing,Male,medium,medium,medium
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,,Male,low,low,low
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance,Female,medium,medium,very high
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance,Male,high,high,very high
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services,Male,medium,medium,high


In [61]:
df1.memory_usage()

Index                 128
First Name           8000
Gender               8000
Start Date           8000
Last Login Time      8000
Salary               8000
Bonus %              8000
Senior Management    8000
Team                 8000
Gender2              1124
Score                8000
Score2               1212
Range_salary         1212
dtype: int64