## Label Encoder
- Pros:
  - Simple to implement.
  - Does not increase the dimensionality of the dataset.

- Cons:
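   - It replaces each category with an arbitrary integer code, which introduces an order (e.g., 0 < 1 < 2) that may not exist in the data; this can mislead algorithms that treat the codes as numeric magnitudes, especially linear models.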

In [25]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings 
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-conversion
# warnings raised by pandas / scikit-learn in later cells — consider narrowing
# it to specific warning categories.
warnings.filterwarnings('ignore')

In [26]:
# Location of the Titanic training CSV.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
titanic_csv_path = r"C:\Users\Admin\Desktop\Preprocessing\Handle Missing Values\titanic-train.csv"
df = pd.read_csv(titanic_csv_path)

In [27]:
# Preview the first rows to check that the file was parsed into the expected columns.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [29]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)
# Take explicit copies so each split is an independent DataFrame: the column
# assignments in later cells then modify the split itself instead of being
# treated by pandas as writes to a slice of df.
train_data, test_data = train_data.copy(), test_data.copy()

In [30]:
# Most Cabin values are missing, so treat "no cabin recorded" as its own category.
# Assign the filled column back rather than calling fillna(inplace=True) on
# train_data['Cabin']: that chained form operates on an intermediate Series and
# leaves the frame unchanged when pandas Copy-on-Write is active.
train_data['Cabin'] = train_data['Cabin'].fillna('Unknown')
test_data['Cabin'] = test_data['Cabin'].fillna('Unknown')

In [31]:
# Impute missing ages with the median of the training split only, so that no
# statistic from the test split is used during preprocessing.
# The median is computed once, before either column is modified, and the filled
# columns are assigned back (chained fillna(inplace=True) does not update the
# frame when pandas Copy-on-Write is active).
train_age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(train_age_median)
test_data['Age'] = test_data['Age'].fillna(train_age_median)

In [32]:
# Impute missing ports with the most frequent value of the training split
# (mode() returns a Series; [0] takes its first entry).
most_frequent_embarked = train_data['Embarked'].mode()[0]
# Assign the filled columns back instead of using chained fillna(inplace=True),
# which does not update the frame when pandas Copy-on-Write is active.
train_data['Embarked'] = train_data['Embarked'].fillna(most_frequent_embarked)
test_data['Embarked'] = test_data['Embarked'].fillna(most_frequent_embarked)

In [33]:
# Single encoder instance: fitted on the training split, then reused for the test split.
label_encoder = LabelEncoder()

In [34]:
# Single brackets select the column as a 1-D Series (string values before encoding).
train_data['Sex']

445      male
650      male
172    female
450      male
314      male
        ...  
106    female
270      male
860      male
435    female
102      male
Name: Sex, Length: 623, dtype: object

In [35]:
# Category frequencies before encoding: the column holds two labels, 'male' and 'female'.
train_data['Sex'].value_counts()

Sex
male      410
female    213
Name: count, dtype: int64

In [36]:
# NOTE(review): repeats the earlier display of this column unchanged; shown here
# for comparison with the double-bracket selection in the next cell.
train_data['Sex']

445      male
650      male
172    female
450      male
314      male
        ...  
106    female
270      male
860      male
435    female
102      male
Name: Sex, Length: 623, dtype: object

In [37]:
train_data[['Sex']]           # double brackets select a one-column DataFrame (2-D), not a Series

Unnamed: 0,Sex
445,male
650,male
172,female
450,male
314,male
...,...
106,female
270,male
860,male
435,female


In [38]:
# Fit the encoder on the training split only and overwrite Sex with its integer
# codes ('female' -> 0, 'male' -> 1).
# LabelEncoder operates on a 1-D array, so pass the Series train_data['Sex']
# rather than the one-column DataFrame train_data[['Sex']]; this also matches
# how the test split is transformed.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; for input
# features OrdinalEncoder / OneHotEncoder are the intended tools.
train_data['Sex'] = label_encoder.fit_transform(train_data['Sex'])

In [39]:
# Check the result: Sex now holds integer codes instead of strings.
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,446,1,1,"Dodge, Master. Washington",1,4.0,0,2,33638,81.8583,A34,S
650,651,0,3,"Mitkoff, Mr. Mito",1,28.0,0,0,349221,7.8958,Unknown,S
172,173,1,3,"Johnson, Miss. Eleanor Ileen",0,1.0,1,1,347742,11.1333,Unknown,S
450,451,0,2,"West, Mr. Edwy Arthur",1,36.0,1,2,C.A. 34651,27.7500,Unknown,S
314,315,0,2,"Hart, Mr. Benjamin",1,43.0,1,1,F.C.C. 13529,26.2500,Unknown,S
...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,1,3,"Salkjelsvik, Miss. Anna Kristine",0,21.0,0,0,343120,7.6500,Unknown,S
270,271,0,1,"Cairns, Mr. Alexander",1,28.0,0,0,113798,31.0000,Unknown,S
860,861,0,3,"Hansen, Mr. Claus Peter",1,41.0,2,0,350026,14.1083,Unknown,S
435,436,1,1,"Carter, Miss. Lucile Polk",0,14.0,1,2,113760,120.0000,B96 B98,S


In [40]:
# The test split has not been encoded yet, so Sex still holds the original strings.
test_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,28.0,1,1,2661,15.2458,Unknown,C
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5000,Unknown,S
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.9250,Unknown,S
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0000,Unknown,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,Unknown,C
...,...,...,...,...,...,...,...,...,...,...,...,...
821,822,1,3,"Lulic, Mr. Nikola",male,27.0,0,0,315098,8.6625,Unknown,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,28.0,0,0,112052,0.0000,Unknown,S
456,457,0,1,"Millet, Mr. Francis Davis",male,65.0,0,0,13509,26.5500,E38,S
500,501,0,3,"Calic, Mr. Petar",male,17.0,0,0,315086,8.6625,Unknown,S


In [41]:
# Apply the mapping learned from the training split (transform only, no re-fit)
# so that both splits share the same integer codes.
test_sex_codes = label_encoder.transform(test_data['Sex'])
test_data['Sex'] = test_sex_codes

In [42]:
# Confirm that Sex in the test split now holds integer codes.
test_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",1,28.0,1,1,2661,15.2458,Unknown,C
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",1,31.0,0,0,C.A. 18723,10.5000,Unknown,S
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",1,20.0,0,0,SOTON/O2 3101287,7.9250,Unknown,S
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",0,6.0,0,1,248727,33.0000,Unknown,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",0,14.0,1,0,2651,11.2417,Unknown,C
...,...,...,...,...,...,...,...,...,...,...,...,...
821,822,1,3,"Lulic, Mr. Nikola",1,27.0,0,0,315098,8.6625,Unknown,S
633,634,0,1,"Parr, Mr. William Henry Marsh",1,28.0,0,0,112052,0.0000,Unknown,S
456,457,0,1,"Millet, Mr. Francis Davis",1,65.0,0,0,13509,26.5500,E38,S
500,501,0,3,"Calic, Mr. Petar",1,17.0,0,0,315086,8.6625,Unknown,S


## One-Hot Encoding