<a href="https://colab.research.google.com/github/chaitalisaha06/CSI_Assignments/blob/main/Data_Preprocessing_%26_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the dataset stored
# there can be read by file path in the next cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the Titanic CSV from the mounted Drive folder
titanic_csv_path = "/content/drive/MyDrive/titanic.csv"
df = pd.read_csv(titanic_csv_path)

# Preview the top rows to confirm the file loaded as expected
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Summarise column dtypes and non-null counts to spot columns with gaps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Count missing values in each column
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Step 1: Handling Missing Values

In [None]:
# Replace missing ages with the median age. The imputer returns a 2-D
# array, so it is written back through a matching one-column selection.
age_imputer = SimpleImputer(strategy='median')
imputed_age = age_imputer.fit_transform(df[['Age']])
df[['Age']] = imputed_age

In [None]:
# Fill missing Embarked values with the most common value (mode).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['Embarked'] selection: that form is a
# chained assignment, which raises a FutureWarning on pandas 2.x and
# leaves df unchanged once Copy-on-Write is active.
most_common_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)

In [None]:
# Cabin has too many missing values to impute sensibly, so remove the column
df.drop('Cabin', axis=1, inplace=True)

In [None]:
# Re-count missing values per column after the imputation and drop steps
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

## Step 2: Feature Encoding

In [None]:
# Convert the Sex strings into integer codes
label_encoder = LabelEncoder()
sex_codes = label_encoder.fit_transform(df['Sex'])
df['Sex'] = sex_codes

# Show the distinct codes that were produced
df['Sex'].unique()

array([1, 0])

In [None]:
# One-hot encode the Embarked column, dropping the first level so the
# remaining indicator columns are not redundant.
# get_dummies is asked for int64 indicators directly. The previous approach
# ran LabelEncoder over each boolean dummy column; LabelEncoder is intended
# for target labels, and on a column holding a single value it would emit 0
# regardless of whether that value was True or False.
label_encoder = LabelEncoder()  # rebinding kept so later cells still see a fresh `label_encoder`
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True, dtype='int64')

In [None]:
# Confirm the column dtypes and non-null counts after encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked_Q   891 non-null    int64  
 11  Embarked_S   891 non-null    int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 83.7+ KB


## Step 3: Feature Engineering

In [None]:
# FamilySize = SibSp + Parch, plus one for the passenger themself
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

In [None]:
# IsAlone is 0 when FamilySize exceeds 1, otherwise 1
travels_with_family = df['FamilySize'] > 1
df['IsAlone'] = np.where(travels_with_family, 0, 1)

In [None]:
# Extract the title from Name: the text between the first comma and the
# following period (e.g. "Braund, Mr. Owen Harris" -> "Mr").
df['Title'] = (
    df['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
)

# Simplify titles by grouping them into a handful of categories.
# The extraction yields 'the Countess' (with the article), so that exact
# spelling needs its own key; without it the title mapped to NaN and was
# then label-encoded as a spurious extra class.
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare', 'Mlle': 'Miss', 'Countess': 'Royal',
    'the Countess': 'Royal',
    'Ms': 'Miss', 'Lady': 'Royal', 'Jonkheer': 'Royal', 'Don': 'Royal', 'Dona': 'Royal',
    'Mme': 'Mrs', 'Capt': 'Rare', 'Sir': 'Royal'
}
# Any title missing from the mapping falls back to 'Rare' instead of NaN
df['Title'] = df['Title'].map(title_mapping).fillna('Rare')
print(df['Title'].unique())

# Encode the grouped titles as integer codes. A fresh encoder is created
# here so this cell does not depend on one left over from an earlier cell.
label_encoder = LabelEncoder()
df['Title'] = label_encoder.fit_transform(df['Title'])
df['Title'].unique()

['Mr' 'Mrs' 'Miss' 'Master' 'Royal' 'Rare' nan]


array([2, 3, 1, 0, 5, 4, 6])

In [None]:
# Remove the columns that will not be used for prediction
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

## Step 4: Normalization/Scaling

In [None]:
# Standardise the 'Age' and 'Fare' columns with StandardScaler
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[['Age', 'Fare']])
df[['Age', 'Fare']] = scaled_values


In [None]:
# Final check of column dtypes and non-null counts after all preprocessing steps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    int64  
 3   Age         891 non-null    float64
 4   SibSp       891 non-null    int64  
 5   Parch       891 non-null    int64  
 6   Fare        891 non-null    float64
 7   Embarked_Q  891 non-null    int64  
 8   Embarked_S  891 non-null    int64  
 9   FamilySize  891 non-null    int64  
 10  IsAlone     891 non-null    int64  
 11  Title       891 non-null    int64  
dtypes: float64(2), int64(10)
memory usage: 83.7 KB


In [None]:
# Preview the first rows of the fully preprocessed frame
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,FamilySize,IsAlone,Title
0,0,3,1,-0.565736,1,0,-0.502445,0,1,2,0,2
1,1,1,0,0.663861,1,0,0.786845,0,0,2,0,3
2,1,3,0,-0.258337,0,0,-0.488854,0,1,1,1,1
3,1,1,0,0.433312,1,0,0.42073,0,1,2,0,3
4,0,3,1,0.433312,0,0,-0.486337,0,1,1,1,2
