In [29]:
import pandas as pd
import numpy as np

# Loading the dataset

In [30]:
# Read the Titanic passenger records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Titanic-Dataset.csv')

# Handling Null Values

In [31]:
# Report the number of missing values per column before any cleaning.
print(df.isna().sum())

# 'Embarked': replace missing entries with the column's most frequent value,
# then report the per-column missing counts again.
df['Embarked'] = df['Embarked'].fillna(
    value=df['Embarked'].mode()[0],
)
print(df.isna().sum())

# 'Cabin': no imputation -- every row lacking a cabin value is discarded,
# followed by a final report of the remaining missing counts.
df = df.dropna(subset=['Cabin'])
print(df.isna().sum())


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age            19
SibSp           0
Parch           0
Ticket          0
Fare            0
Cabin           0
Embarked        0
dtype: int64


# Handling Duplicates

In [32]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
print(df.duplicated().sum())

# Drop duplicate rows, keeping the first occurrence of each. Previously this
# step only called df.count(), so duplicates were reported but never removed.
df = df.drop_duplicates()

# Per-column non-null counts after de-duplication (displayed as the cell output).
df.count()


0


PassengerId    204
Survived       204
Pclass         204
Name           204
Sex            204
Age            185
SibSp          204
Parch          204
Ticket         204
Fare           204
Cabin          204
Embarked       204
dtype: int64

# Handling Outliers

In [33]:
import numpy as np

# Function to remove outliers using the IQR method
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive. Rows whose value is NaN fail the range check
    and are therefore not returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used for the outlier test.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels kept) within the fences.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    # Series.between is inclusive on both ends by default, i.e. >= low and <= high.
    within_fences = values.between(fence_low, fence_high)
    return df[within_fences]

# Filter out rows whose 'Fare' is an outlier, using the remove_outliers function.
df = remove_outliers(df=df, column='Fare')

# Summarise the remaining rows, dtypes and non-null counts.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 187 entries, 1 to 889
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  187 non-null    int64  
 1   Survived     187 non-null    int64  
 2   Pclass       187 non-null    int64  
 3   Name         187 non-null    object 
 4   Sex          187 non-null    object 
 5   Age          169 non-null    float64
 6   SibSp        187 non-null    int64  
 7   Parch        187 non-null    int64  
 8   Ticket       187 non-null    object 
 9   Fare         187 non-null    float64
 10  Cabin        187 non-null    object 
 11  Embarked     187 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 19.0+ KB


# Scaling and Normalization

In [34]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standard scaling for 'Fare': fit the scaler on the column, then overwrite the
# column with the transformed values.
scaler = StandardScaler()
scaler.fit(df[['Fare']])
df['Fare'] = scaler.transform(df[['Fare']])

# Min-Max scaling for 'Age' (scaler defaults), using the same fit-then-transform
# pattern and overwriting the column in place.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(df[['Age']])
df['Age'] = min_max_scaler.transform(df[['Age']])

# Encoding Categorical Variables

In [35]:
# Expand the categorical 'Embarked' and 'Sex' columns into one indicator column
# per category; all other columns are carried over unchanged.
one_hot_encoded_df = pd.get_dummies(
    data=df,
    columns=['Embarked', 'Sex'],
)

In [37]:
# Show the encoded frame as this cell's output.
one_hot_encoded_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.468892,1,0,PC 17599,0.336381,C85,True,False,False,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.430956,1,0,113803,-0.137061,C123,False,False,True,True,False
6,7,0,1,"McCarthy, Mr. Timothy J",0.671219,0,0,17463,-0.169282,E46,False,False,True,False,True
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",0.038948,1,1,PP 9549,-1.084813,G6,False,False,True,True,False
11,12,1,1,"Bonnell, Miss. Elizabeth",0.721801,0,0,113783,-0.828347,C103,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",0.582701,1,1,11751,-0.151272,D35,False,False,True,True,False
872,873,0,1,"Carlsson, Mr. Frans Olof",0.405665,0,0,695,-1.389448,B51 B53 B55,False,False,True,False,True
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",0.696510,0,1,11767,0.645572,C50,True,False,False,True,False
887,888,1,1,"Graham, Miss. Margaret Edith",0.228629,0,0,112053,-0.738519,B42,False,False,True,True,False
