In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [None]:
# Location of the Titanic training CSV inside the Kaggle input directory.
TRAIN_CSV_PATH = '/kaggle/input/titanic/train.csv'

# Load the training data into the DataFrame used by the rest of the notebook.
data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first five rows to see which columns are available.
data.head(n=5)

## Preprocessing

In [None]:
# Number of non-missing entries in each column.
data.notna().sum()

In [None]:
# Number of missing entries in each column.
data.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated(keep='first').sum()

In [None]:
# Class balance of the target column.
data.loc[:, 'Survived'].value_counts()

In [None]:
# How many passengers travelled in each ticket class.
data.loc[:, 'Pclass'].value_counts()

In [None]:
# Pearson correlation between the target and the ticket class.
data['Survived'].corr(other=data['Pclass'], method='pearson')

In [None]:
# Count of distinct (non-missing) passenger names.
data['Name'].nunique(dropna=True)

In [None]:
# Central tendency of Age: print the mean, then the median.
for stat_name in ('mean', 'median'):
    print(stat_name, getattr(data['Age'], stat_name)())

In [None]:


# Age contains missing values (see the null counts computed earlier).
# scipy's probplot does not skip NaNs: they would be counted when building the
# theoretical quantiles and would turn the fitted reference line into NaN.
# Drop them once so both plots describe the same observed ages.
age_observed = data['Age'].dropna()

# Histogram
plt.figure(figsize=(10, 6))
sns.histplot(age_observed, kde=True)
plt.title('Histogram of Age')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

# Q-Q plot of the observed ages against a normal distribution
plt.figure(figsize=(10, 6))
stats.probplot(age_observed, dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.show()

In [None]:
# Share of missing Age values. The row count is taken from the DataFrame
# itself so the percentage stays correct if the dataset changes size.
n = len(data)
nulls = data['Age'].isna().sum()
print(f'Age have {(nulls/n)*100}% nulls')

In [None]:
# Pearson correlation between the target and Age.
data['Survived'].corr(other=data['Age'], method='pearson')

In [None]:
# Central tendency of SibSp: print the mean, then the median.
for stat_name in ('mean', 'median'):
    print(f'{stat_name}: ', getattr(data['SibSp'], stat_name)())

In [None]:
# Frequency of each SibSp value.
data.loc[:, 'SibSp'].value_counts()

In [None]:
# Pearson correlation between the target and SibSp.
data['Survived'].corr(other=data['SibSp'], method='pearson')

In [None]:
# Central tendency of Parch: print the mean, then the median.
for stat_name in ('mean', 'median'):
    print(f'{stat_name}: ', getattr(data['Parch'], stat_name)())

In [None]:
# Frequency of each Parch value.
data.loc[:, 'Parch'].value_counts()

In [None]:
# Pearson correlation between the target and Parch.
data['Survived'].corr(other=data['Parch'], method='pearson')

In [None]:
# Pearson correlation between the two family-size columns.
data['SibSp'].corr(other=data['Parch'], method='pearson')

In [None]:
# Central tendency of Fare: print the mean, then the median.
for stat_name in ('mean', 'median'):
    print(f'{stat_name}: ', getattr(data['Fare'], stat_name)())

In [None]:
# Distribution of Fare: histogram with a KDE overlay.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Fare'], kde=True, ax=hist_ax)
hist_ax.set_title('Histogram of Fare')
hist_ax.set_xlabel('Value')
hist_ax.set_ylabel('Frequency')
plt.show()

# Normal Q-Q plot of Fare, drawn on its own figure.
qq_fig, qq_ax = plt.subplots(figsize=(10, 6))
stats.probplot(data['Fare'], dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot')
plt.show()

In [None]:
# Frequency of each Cabin value.
data.loc[:, 'Cabin'].value_counts()

In [None]:
# Count of distinct (non-missing) Cabin values.
data['Cabin'].nunique(dropna=True)

In [None]:
# Share of missing Cabin values. The row count is taken from the DataFrame
# itself so the percentage stays correct if the dataset changes size.
n = len(data)
nulls = data['Cabin'].isna().sum()
print(f'Cabin have {(nulls/n)*100}% nulls')

In [None]:
# Frequency of each Embarked value.
data.loc[:, 'Embarked'].value_counts()

In [None]:
# Share of missing Embarked values. The row count is taken from the DataFrame
# itself so the percentage stays correct if the dataset changes size.
n = len(data)
nulls = data['Embarked'].isna().sum()
print(f'Embarked have {(nulls/n)*100}% nulls')

In [None]:
# Pearson correlation between the target and the row identifier.
data['Survived'].corr(other=data['PassengerId'], method='pearson')

In [None]:
# Show the first five rows again before summarising the findings.
data.head(n=5)

#### Findings from pre-processing

- There are no duplicated rows.

- 549 passengers did not survive while 342 survived, which shows a class imbalance.

- `PassengerId` can be dropped because its correlation with `Survived` is very low (-0.005).

- `Name` has all unique values, so it would need to be encoded. Because of its high cardinality, one-hot and label encoding cannot be used.

- `Age` is right-skewed and needs to be normalized. `Age` also has nearly 20% nulls.

- `SibSp` and `Parch` have very low correlation with `Survived`.

- `Fare` is also positively skewed.

- `Cabin` has 77% nulls, so there is no point in filling them.

- `Embarked` has 0.2% nulls, and it needs to be encoded with one-hot encoding because it is nominal data.

## Feature engineering and model training

In [None]:
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [None]:
# Drop the columns that are not used as model features. Assigning the result
# (instead of inplace=True) and ignoring columns that are already gone keeps
# this cell safe to re-run without raising a KeyError.
data = data.drop(columns=['Cabin', 'PassengerId', 'Name', 'Ticket'], errors='ignore')

In [None]:
# Target vector: the survival label.
y = data['Survived']

# Feature matrix: every remaining column except the label.
X = data.drop(columns='Survived')

In [None]:
# Continuous columns that get imputed and then power-transformed.
# NOTE(review): both transformers are fitted on all of X, before the
# train/test split that happens later in the notebook, so test rows influence
# the fitted statistics. Consider fitting on the training split only.
numeric_features = ['Age', 'Fare']

# Step 1: replace missing values with the column mean.
numeric_transformer = SimpleImputer(strategy='mean')
imputed_numeric = numeric_transformer.fit_transform(X[numeric_features])
X[numeric_features] = imputed_numeric

# Step 2: apply a Yeo-Johnson power transform to the imputed columns.
power_transformer = PowerTransformer(method='yeo-johnson')
X[numeric_features] = power_transformer.fit_transform(X[numeric_features])

In [None]:
# One-hot encode the nominal Embarked column and swap it for the indicator columns.
# NOTE(review): Embarked has a few missing values in this dataset; confirm how
# the installed scikit-learn version encodes them (or impute them first).
categorical_features = ['Embarked']

# Keep the encoder's default sparse output and densify it explicitly. The
# `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and later
# removed, so passing either name ties the cell to specific versions.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
encoded_cat_features = categorical_transformer.fit_transform(X[categorical_features]).toarray()

# Reuse X's index so the column-wise concat below aligns row for row even when
# X does not carry a default 0..n-1 index.
encoded_cat_df = pd.DataFrame(
    encoded_cat_features,
    columns=categorical_transformer.get_feature_names_out(categorical_features),
    index=X.index,
)

X = X.drop(categorical_features, axis=1)
X = pd.concat([X, encoded_cat_df], axis=1)

In [None]:
# Map the Sex strings to integer codes: learn the categories first, then
# overwrite the column with the encoded values.
label_encoder = LabelEncoder()
label_encoder.fit(X['Sex'])
X['Sex'] = label_encoder.transform(X['Sex'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Resample the training portion only; the held-out test rows are left untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

In [None]:
# Logistic-regression classifier with a raised iteration cap and a fixed seed.
model = LogisticRegression(random_state=42, max_iter=1000)

# Fit on the resampled training data; the fitted estimator is the cell output.
model.fit(X=X_train_resampled, y=y_train_resampled)

In [None]:
# Predict survival labels for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
# Per-class precision/recall/F1 as a dictionary, so it can be shown as a table.
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)

# One row per class/average, one column per metric.
pd.DataFrame(report).T