# Titanic - Machine Learning from Disaster

In [None]:
# import libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# load the Titanic train and test CSV files into DataFrames
train_data, test_data = map(
    pd.read_csv,
    ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv"),
)

In [None]:
# transform to lowercase: every string cell and every column name, so later
# lookups ('age', 'sex', ...) do not depend on the CSV's original casing.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1; non-string cells are left untouched.
train_data = train_data.apply(
    lambda col: col.map(lambda x: x.lower() if isinstance(x, str) else x)
)
train_data.columns = train_data.columns.str.lower()
# see the first 5 rows
train_data.head()

In [None]:
# see the general information: column dtypes and non-null counts per column
train_data.info()

<div style="background-color: yellow;">
Missing values detected: age, cabin
</div>

In [None]:
# descriptive statistics summary; include='all' also summarises non-numeric columns
train_data.describe(include = 'all')

In [None]:
# fill missing values in age with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train_data['age'] is chained assignment, which raises a FutureWarning in
# pandas 2.x and no longer updates train_data under Copy-on-Write.
train_data['age'] = train_data['age'].fillna(train_data['age'].mean())
# sanity check: number of missing ages left (should be 0)
train_data['age'].isna().sum()

In [None]:
# check outliers in column age with a box plot
# (the trailing semicolon suppresses the notebook's text output for the cell)
train_data.boxplot(column = 'age');

In [None]:
# remove the columns that are not used as model features, then show the new shape
train_data.drop(
    columns=['passengerid', 'name', 'ticket', 'fare', 'cabin', 'embarked'],
    inplace=True,
)
train_data.shape

<div style="background-color: yellow;">
Columns passengerid, name, ticket, fare, cabin, and embarked were deleted as not significant.
</div>

In [None]:
# one-hot encode sex as integers; drop_first drops the first category's column
train_data = pd.get_dummies(
    data=train_data,
    columns=['sex'],
    dtype=int,
    drop_first=True,
)

In [None]:
# number of passengers per value of the target column
sns.countplot(x='survived', data=train_data);

In [None]:
# number of passengers per ticket class
sns.countplot(x='pclass', data=train_data);

In [None]:
# number of passengers per value of the encoded sex indicator
sns.countplot(x='sex_male', data=train_data);

In [None]:
# age distribution: 10-bin histogram with a kernel density estimate overlay
sns.histplot(x='age', data=train_data, kde=True, bins=10);

In [None]:
# number of passengers per sibsp value
sns.countplot(x='sibsp', data=train_data);

In [None]:
# number of passengers per parch value
sns.countplot(x='parch', data=train_data);

In [None]:
# survival rate per ticket class = survivors / passengers in that class
by_class = train_data.groupby('pclass')['survived']
survived_class = by_class.sum()
survived_class_ttl = by_class.count()
survived_class_rate = survived_class.div(survived_class_ttl)

# bar chart of the rate, one bar per class
ax = sns.barplot(x=survived_class_rate.index, y=survived_class_rate.values)
ax.set_title('Survival Rate by Class')
ax.set_xlabel('Class')
ax.set_ylabel('Survival Rate');

In [None]:
# passenger counts per gender, split by the survived flag
ax = sns.countplot(data=train_data, x='sex_male', hue='survived')
ax.set_title('Survival Count by Gender')
ax.set_xlabel('Gender (0: Female, 1: Male)')
ax.set_ylabel('Count')
ax.legend(title='Survived', labels=['No', 'Yes'])
plt.show()

In [None]:
# pairwise correlation of the remaining columns, drawn as an annotated heatmap
corr_matrix = train_data.corr()
sns.heatmap(data=corr_matrix, annot=True)
plt.show()

In [None]:
# combine parch and sibsp into one feature, fam_on_board, then drop the originals
train_data['fam_on_board'] = train_data['parch'].add(train_data['sibsp'])
train_data.drop(columns=['parch', 'sibsp'], inplace=True)
train_data

<div style="background-color: yellow;">
Columns parch (# of parents / children aboard the Titanic) and sibsp (# of siblings / spouses aboard the Titanic) were replaced with their sum, fam_on_board (parch + sibsp).
</div>

In [None]:
# separate the target column from the feature matrix
y = train_data['survived']
X = train_data.drop(columns=['survived'])

In [None]:
# hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1209,
    test_size=0.3,
)

In [None]:
# (rows, columns) of the training split
X_train.shape

In [None]:
# (rows, columns) of the hold-out split
X_test.shape

In [None]:
# learn per-feature mean and standard deviation from the training split only
scaler = StandardScaler().fit(X_train)
# fit() returns the scaler itself, so this displays the same object as before
scaler

In [None]:
# apply the already-fitted scaler to both the training and the hold-out split
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# grid-search C and gamma for a class-balanced SVC using 5-fold cross-validation
params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.1, 1, 10],
}
model = SVC(random_state=1309, class_weight='balanced')
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5)
grid_search.fit(X_train_scaled, y_train)
# show the estimator with the best parameter combination found
grid_search.best_estimator_

In [None]:
# predict on the scaled hold-out features using the fitted grid search
y_pred = grid_search.predict(X_test_scaled)

In [None]:
# confusion matrix on the hold-out split (rows: true labels, columns: predicted labels)
print(confusion_matrix(y_test, y_pred))

In [None]:
# per-class precision, recall and F1 on the hold-out split
print(classification_report(y_test, y_pred))

In [None]:
# display the raw test set loaded from test.csv
test_data

In [None]:
# Prepare the test set with the same preprocessing steps used for the training data.
# NOTE: this reuses the names X_test / X_test_scaled, so the hold-out split
# created by train_test_split is overwritten from here on.

# Lowercase string cells and column names. Series.map is applied per column
# because DataFrame.applymap is deprecated since pandas 2.1.
test_data = test_data.apply(
    lambda col: col.map(lambda x: x.lower() if isinstance(x, str) else x)
)
test_data.columns = test_data.columns.str.lower()

# Impute missing ages with the training-set mean so the test set is
# preprocessed with the same statistic the model was trained on. Assign the
# result back instead of chained fillna(inplace=True), which warns in
# pandas 2.x and does not update the frame under Copy-on-Write.
test_data['age'] = test_data['age'].fillna(train_data['age'].mean())

# Build the same feature columns, in the same order, as the training matrix.
features = ['pclass', 'sex', 'age', 'sibsp','parch']
X_test = test_data[features]
X_test = pd.get_dummies(X_test, columns=['sex'], drop_first=True, dtype=int)
X_test['fam_on_board'] = test_data['parch'] + test_data['sibsp']
X_test = X_test.drop(['parch', 'sibsp'], axis=1)

# Scale with the scaler fitted on the training split (never refit on test data).
X_test_scaled = scaler.transform(X_test)

In [None]:
# predict survival for the test passengers with the fitted grid search
predictions = grid_search.predict(X_test_scaled)

# write the passenger ids and predictions to submission.csv without the index
output = pd.DataFrame(
    {
        'PassengerId': test_data['passengerid'],
        'Survived': predictions,
    }
)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")