In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


In [2]:
df= pd.read_csv("crash.csv")
df

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Class,Name,Sex,Age,Ticket Price,Safety
0,0,1,Didn't Survive,Economy,"Braund, Mr. Owen Harris",male,22.0,7.2500,0.336957
1,1,2,Survived,First Class,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,0.553571
2,2,3,Survived,Economy,"Heikkinen, Miss. Laina",female,26.0,7.9250,0.336957
3,3,4,Survived,First Class,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1000,0.336957
4,4,5,Didn't Survive,Economy,"Allen, Mr. William Henry",male,35.0,8.0500,0.336957
...,...,...,...,...,...,...,...,...,...
886,886,887,Didn't Survive,Business,"Montvila, Rev. Juozas",male,27.0,13.0000,0.336957
887,887,888,Survived,First Class,"Graham, Miss. Margaret Edith",female,19.0,30.0000,0.336957
888,888,889,Didn't Survive,Economy,"Johnston, Miss. Catherine Helen ""Carrie""",female,,23.4500,0.336957
889,889,890,Survived,First Class,"Behr, Mr. Karl Howell",male,26.0,30.0000,0.553571


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    891 non-null    int64  
 1   PassengerId   891 non-null    int64  
 2   Survived      891 non-null    object 
 3   Class         891 non-null    object 
 4   Name          891 non-null    object 
 5   Sex           891 non-null    object 
 6   Age           714 non-null    float64
 7   Ticket Price  891 non-null    float64
 8   Safety        889 non-null    float64
dtypes: float64(3), int64(2), object(4)
memory usage: 62.8+ KB


In [4]:
df.isna().sum()

Unnamed: 0        0
PassengerId       0
Survived          0
Class             0
Name              0
Sex               0
Age             177
Ticket Price      0
Safety            2
dtype: int64

### we have some missings 

## Droping useless columns

In [5]:
df.drop(["Unnamed: 0", "PassengerId", "Name"] , axis=1, inplace=True)

## Checking the value counts 

In [6]:
for col in ["Class", "Survived", "Sex"]:
        print(f"{col}: {df[col].value_counts()}")

Class: Class
Economy        491
First Class    216
Business       184
Name: count, dtype: int64
Survived: Survived
Didn't Survive    549
Survived          342
Name: count, dtype: int64
Sex: Sex
male      577
female    314
Name: count, dtype: int64


### check if there are 0s where 0 is meaningless

In [7]:
for i in df.columns:
    print(i,len(df[df[i] == 0]))

Survived 0
Class 0
Sex 0
Age 0
Ticket Price 15
Safety 0


### check if we have "?" in the values

In [8]:
for i in df.columns:
    print(i,len(df[df[i] == "?"]))

Survived 0
Class 0
Sex 0
Age 0
Ticket Price 0
Safety 0


### 15 missing values at Ticket Price

### Creating columns list

In [9]:
numerical_cols = ["Age" ,  "Ticket Price", "Safety"]
categorical_cols = ["Survived",	"Class", "Sex"]

## Mapping 0 and 1 for Sex and Survived

In [10]:
df["Sex"] = df["Sex"].map({"male":1 , "female":0})

In [11]:
df["Survived"] = df["Survived"].map({"Didn't Survive": 0 , "Survived":1})

## one hot encoding the Class

In [12]:
# Create an instance of the OneHotEncoder
encoder = OneHotEncoder()

# Fit the encoder on the "Class" column and transform it
class_encoded = encoder.fit_transform(df[['Class']])

# Convert the encoded result to an array and create a DataFrame
class_encoded_df = pd.DataFrame(class_encoded.toarray(), columns=encoder.get_feature_names_out(['Class']))

# Concatenate the encoded DataFrame with the original DataFrame
df = pd.concat([df.drop(columns=['Class']), class_encoded_df], axis=1)

## Outlier detection 

I will use IQR outlier detection

In [13]:
# Handling outliers using the IQR method
def handle_outliers_with_IQR(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 -  1.5*IQR
    upper_bound = Q3 +  1.5*IQR
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

for col in numerical_cols:
    handle_outliers_with_IQR(df, col)

In [14]:
df.shape

(891, 8)

### No outliers detected 

## Handling missing values

In [15]:
#replace "?" with Nan
df["Ticket Price"]= df["Ticket Price"].replace({0: None})
# Replace 'None' values with NaN
df.fillna(value=np.nan, inplace=True)

  df.fillna(value=np.nan, inplace=True)


let's create a pipeline for sake of ease

### median imputation

In [16]:
# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(df.drop(["Survived"], axis=1), df["Survived"], train_size=0.6, test_size=0.4,random_state=0)

In [17]:
# Preprocessing for categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ("scaler", MinMaxScaler())
])

In [18]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
    ])

In [19]:
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [20]:
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

In [21]:
# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

In [22]:
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

In [23]:
auc = roc_auc_score(y_valid, preds)
auc

0.6363122171945701

### KNN imputation

In [24]:
# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(df.drop(["Survived"], axis=1), df["Survived"], train_size=0.6, test_size=0.4,random_state=0)

In [25]:
# Preprocessing for categorical data
numerical_transformer = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ('imputer', KNNImputer(n_neighbors=29))
])

In [26]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
    ])

In [27]:
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [28]:
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

In [29]:
# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

In [30]:
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

In [31]:
auc = roc_auc_score(y_valid, preds)
auc

0.6315045248868778

## Median imputer worked better 

In [32]:
# Create a SimpleImputer instance
imputer = SimpleImputer(strategy='median')

# Fit and transform your data with the imputer
imputed_array = imputer.fit_transform(df)
df = pd.DataFrame(imputed_array, columns=df.columns)

## Now we are good to go for training SVM 

In [33]:
# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(df.drop(["Survived"], axis=1), df["Survived"], train_size=0.8, test_size=0.2,random_state=0)

In [34]:
my_pipeline = Pipeline(steps=[('preprocessor', MinMaxScaler()),
                              ('model', SVC())
                             ])

In [35]:
# Train the SVM model
my_pipeline.fit(X_train, y_train)
# Predict the labels for the test set
y_pred = my_pipeline.predict(X_valid)

In [36]:
# Calculate accuracy
accuracy = accuracy_score(y_valid, y_pred)

# Calculate precision
precision = precision_score(y_valid, y_pred)

# Calculate recall
recall = recall_score(y_valid, y_pred)


In [37]:
# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8212290502793296
Precision: 0.8775510204081632
Recall: 0.6231884057971014


## Grid search CV for hyper parameters tunning

In [38]:
param_grid = {
    'model__C': [0.1, 1, 10, 100],
    'model__kernel': ['linear', 'rbf', 'poly'],
    'model__degree': [2, 3,4]
}

In [39]:
my_pipeline = Pipeline(steps=[('preprocessor', MinMaxScaler()),
                              ('model', SVC())
                             ])

In [40]:
# Initialize GridSearchCV with the SVM classifier and parameter grid
grid_search = GridSearchCV(my_pipeline, param_grid, cv=5, scoring='accuracy')
# Perform grid search with cross-validation
grid_search.fit(X_train, y_train)
# Get the best hyperparameters
best_params = grid_search.best_params_

In [42]:
best_params

{'model__C': 10, 'model__degree': 3, 'model__kernel': 'poly'}

## Train the model with optimized hyper parameters

In [43]:
my_pipeline = Pipeline(steps=[('preprocessor', MinMaxScaler()),
                              ('model', SVC(C = 1, degree=3 , kernel="poly"))
                             ])

In [44]:
# Train the SVM model
my_pipeline.fit(X_train, y_train)
# Predict the labels for the test set
y_pred = my_pipeline.predict(X_valid)

In [45]:
# Calculate accuracy
accuracy = accuracy_score(y_valid, y_pred)
# Calculate precision
precision = precision_score(y_valid, y_pred)
# Calculate recall
recall = recall_score(y_valid, y_pred)

In [46]:
# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.7988826815642458
Precision: 0.7796610169491526
Recall: 0.6666666666666666
