In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df = pd.read_csv('/media/prince/5A4E832F4E83034D/TItanic Project/1.Training/v2data.csv')


In [3]:
df.info()
df['Fare'].mean()
df['Embarked'].unique()
# S: Southampton
# C: Cherbourg
# Q: Queenstown (now Cobh) 
df['Sex']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Survived     891 non-null    float64
 1   Pclass       891 non-null    float64
 2   Sex          891 non-null    object 
 3   Age          891 non-null    float64
 4   Fare         891 non-null    float64
 5   Embarked     891 non-null    object 
 6   Family_size  891 non-null    float64
dtypes: float64(5), object(2)
memory usage: 48.9+ KB


0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [4]:
label = df['Survived']
predictors = df.drop(columns='Survived')
X_train, X_test, y_train, y_test = train_test_split(
    predictors, label, test_size=0.2, random_state=42
)

In [5]:
num_cols = ['Age', 'Fare', 'Family_size']
cat_cols = ['Pclass', 'Sex', 'Embarked']


In [6]:
numeric_pipeline = Pipeline([
    ("log", FunctionTransformer(np.log1p, feature_names_out="one-to-one")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("onehot", OneHotEncoder(
        handle_unknown="ignore",
        sparse_output=False
    ))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])

lin_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model',LogisticRegression(max_iter=1000) )
])

reg_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model',RandomForestClassifier()) 
    ])

In [7]:
lin_pipeline.fit(X_train, y_train)
reg_pipeline.fit(X_train, y_train)

In [15]:
y_pred_lin = lin_pipeline.predict(X_test)
y_pred_rf  = reg_pipeline.predict(X_test)
X_test.head()


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family_size
709,3.0,male,25.0,15.2458,C,3.0
439,2.0,male,31.0,10.5,S,1.0
840,3.0,male,20.0,7.925,S,1.0
720,2.0,female,6.0,33.0,S,2.0
39,3.0,female,14.0,11.2417,C,2.0


In [9]:
acc_lin = accuracy_score(y_test, y_pred_lin)
acc_rf  = accuracy_score(y_test, y_pred_rf)

print("Logistic Regression Accuracy:", acc_lin)
print("Random Forest Accuracy:", acc_rf)

Logistic Regression Accuracy: 0.8156424581005587
Random Forest Accuracy: 0.8212290502793296


In [10]:
cv_lin = cross_val_score(lin_pipeline, X_train, y_train, cv=5, scoring="accuracy")
cv_rf  = cross_val_score(reg_pipeline, X_train, y_train, cv=5, scoring="accuracy")

print("Logistic CV Mean:", cv_lin.mean())
print("RF CV Mean:", cv_rf.mean())

Logistic CV Mean: 0.8075642667191962
RF CV Mean: 0.7865261499064315


In [11]:
final_model = lin_pipeline
final_model.fit(X_train, y_train)

In [12]:
import joblib
# joblib.dump(final_model, "Titanic_survival.pkl")
# joblib.dump(lin_pipeline, "titanic_pipeline.pkl")
