In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

tr = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train = tr.copy()
print(train.columns)
train["FamilySize"] =  test["SibSp"] + train["Parch"] + 1
train["IsAlone"] = (train["FamilySize"]== 1).astype(int)

x= train.drop(columns="Survived")
y = train["Survived"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state= 42, stratify= y)

x_train = x_train.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
x_test = x_test.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

num_cols = ["Age", "Fare", "SibSp", "Parch", "FamilySize", "IsAlone"]
cat_cols = ["Sex", "Embarked", "Pclass"]

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop= "first", handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

pipe = Pipeline([
    ("prep", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

pipe.fit(x_train, y_train)
train_pred = pipe.predict(x_train)
test_pred = pipe.predict(x_test)

print("Train Accuracy: ", accuracy_score(y_train, train_pred))
print("Test Accuracy: ", accuracy_score(y_test, test_pred))

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Train Accuracy:  0.8089887640449438
Test Accuracy:  0.8044692737430168
