In [69]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import warnings
warnings.filterwarnings("ignore")

In [70]:
# Load the passenger table from the local CSV and preview its first rows.
# The file carries an extra "Unnamed: 0" column (visible in the preview below),
# which is kept here and therefore also ends up in the feature frame.
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [87]:
# Check which dtype pandas inferred for the fare column.
# NOTE(review): this cell's execution count (87) is out of sequence with the
# neighbouring cells (70, 71), i.e. it was executed after the rest of the notebook.
df.fare.dtype


dtype('float64')

In [71]:
# Separate the target column from the predictors.
y = df["survived"]
X = df.drop(columns=["survived"])

In [72]:
class PreProcer(BaseEstimator, TransformerMixin):
    """First-stage feature engineering for the passenger frame.

    ``fit`` learns a fill value for ``age`` using ``SimpleImputer`` with its
    default settings. ``transform`` returns a *new* DataFrame (the input is
    never modified) in which:

    * ``age`` has its missing values imputed;
    * ``cabinClass`` holds only the letters of ``cabin`` (``"M"`` when the
      cabin is missing);
    * ``cabinNumber`` holds only the digits of ``cabin`` as an int64
      (``0`` when there are no digits);
    * missing ``embarked`` values are replaced by ``"M"``;
    * ``name``, ``ticket``, ``cabin``, ``boat``, ``body`` and ``home.dest``
      are dropped when present.
    """

    def fit(self, X, y=None):
        """Fit the age imputer on ``X[["age"]]`` and return ``self``.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        self.ageimpute = SimpleImputer()
        self.ageimpute.fit(X[["age"]])
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; ``X`` itself is left untouched.

        Requires ``fit`` to have been called first (``self.ageimpute`` must
        exist). ``y`` is accepted for pipeline compatibility and ignored.
        """
        # Work on a copy: assigning columns on the caller's frame would
        # silently alter the train/test splits every time the pipeline runs.
        X = X.copy()

        # The imputer returns a 2-D result; flatten it to one value per row.
        X["age"] = np.asarray(self.ageimpute.transform(X[["age"]])).ravel()

        # Missing cabins get the placeholder "M" before letters and digits
        # are separated.
        # NOTE(review): a multi-cabin value such as "C22 C26" yields
        # cabinClass "CC" and cabinNumber 2226 because all letters / all
        # digits are concatenated. Behaviour kept as-is — confirm intended.
        cabin = X["cabin"].fillna("M").astype(str)
        X["cabinClass"] = cabin.str.replace(r"[^a-zA-Z]", "", regex=True)

        digits = cabin.str.replace(r"[^0-9]", "", regex=True)
        # Cabins without any digit count as 0; store a true integer column
        # rather than an object column mixing strings with the int 0.
        X["cabinNumber"] = digits.mask(digits == "", "0").astype("int64")

        X["embarked"] = X["embarked"].fillna("M")

        return X.drop(
            columns=["name", "ticket", "cabin", "boat", "body", "home.dest"],
            errors="ignore",
        )

In [73]:
# Stand-alone instance of the custom preprocessing step.
preprocer = PreProcer()

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
numeric_pipe = Pipeline(steps=[("Scaler", StandardScaler())])
cat_pipe = Pipeline(steps=[("OneHot", OneHotEncoder(handle_unknown="ignore"))])

# Route each group of columns to its sub-pipeline.
transform = ColumnTransformer(
    transformers=[
        (
            "num",
            numeric_pipe,
            ["pclass", "age", "sibsp", "parch", "fare", "cabinNumber"],
        ),
        (
            "cat",
            cat_pipe,
            ["sex", "embarked", "cabinClass"],
        ),
    ]
)

In [74]:
# Hold out 10% of the rows for evaluation; the seed is fixed so the split is repeatable.
(x_train, x_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1234,
)

In [75]:
# Size of the feature frame as (rows, columns), before any preprocessing.
X.shape

(1309, 13)

In [76]:
# Display the training-split features produced by train_test_split.
x_train

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
588,2,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0000,,S,14,,"Cornwall / Akron, OH"
351,2,"Brown, Mr. Thomas William Solomon",male,60.0,1,1,29750,39.0000,,S,,,"Cape Town, South Africa / Seattle, WA"
99,1,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",female,48.0,1,0,11755,39.6000,A16,C,1,,London / Paris
398,2,"Drew, Master. Marshall Brines",male,8.0,0,2,28220,32.5000,,S,10,,"Greenport, NY"
522,2,"Otter, Mr. Richard",male,39.0,0,0,28213,13.0000,,S,,,"Middleburg Heights, OH"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,3,"Barah, Mr. Hanna Assi",male,20.0,0,0,2663,7.2292,,C,15,,
1228,3,"Stranden, Mr. Juho",male,31.0,0,0,STON/O 2. 3101288,7.9250,,S,9,,
1077,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.7500,,Q,D,,
723,3,"Conlon, Mr. Thomas Henry",male,31.0,0,0,21332,7.7333,,Q,,,"Philadelphia, PA"


In [77]:
# End-to-end model: custom preprocessing -> column-wise scaling/encoding -> XGBoost classifier.
mlp = Pipeline(
    steps=[
        ("InitialPreProcer", PreProcer()),
        ("Transformer", transform),
        ("xgb", XGBClassifier()),
    ]
)

In [78]:
# Fit every step of the pipeline on the training split.
mlp.fit(x_train,y_train)

In [79]:
# Predict labels for the held-out split with the fitted pipeline.
y_hat = mlp.predict(x_test)

In [80]:
# Precision of the predictions on the held-out split.
precision_score(y_test,y_hat)

0.7857142857142857

In [81]:
import joblib

In [82]:
# Persist the fitted pipeline to disk.
# NOTE(review): PreProcer is defined inside this notebook, so loading the
# artifact elsewhere presumably requires that class to be importable under
# the same module path — confirm before deploying.
joblib.dump(mlp, "xgbpipe.joblib")

['xgbpipe.joblib']