In [45]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

def breakit():
    print("\n".join(["", 50*"=",  ""]))

In [62]:
titanic = fetch_openml(data_id=40945, parser="auto") # laod dataset
df = titanic["frame"] # /!\ WARNNING: it contains the target, some do not forget to remove it to build X (train/test data)

breakit()
print("URL:", titanic["url"])
breakit()
print("what inside the dict return by fetch_openml:", list(titanic))
breakit()



URL: https://www.openml.org/d/40945


what inside the dict return by fetch_openml: ['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url']




In [3]:
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [64]:
# Show nan values per columnns
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [5]:
# Data statistics
df.describe().round(1)

Unnamed: 0,pclass,age,sibsp,parch,fare,body
count,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.3,29.9,0.5,0.4,33.3,160.8
std,0.8,14.4,1.0,0.9,51.8,97.7
min,1.0,0.2,0.0,0.0,0.0,1.0
25%,2.0,21.0,0.0,0.0,7.9,72.0
50%,3.0,28.0,0.0,0.0,14.5,155.0
75%,3.0,39.0,1.0,0.0,31.3,256.0
max,3.0,80.0,8.0,9.0,512.3,328.0


In [65]:
# Replace nan values
df['age'].fillna(value=df["age"].mean(), inplace=True)
df['fare'].fillna(value=df["fare"].mean(), inplace=True)


# Show the types of columns
print("Type for each columns")
df.dtypes

Type for each columns


pclass          int64
survived     category
name           object
sex          category
age           float64
sibsp           int64
parch           int64
ticket         object
fare          float64
cabin          object
embarked     category
boat           object
body          float64
home.dest      object
dtype: object

In [85]:
# Define the preprocessing steps for numerical and categorical features
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object', "category"]).columns

breakit()
print("Numerical:", numerical_features)
print("Categorical:", categorical_features)
breakit()

# It a nice way to show the data, but its does not work out of the box, for
# example, pclass is a categorical value. While name is a unique identifier and
# does no add usefull information to predict the outcomes (thouh one can try to predict to survival based an first name for fun)
# Let's define the column we will use manually
numerical_features = ["age", "sibsp", "parch", "fare"]
categorical_features = ["sex", "embarked", "pclass"] # cabin



Numerical: Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'body'], dtype='object')
Categorical: Index(['survived', 'name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
       'home.dest'],
      dtype='object')




In [86]:
# Use pipeline to preprocess the data

numerical_transformer = Pipeline(steps=[
   ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='most_frequent')), --> alternative strategie to fill nan values
    ('encoder', OneHotEncoder())
])

# Combine the preprocessing steps for numerical and categorical features
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

X = preprocessor.fit_transform(df)
y = np.array(df["survived"]).astype(int)

breakit()
print("Feature names after preprocessing:")
print(preprocessor.get_feature_names_out())
breakit()



Feature names after preprocessing:
['num__age' 'num__sibsp' 'num__parch' 'num__fare' 'cat__sex_female'
 'cat__sex_male' 'cat__embarked_C' 'cat__embarked_Q' 'cat__embarked_S'
 'cat__embarked_nan' 'cat__pclass_1' 'cat__pclass_2' 'cat__pclass_3']




In [87]:
# One hot encoder example

# Create a sample dataframe
data = {'color': ['red', 'blue', 'green', 'blue', 'red']}
dft = pd.DataFrame(data)

# Apply one-hot encoding
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(dft[['color']]).toarray()

encoded_data

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [80]:
models = {
    "svm": {"model": SVC, "grid": {..} },
    "logistic": LogisticRegression,
    "tree": DecisionTreeClassifier,
    "forest": RandomForestClassifier
}

scores = [
    accuracy_score,
    precision_score,
    recall_score, 
    f1_score
]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

results_tab = {}
for model_name, model in models.items():
    m = model()
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    results = []
    for score in scores:
        results.append(score(y_test, y_pred))
    results_tab[model_name] = results

pd.DataFrame.from_dict(results_tab, orient="index", columns=[str(n).split()[1] for n in scores]).round(2)

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
svm,0.82,0.81,0.69,0.75
logistic,0.78,0.77,0.61,0.68
tree,0.76,0.68,0.69,0.68
forest,0.8,0.79,0.64,0.71


In [88]:
models = {
    "svm": SVC,
    "logistic": LogisticRegression,
    "tree": DecisionTreeClassifier,
    "forest": RandomForestClassifier
}

scores = [
    accuracy_score,
    precision_score,
    recall_score, 
    f1_score
]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

results_tab = {}
for model_name, model in models.items():
    
    m = model()
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    results = []
    for score in scores:
        results.append(score(y_test, y_pred))
    results_tab[model_name] = results

pd.DataFrame.from_dict(results_tab, orient="index", columns=[str(n).split()[1] for n in scores]).round(2)

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
svm,0.82,0.81,0.69,0.75
logistic,0.79,0.77,0.64,0.7
tree,0.76,0.7,0.64,0.67
forest,0.79,0.79,0.6,0.68
