In [5]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn import set_config
set_config(transform_output='pandas')

titanic_data = pd.read_csv('train.csv')

titanic_data.info()

numeric = ["Ticket", "Fare"]
categoricals = ["Pclass", "Sex"]


initial_transformation = ColumnTransformer( 
    [
    ("imputer", SimpleImputer(strategy="most_frequent"), ["Sex"]),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False), [" PassengerId "]),
        ("scaler", StandardScaler(), numeric),
    ]
)

pipeline_sex = Pipeline( 
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False, drop="if_binary"))
    ]
)

final_transformation = ColumnTransformer(
    [
        ("transform twice Sex-col", pipeline_sex, ["Sex"]),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False), ["PassengerId"]),
        ("scaler", StandardScaler(), numeric),
    ]
)

def check_column_type(column):
    unique_values = column.unique()
    num_unique = len(unique_values)
    if num_unique == 2:
        return "Binary"
    elif num_unique < len(column) / 2:
        return "Categorical"
    else:
        return "Metric"


for column_name in titanic_data.columns:
    column = titanic_data[column_name]
    column_type = check_column_type(column)
    print(f"Column '{column_name}' is {column_type}")
    
for column_name in titanic_data.columns:
    if titanic_data[column_name].isnull().any():
        print(f"Column '{column_name}' contains missing values")
    else:
        print(f"Column '{column_name}' does not contain missing values")
def fillna_preprocessor(column):
    return column.fillna(0)  

def label_encoder_preprocessor(column):
    # Implement your label encoding logic here
    return encoded_column

def merge_columns(data):
    data['Ticket_Fare'] = data['Ticket'].astype(str) + '_' + data['Fare'].astype(str)
    data.drop(['Ticket', 'Fare'], axis=1, inplace=True)
    return data

preprocessors = []

for column_name in titanic_data.columns:
    if titanic_data[column_name].isnull().any():
        preprocessors.append(("imputer_" + column_name, SimpleImputer(), [column_name]))

print("List of preprocessors for handling missing values:")
print(preprocessors)

def label_encoder_preprocessor(Ticket):
    le = LabelEncoder()
    encoded_column = le.fit_transform(Ticket)
    return encoded_column

ticket_fare = Pipeline( 
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False, drop="if_binary"))
    ]
)

ticket_transformation = ColumnTransformer(
    [
        ("transform twice Sex-col", pipeline_sex, ["Ticket"]),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False), ["Fare"]),
        ("scaler", StandardScaler(), numeric),
    ]
)
titanic_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Column 'PassengerId' is Metric
Column 'Survived' is Binary
Column 'Pclass' is Categorical
Column 'Name' is Metric
Column 'Sex' is Binary
Column 'Age' is Categorical
Column 'SibSp' is Categorical
Column 'Parch' is Categorical
Column 'Ticket' is 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
titanic_data['Ticket_Prefix'] = titanic_data['Ticket'].str.extract('([A-Za-z]+)')

titanic_data['Ticket_Type'] = titanic_data['Ticket_Prefix'].apply(lambda x: 'Unknown' if pd.isnull(x) else x)

titanic_data.drop(['Ticket', 'Ticket_Prefix'], axis=1, inplace=True)


In [4]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_Type
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,,S,A
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,PC
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,,S,STON
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,C123,S,Unknown
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,,S,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,,S,Unknown
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,B42,S,Unknown
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,,S,W
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C148,C,Unknown


In [6]:

num_bins = 5
fare_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
titanic_data['Fare_Category'] = pd.qcut(titanic_data['Fare'], num_bins, labels=fare_labels)


titanic_data.drop(['Fare'], axis=1, inplace=True)


In [7]:
titanic_data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked,Fare_Category
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,,S,Very Low
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85,C,Very High
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,,S,Low
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123,S,Very High
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,,S,Low
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,,S,Medium
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,B42,S,High
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,,S,High
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,C148,C,High


In [8]:

titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace=True)


In [9]:

titanic_data['Cabin_Missing'] = titanic_data['Cabin'].isnull().astype(int)

titanic_data.drop(['Cabin'], axis=1, inplace=True)

In [10]:
titanic_data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Embarked,Fare_Category,Cabin_Missing
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,S,Very Low,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,C,Very High,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,S,Low,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,S,Very High,0
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,S,Low,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,S,Medium,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,S,High,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,S,High,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,C,High,0


In [12]:
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
titanic_data['Pclass_Label'] = label_encoder.fit_transform(titanic_data['Pclass'])

titanic_data['Ticket_Type_Label'] = label_encoder.fit_transform(titanic_data['Ticket'])


titanic_data['Fare_Category_Label'] = label_encoder.fit_transform(titanic_data['Fare_Category'])


In [15]:

pclass_order = {'First': 3, 'Second': 2, 'Third': 1}
titanic_data['Pclass_Ordinal'] = titanic_data['Pclass'].map(pclass_order)

ticket_type_order = {'Unknown': 0, 'Type1': 1, 'Type2': 2, 'Type3': 3}
titanic_data['Ticket_Type_Ordinal'] = titanic_data['Ticket'].map(ticket_type_order)

fare_category_order = {'Very Low': 1, 'Low': 2, 'Medium': 3, 'High': 4, 'Very High': 5}
titanic_data['Fare_Category_Ordinal'] = titanic_data['Fare_Category'].map(fare_category_order)


In [16]:
mean_target_pclass = titanic_data.groupby('Pclass')['Survived'].mean()
titanic_data['Pclass_Target_Encoded'] = titanic_data['Pclass'].map(mean_target_pclass)


In [21]:
numeric = ["Fare", "Age"]


initial_transformation = ColumnTransformer(
    [
        ("imputer", SimpleImputer(strategy="most_frequent"), ["Sex"]),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False), ["PassengerId"]),
        ("scaler", StandardScaler(), numeric),
    ]
)


In [22]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Embarked,Fare_Category,Cabin_Missing,Pclass_Label,Ticket_Type_Label,Fare_Category_Label,Pclass_Ordinal,Ticket_Type_Ordinal,Fare_Category_Ordinal,Pclass_Target_Encoded,Pclass_Fare_Interaction
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,S,Very Low,1,2,523,4,,,1,0.242363,12
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,C,Very High,0,0,596,3,,,5,0.629630,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,S,Low,1,2,669,1,,,2,0.242363,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,S,Very High,0,0,49,3,,,5,0.629630,3
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,S,Low,1,2,472,1,,,2,0.242363,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,S,Medium,1,1,101,2,,,3,0.472826,4
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,S,High,0,0,14,0,,,4,0.629630,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,S,High,1,2,675,0,,,4,0.242363,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,C,High,0,0,8,0,,,4,0.629630,0


In [25]:
preprocessors = [
    ('imputer_Age', SimpleImputer(), ['Age']),
    ('imputer_Cabin', SimpleImputer(), ['Cabin']),
    ('imputer_Embarked', SimpleImputer(), ['Embarked'])
]


initial_transformation = ColumnTransformer(
    [
        ("imputer", SimpleImputer(strategy="most_frequent"), ["Sex"]),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False), ["PassengerId"]),
        ("scaler", StandardScaler(), numeric),
    ] + preprocessors 
)


In [26]:
titanic_data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Embarked,Fare_Category,Cabin_Missing,Pclass_Label,Ticket_Type_Label,Fare_Category_Label,Pclass_Ordinal,Ticket_Type_Ordinal,Fare_Category_Ordinal,Pclass_Target_Encoded,Pclass_Fare_Interaction
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,S,Very Low,1,2,523,4,,,1,0.242363,12
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,C,Very High,0,0,596,3,,,5,0.629630,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,S,Low,1,2,669,1,,,2,0.242363,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,S,Very High,0,0,49,3,,,5,0.629630,3
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,S,Low,1,2,472,1,,,2,0.242363,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,S,Medium,1,1,101,2,,,3,0.472826,4
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,S,High,0,0,14,0,,,4,0.629630,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,S,High,1,2,675,0,,,4,0.242363,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,C,High,0,0,8,0,,,4,0.629630,0
