In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycaret.classification as pc
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# Load the labelled training data and preview its first rows.
train_csv_path = "data/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Hold out 20% of the labelled rows as a test split; the fixed random_state
# keeps the split identical across re-runs.
# train_test_split is already imported in the notebook's first cell, so the
# duplicate import that used to live here has been dropped.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
# Configure the PyCaret classification experiment on the training split.
# The options are gathered in one dict so they can be read and tweaked in one place.
setup_options = dict(
    data=train_set,
    target='Survived',
    session_id=124,
    normalize=True,
    polynomial_features=True,
    remove_multicollinearity=True,
    log_experiment=False,  # experiment logging stays off
    experiment_name='Survived_classification',
    verbose=False,  # keep setup quiet
)
# NOTE(review): the fitted pipeline printed later in this notebook lists
# PassengerId, Name, Ticket and Cabin among the processed columns. Consider
# whether these identifier-like columns should be excluded (PyCaret's
# `ignore_features`); confirm the effect on the later raw-frame predict cells first.
classification_setup = pc.setup(**setup_options)

In [5]:
# List the estimators PyCaret offers for this classification experiment.
pc.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [7]:
# Train and score the candidate models, rank them by Accuracy and keep the top one.
best_model = pc.compare_models(n_select=1, sort='Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.7872,0.8117,0.5465,0.8317,0.6557,0.5126,0.5382,0.154
svm,SVM - Linear Kernel,0.7069,0.7149,0.283,0.8403,0.4069,0.278,0.3545,0.089
lr,Logistic Regression,0.691,0.8523,0.1877,0.93,0.3027,0.2133,0.3214,1.125
et,Extra Trees Classifier,0.6908,0.783,0.2082,0.905,0.3163,0.2182,0.3138,0.163
dt,Decision Tree Classifier,0.6669,0.5864,0.2632,0.4472,0.328,0.1873,0.2015,0.093
rf,Random Forest Classifier,0.6607,0.8033,0.112,0.6214,0.1852,0.1215,0.195,0.199
gbc,Gradient Boosting Classifier,0.6426,0.6363,0.1652,0.2989,0.2104,0.1032,0.1137,0.144
ada,Ada Boost Classifier,0.6365,0.5219,0.0632,0.1351,0.0851,0.0487,0.0558,0.092
nb,Naive Bayes,0.6245,0.5,0.0,0.0,0.0,0.0,0.0,0.104
ridge,Ridge Classifier,0.6245,0.5965,0.0,0.0,0.0,0.0,0.0,0.093


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [9]:
# Display the model selected by compare_models.
best_model

In [10]:
# Tune the hyperparameters of the selected model, optimizing for Accuracy.
tuned_model = pc.tune_model(best_model, optimize='Accuracy')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.78,0.9028,0.4444,0.8889,0.5926,0.4639,0.5162
1,0.84,0.82,0.6316,0.9231,0.75,0.6383,0.6632
2,0.8,0.8557,0.5789,0.8462,0.6875,0.5479,0.5693
3,0.78,0.7233,0.4737,0.9,0.6207,0.486,0.5357
4,0.74,0.8472,0.3684,0.875,0.5185,0.3786,0.4451
5,0.8,0.8803,0.5789,0.8462,0.6875,0.5479,0.5693
6,0.84,0.8744,0.5789,1.0,0.7333,0.6303,0.6784
7,0.74,0.7725,0.4737,0.75,0.5806,0.4059,0.4284
8,0.8367,0.8477,0.7222,0.8125,0.7647,0.6404,0.643
9,0.7551,0.8781,0.5,0.75,0.6,0.4335,0.452


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [12]:
# Open PyCaret's interactive evaluation widget for the tuned model.
pc.evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [13]:
# Finalize the tuned model for deployment.
# The bare expression on the last line displays the object PyCaret returns.
final_model = pc.finalize_model(tuned_model)
final_model

In [14]:
# Predict on the held-out test split produced by train_test_split.
# The returned frame keeps the input columns and adds `prediction_label`
# and `prediction_score`.
predictions = pc.predict_model(final_model, data=test_set)
predictions

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.7933,0.8835,0.6351,0.8246,0.7176,0.5588,0.5707


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,prediction_label,prediction_score
709,710,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.245800,,C,1,0,0.7087
439,440,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.500000,,S,0,0,1.0000
840,841,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925000,,S,0,0,1.0000
720,721,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.000000,,S,1,1,0.9141
39,40,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.241700,,C,1,0,0.5813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,434,3,"Kallio, Mr. Nikolai Erland",male,17.0,0,0,STON/O 2. 3101274,7.125000,,S,0,0,1.0000
773,774,3,"Elias, Mr. Dibo",male,,0,0,2674,7.225000,,C,0,0,0.9284
25,26,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",female,38.0,1,5,347077,31.387501,,S,1,0,0.8294
84,85,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.500000,,S,1,0,0.5236


In [15]:
# Extract true and predicted values
true_values = test_set['Survived']
# Print the prediction frame's columns to confirm the name of the predicted-label column
print(predictions.columns)

# The predicted class is stored in 'prediction_label' (see the printed columns)
predicted_values = predictions['prediction_label']

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived', 'prediction_label',
       'prediction_score'],
      dtype='object')


In [17]:
# Evaluate the finalized model on the held-out split with scikit-learn metrics.
accuracy = accuracy_score(true_values, predicted_values)
conf_matrix = confusion_matrix(true_values, predicted_values)
class_report = classification_report(true_values, predicted_values)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)  # rows are true classes, columns are predicted classes
print("Classification Report:")
print(class_report)

Accuracy: 0.7932960893854749
Confusion Matrix:
[[95 10]
 [27 47]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       105
           1       0.82      0.64      0.72        74

    accuracy                           0.79       179
   macro avg       0.80      0.77      0.78       179
weighted avg       0.80      0.79      0.79       179



In [18]:
# Persist the finalized pipeline to disk under the name 'final_model'.
pc.save_model(final_model, 'final_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['PassengerId', 'Pclass', 'Age',
                                              'SibSp', 'Parch', 'Fare'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None...
                                     transformer=RemoveMulticollinearity(threshold=0.9))),
                 ('normalize',
         

In [19]:
# Reload the pipeline that was saved under the name 'final_model'.
model = pc.load_model('final_model')


Transformation Pipeline and Model Successfully Loaded


In [None]:
# Load the competition test set and count missing values per column.
test_csv_path = "data/test.csv"
test_df_orig = pd.read_csv(test_csv_path)
test_df_orig.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [23]:
# List all steps in the final_model pipeline (including preprocessing).
# The final step is the model itself ('actual_estimator'); the earlier ones
# are preprocessing transformers.
print(final_model.named_steps)

{'numerical_imputer': TransformerWrapper(exclude=None,
                   include=['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch',
                            'Fare'],
                   transformer=SimpleImputer(add_indicator=False, copy=True,
                                             fill_value=None,
                                             keep_empty_features=False,
                                             missing_values=nan,
                                             strategy='mean')), 'categorical_imputer': TransformerWrapper(exclude=None,
                   include=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
                   transformer=SimpleImputer(add_indicator=False, copy=True,
                                             fill_value=None,
                                             keep_empty_features=False,
                                             missing_values=nan,
                                             strategy='most_frequent')), 'ordinal_e

In [27]:
# Display the estimator stored as the pipeline's 'actual_estimator' step.
final_model.named_steps['actual_estimator']

In [24]:
# Every step except the last one ('actual_estimator') is preprocessing; keep
# those (name, transformer) pairs so they can be reused without the model.
preprocessor_steps = list(final_model.named_steps.items())[:-1]
preprocessor_steps

[('numerical_imputer',
  TransformerWrapper(exclude=None,
                     include=['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch',
                              'Fare'],
                     transformer=SimpleImputer(add_indicator=False, copy=True,
                                               fill_value=None,
                                               keep_empty_features=False,
                                               missing_values=nan,
                                               strategy='mean'))),
 ('categorical_imputer',
  TransformerWrapper(exclude=None,
                     include=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
                     transformer=SimpleImputer(add_indicator=False, copy=True,
                                               fill_value=None,
                                               keep_empty_features=False,
                                               missing_values=nan,
                                               stra

In [25]:
# Build a preprocessing-only pipeline from the steps taken from final_model.
# NOTE(review): the steps are reused as-is and are not re-fitted here;
# presumably they are already fitted via final_model. Confirm.
preprocessor = Pipeline(preprocessor_steps)

In [31]:
# Run the raw competition test set through the preprocessing steps and list
# the resulting feature columns.
X_preprocessed = preprocessor.transform(test_df_orig)
X_preprocessed.columns

Index(['Name', 'PassengerId Pclass', 'PassengerId Name', 'PassengerId Sex',
       'PassengerId Age', 'PassengerId SibSp', 'PassengerId Parch',
       'PassengerId Fare', 'PassengerId Embarked_S', 'PassengerId Embarked_C',
       'PassengerId Embarked_Q', 'Pclass Name', 'Pclass Sex', 'Pclass Age',
       'Pclass Cabin', 'Pclass Embarked_S', 'Pclass Embarked_C', 'Name Sex',
       'Name Age', 'Name SibSp', 'Name Parch', 'Name Fare', 'Name Cabin',
       'Name Embarked_S', 'Name Embarked_C', 'Name Embarked_Q', 'Sex Age',
       'Sex SibSp', 'Sex Parch', 'Sex Fare', 'Sex Embarked_S',
       'Sex Embarked_C', 'Sex Embarked_Q', 'Age SibSp', 'Age Parch',
       'Age Embarked_S', 'SibSp Fare', 'SibSp Embarked_C', 'SibSp Embarked_Q',
       'Parch^2', 'Parch Fare', 'Parch Embarked_C', 'Parch Embarked_Q',
       'Fare^2', 'Fare Embarked_S', 'Fare Embarked_C', 'Fare Embarked_Q',
       'Embarked_S Embarked_C', 'Embarked_S Embarked_Q',
       'Embarked_C Embarked_Q'],
      dtype='object')

In [32]:
# Number of feature columns after preprocessing.
len(X_preprocessed.columns)

50

In [36]:
# Predict with the full loaded pipeline on the raw test data
# (`model` receives the un-preprocessed frame directly).
model_predictions_1 = model.predict(test_df_orig)

# Display the predicted labels
model_predictions_1

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [35]:
# Get the estimator (the actual model) from the loaded pipeline
estimator = model.named_steps['actual_estimator']

# Predict with the bare estimator on the manually preprocessed features.
# NOTE(review): the displayed labels appear to match model_predictions_1 from
# the full-pipeline cell; confirm programmatically if that equality matters.
model_predictions = estimator.predict(X_preprocessed)
model_predictions

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
# Assemble the submission: one row per test passenger with its predicted label,
# then write it to CSV without the index column.
submission_df = test_df_orig[['PassengerId']].assign(Survived=model_predictions)
submission_df.to_csv('data/submission.csv', index=False)