# My Playground

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import re
import math
import pickle
from datetime import datetime
import time
import warnings
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
start_time = time.time()

In [2]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score, precision_score, plot_roc_curve
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression 
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier

In [3]:
df_train2 = pd.read_csv('data/train.csv')
df_test2 = pd.read_csv('data/test.csv')
df = pd.concat([df_train2, df_test2], axis=0)

# Preprocessing

In [4]:
df.sample()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
427,428,1.0,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louis...",female,19.0,0,0,250655,26.0,,S


In [5]:
title_mapping = {
    'Mr.': 'Mr.',
    'Mrs.': 'Mrs.',
    'Miss.': 'Miss',
    'Ms.': 'Miss',
    'Master.': 'Master.',
    'Mlle.': 'Miss',
    'Mme.': 'Miss',
    'Dr.': 'Special_title',
    'Sir': 'Special_title',
    'Col.': 'Special_title',
    'Capt.': 'Special_title',
    'Don.': 'Special_title',
    'Major.': 'Special_title',
    'Jonkheer.': 'Special_title',
    'Rev.': 'Special_title',
    'Countess.': 'Special_title',
    'Lady.': 'Special_title',
}
df['Title'] = df['Name'].apply(lambda name: next((title_mapping[title] for title in title_mapping if title in name), None))

In [6]:
median_age_titles = df.groupby('Title')['Age'].transform('median').round(0)
df['Age'] = df['Age'].fillna(median_age_titles)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df['Fare'].fillna(8.0, inplace=True)
df['Title'].fillna(df['Title'].mode()[0], inplace=True)

In [7]:
df.drop(columns=['PassengerId', 'Ticket', 'SibSp', 'Parch', 'Cabin', 'Name', 'Age', 'Embarked'], inplace=True)

**==================================================================================================================**

In [8]:
idx_split = len(df_train2)

df_train = df.iloc[:idx_split]
df_test = df.iloc[idx_split:].drop(columns=['Survived'])

X = df_train.drop(columns=['Survived'])
y = df_train['Survived']
X_validation = df_test.copy()

print(X.shape)
print(y.shape)
print(X_validation.shape)

(891, 4)
(891,)
(418, 4)


In [9]:
X.sample()

Unnamed: 0,Pclass,Sex,Fare,Title
286,3,male,9.5,Mr.


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
algorithms_scaled_data = [
    LogisticRegression(),
    SVC(random_state=42),
    KNeighborsClassifier()
]

algorithms_unscaled_data = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(random_state=42)
]

algorithms = algorithms_scaled_data + algorithms_unscaled_data

In [12]:
pipelines = {}
num_features = X.select_dtypes(include=['number']).columns
cat_features = X.select_dtypes(include=['object', 'category']).columns

for algorithm in algorithms:
    
    # extract algorithm name as a string
    algorithm_name = algorithm.__class__.__name__
    
    # Create a column transformer for handling categorical and numerical features separately 
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_features), 
            ('cat', OneHotEncoder(drop='first', handle_unknown="ignore"), cat_features) 
        ]
    )
    
    # make pipelines for algorithms need scaling and onehotencoding (defined in preprocessor)
    if algorithm in algorithms_scaled_data:
        pipeline = Pipeline(steps = [
                ('preprocessor', preprocessor),
                (algorithm_name, algorithm)
            ])

    # make pipelines for tree based algorithms (no need for scaling and onehot encoding, but only ordinalencoding)
    else:
        pipeline = Pipeline(steps=[
                ("ordinalencoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
                (algorithm_name, algorithm)
            ])
    
    # REUSABLE models
    # fill the dictionary with algo names and their corresponding pipelines for using later
    pipelines[algorithm_name] = pipeline

# For Example
pipelines['LogisticRegression']

In [13]:
model_scores = pd.DataFrame(columns=['Algorithm', 'Train_Accuracy', 'Test_Accuracy', 'Cross_Validate_Accuracy'])
algorithm_score = {}

# iterate through our stored pipelines in pipeline dictionary
for model_name, pipeline in pipelines.items():
    
    # fit the pipelines
    pipeline.fit(X_train, y_train)

    # make predictions
    y_pred = pipeline.predict(X_test)
    y_train_pred = pipeline.predict(X_train)
    
    # calculate the scores
    accuracy_test = accuracy_score(y_test, y_pred)
    accuracy_train = accuracy_score(y_train, y_train_pred)
    cv_score = cross_validate(pipeline, X_train, y_train, scoring='accuracy')['test_score'].mean()  
    
    # fill the scores dictionary with scores
    algorithm_score['Algorithm'] = model_name
    algorithm_score['Train_Accuracy'] = accuracy_train
    algorithm_score['Test_Accuracy'] = accuracy_test
    algorithm_score['Cross_Validate_Accuracy'] = cv_score
    
    # convert scores dictionary to dataframe
    model_scores = model_scores.append(algorithm_score, ignore_index=True)
    model_scores = model_scores.sort_values(by='Cross_Validate_Accuracy', ascending=False)
    
model_scores

Unnamed: 0,Algorithm,Train_Accuracy,Test_Accuracy,Cross_Validate_Accuracy
0,XGBClassifier,0.904494,0.849162,0.816045
1,GradientBoostingClassifier,0.894663,0.810056,0.81176
2,DecisionTreeClassifier,0.919944,0.843575,0.81043
7,AdaBoostClassifier,0.849719,0.804469,0.800601
3,LogisticRegression,0.80618,0.793296,0.799094
4,RandomForestClassifier,0.919944,0.815642,0.797754
5,SVC,0.816011,0.748603,0.796287
6,KNeighborsClassifier,0.86236,0.832402,0.790752


In [14]:
XGBClassifier()

In [15]:
xgb_model = pipelines['XGBClassifier']

xgb_param_grid = {
    'XGBClassifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'XGBClassifier__n_estimators': [50, 100, 200],
    'XGBClassifier__max_depth': [3, 5, 7, 8, 9],
    'XGBClassifier__subsample': [0.8, 0.9, 1.0],
    'XGBClassifier__colsample_bytree': [0.8, 0.9, 1.0, 1.2],
}

xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [16]:
xgb_grid_search.fit(X_train, y_train)

In [17]:
xgb_grid_search.best_score_

0.8426671919629666

In [18]:
xgb_grid_search.best_params_

{'XGBClassifier__colsample_bytree': 1.0,
 'XGBClassifier__learning_rate': 0.05,
 'XGBClassifier__max_depth': 9,
 'XGBClassifier__n_estimators': 100,
 'XGBClassifier__subsample': 0.9}

In [19]:
best_model = xgb_grid_search.best_estimator_
best_model

In [20]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)

Accuracy on Test Set: 0.8491620111731844
