In [None]:
#Q2.
"""  Project Description
The Titanic Problem is based on the sinking of the ‘Unsinkable’ ship Titanic in early 1912. It gives you information about multiple people like their ages, sexes, sibling counts, embarkment points, and whether or not they survived the disaster. 
Based on these features, you have to predict if an arbitrary passenger on Titanic would survive the sinking or not. 

Attribute Information
Passenger id- Unique Id of the passenger
Pclass- Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
Survived- Survived (0 = No; 1 = Yes)
Name- Name of the passenger
Sex- Sex of the passenger (Male, Female)
Age- Age of the passenger
Sibsp- Number of Siblings/Spouses Aboard
Parch- Number of Parents/Children Aboard
Ticket- Ticket Number
Fare- Passenger Fare (British pound)
Cabin- Cabin
Embarked- Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

Dataset Link-
https://github.com/dsrscientist/dataset1/blob/master/titanic_train.csv
 

"""

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Load the dataset
df = pd.read_csv("https://raw.githubusercontent.com/dsrscientist/dataset1/master/titanic_train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Display the first few rows of the dataset
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
df.shape # to check the dimension of the data set (rows, columns)

(891, 12)

In [5]:
df.dtypes #Checking the types of columns

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
df.info() #detailed information about the data frame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
#Checking the correlation between feature and the target. NOTE: Range is from (-1 to +1) & 0 represent no correlation
cor=df.corr()["Survived"].sort_values()
cor

Pclass        -0.338481
Age           -0.077221
SibSp         -0.035322
PassengerId   -0.005007
Parch          0.081629
Fare           0.257307
Survived       1.000000
Name: Survived, dtype: float64

In [9]:
# Data Preprocessing
# Drop irrelevant columns
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [10]:
#checking the null Values ( already its visible in 'df.info()' that age & Embarked columns have missing values)
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [11]:
# Handle missing values
# Impute missing values for age using median (medaian is basically used when there are lots of outlayers are present )
age= SimpleImputer(strategy='median')
df['Age'] = age.fit_transform(df[['Age']]) 

In [12]:
# Impute missing values for embarked using the most common value
embarked = SimpleImputer(strategy='most_frequent')
df['Embarked'] = embarked.fit_transform(df[['Embarked']])


In [13]:
#Again checking the null Values
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [14]:
# Convert categorical variables to numerical
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True) # This code is used to perform one-hot encoding.


In [15]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.2500,0,1
1,1,1,1,38.0,1,0,71.2833,0,0
2,1,3,1,26.0,0,0,7.9250,0,1
3,1,1,1,35.0,1,0,53.1000,0,1
4,0,3,0,35.0,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,0,1
887,1,1,1,19.0,0,0,30.0000,0,1
888,0,3,1,28.0,1,2,23.4500,0,1
889,1,1,0,26.0,0,0,30.0000,0,0


In [16]:
#Again checking detailed information about the data frame after changes
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    int64  
 3   Age         891 non-null    float64
 4   SibSp       891 non-null    int64  
 5   Parch       891 non-null    int64  
 6   Fare        891 non-null    float64
 7   Embarked_Q  891 non-null    uint8  
 8   Embarked_S  891 non-null    uint8  
dtypes: float64(2), int64(5), uint8(2)
memory usage: 50.6 KB


In [17]:
# Split the dataset into features (x) and target variable (y)
x = df.drop(['Survived'], axis=1)
y = df['Survived']

In [18]:
# Finding the best random state

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

maxAccu = 0
maxRS = 0

for i in range(1, 200):
    # Split the dataset into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=i)
    
    # Initialize RandomForestClassifier
    RFR = RandomForestClassifier()
    
    # Train the model
    RFR.fit(x_train, y_train)
    
    # Make predictions
    pred = RFR.predict(x_test)
    
    # Calculate accuracy
    acc = accuracy_score(y_test, pred)
    
    # Update max accuracy and corresponding random state if the current accuracy is higher
    if acc > maxAccu:
        maxAccu = acc
        maxRS = i

print("Best accuracy is", maxAccu, "at random_state", maxRS)


Best accuracy is 0.8582089552238806 at random_state 174


In [19]:
#creating train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state= i)


In [22]:
#Classification algorithms
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, accuracy_score
from sklearn.model_selection import cross_val_score


In [20]:
#Checking accuracy for RandomForestClassifier

# Initialize the RandomForestClassifier
RFC = RandomForestClassifier()

# Train the model on the training set
RFC.fit(x_train, y_train)
# Make predictions on the testing set
predRFC= RFC.predict(x_test)

print('Accuracy:', accuracy_score(y_test ,predRFC))
print('\nConfusion Matrix:\n',confusion_matrix(y_test ,predRFC))
print('\nClassification Report:\n',classification_report(y_test ,predRFC))

Accuracy: 0.7835820895522388

Confusion Matrix:
 [[144  20]
 [ 38  66]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.88      0.83       164
           1       0.77      0.63      0.69       104

    accuracy                           0.78       268
   macro avg       0.78      0.76      0.76       268
weighted avg       0.78      0.78      0.78       268



In [24]:
#Checking accuracy for LogisticRegression

# Initialize the LogisticRegression
LR = LogisticRegression()

# Train the model on the training set
LR.fit(x_train, y_train)
# Make predictions on the testing set
predLR= LR.predict(x_test)

print('Accuracy:', accuracy_score(y_test ,predLR))
print('\nConfusion Matrix:\n',confusion_matrix(y_test ,predLR))
print('\nClassification Report:\n',classification_report(y_test ,predLR))

Accuracy: 0.7723880597014925

Confusion Matrix:
 [[137  27]
 [ 34  70]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.84      0.82       164
           1       0.72      0.67      0.70       104

    accuracy                           0.77       268
   macro avg       0.76      0.75      0.76       268
weighted avg       0.77      0.77      0.77       268



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
#Checking accuracy for support vector machine classifier

# Initialize the SVC
svc = SVC()

# Train the model on the training set
svc.fit(x_train, y_train)
# Make predictions on the testing set
predsvc= svc.predict(x_test)

print('Accuracy:', accuracy_score(y_test ,predsvc))
print('\nConfusion Matrix:\n',confusion_matrix(y_test ,predsvc))
print('\nClassification Report:\n',classification_report(y_test ,predsvc))

Accuracy: 0.6567164179104478

Confusion Matrix:
 [[142  22]
 [ 70  34]]

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.87      0.76       164
           1       0.61      0.33      0.42       104

    accuracy                           0.66       268
   macro avg       0.64      0.60      0.59       268
weighted avg       0.65      0.66      0.63       268



In [26]:
#Checking accuracy for Gradient Boosting classifier

# Initialize the Gradient Boosting classifier
GB = GradientBoostingClassifier()

# Train the model on the training set
GB.fit(x_train, y_train)
# Make predictions on the testing set
predGB= GB.predict(x_test)

print('Accuracy:', accuracy_score(y_test ,predGB))
print('\nConfusion Matrix:\n',confusion_matrix(y_test ,predGB))
print('\nClassification Report:\n',classification_report(y_test ,predGB))

Accuracy: 0.7985074626865671

Confusion Matrix:
 [[153  11]
 [ 43  61]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.93      0.85       164
           1       0.85      0.59      0.69       104

    accuracy                           0.80       268
   macro avg       0.81      0.76      0.77       268
weighted avg       0.81      0.80      0.79       268



In [27]:
#Checking accuracy for AdaBoost classifier

# Initialize the AdaBoost classifier 
ABC = AdaBoostClassifier()

# Train the model on the training set
ABC.fit(x_train, y_train)
# Make predictions on the testing set
predABC= ABC.predict(x_test)

print('Accuracy:', accuracy_score(y_test ,predABC))
print('\nConfusion Matrix:\n',confusion_matrix(y_test ,predABC))
print('\nClassification Report:\n',classification_report(y_test ,predABC))

Accuracy: 0.7910447761194029

Confusion Matrix:
 [[142  22]
 [ 34  70]]

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.87      0.84       164
           1       0.76      0.67      0.71       104

    accuracy                           0.79       268
   macro avg       0.78      0.77      0.77       268
weighted avg       0.79      0.79      0.79       268



In [28]:
#Checking accuracy for ExtraTreesClassifier

# Initialize the ExtraTreesClassifier  
ET = ExtraTreesClassifier()

# Train the model on the training set
ET.fit(x_train, y_train)
# Make predictions on the testing set
predET= ET.predict(x_test)

print('Accuracy:', accuracy_score(y_test ,predET))
print('\nConfusion Matrix:\n',confusion_matrix(y_test ,predET))
print('\nClassification Report:\n',classification_report(y_test ,predET))

Accuracy: 0.7574626865671642

Confusion Matrix:
 [[138  26]
 [ 39  65]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.84      0.81       164
           1       0.71      0.62      0.67       104

    accuracy                           0.76       268
   macro avg       0.75      0.73      0.74       268
weighted avg       0.75      0.76      0.75       268



In [29]:
#Checking accuracy for BaggingClassifier

# Initialize the BaggingClassifier  
BC = BaggingClassifier()

# Train the model on the training set
BC.fit(x_train, y_train)
# Make predictions on the testing set
predBC= BC.predict(x_test)

print('Accuracy:', accuracy_score(y_test ,predBC))
print('\nConfusion Matrix:\n',confusion_matrix(y_test ,predBC))
print('\nClassification Report:\n',classification_report(y_test ,predBC))

Accuracy: 0.7910447761194029

Confusion Matrix:
 [[149  15]
 [ 41  63]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.91      0.84       164
           1       0.81      0.61      0.69       104

    accuracy                           0.79       268
   macro avg       0.80      0.76      0.77       268
weighted avg       0.79      0.79      0.78       268



In [None]:
#Cross Validation Score
""""cv=5------its a fold value  1, 2, 3, 4 , 5 
cross_val_score(modelname, features, targetvariable, cv=foldvalue, scoring= 'accuracy')   

[cv=5 foldvalue, means my data will be divided into 5 equal parts & each part will fold randomly selected data, (n-1) will be selected for the Training purpose and the remaining one will be used for the Test purpose]

NOTE: In classification algorithms- default scoring parameter- accuracy
      for regression- default scoring parameter is- r2 score """
        

In [30]:
#Checking cv score for Random Forest Classification
score= cross_val_score(RFC,x,y)
print('score:',score)
print('\n mean:',score.mean())
print('\n difference between Accuracy score and Cross validation score is :', accuracy_score(y_test, predRFC) - score.mean())

score: [0.78212291 0.81460674 0.84831461 0.7752809  0.8258427 ]

 mean: 0.8092335697696316

 difference between Accuracy score and Cross validation score is : -0.02565148021739272


In [31]:
#Checking cv score for LogisticRegression
score= cross_val_score(LR,x,y)
print('score:',score)
print('\n mean:',score.mean())
print('\n difference between Accuracy score and Cross validation score is :', accuracy_score(y_test, predLR) - score.mean())

score: [0.77653631 0.78651685 0.78089888 0.76966292 0.8258427 ]

 mean: 0.7878915322327538

 difference between Accuracy score and Cross validation score is : -0.015503472531261275


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [32]:
#Checking cv score for SVC
score= cross_val_score(svc,x,y)
print('score:',score)
print('\n mean:',score.mean())
print('\n difference between Accuracy score and Cross validation score is :', accuracy_score(y_test, predsvc) - score.mean())

score: [0.58100559 0.71348315 0.69101124 0.68539326 0.69101124]

 mean: 0.6723808925993346

 difference between Accuracy score and Cross validation score is : -0.015664474688886876


In [33]:
#Checking cv score for Gradient Boosting classifier
score= cross_val_score(GB,x,y)
print('score:',score)
print('\n mean:',score.mean())
print('\n difference between Accuracy score and Cross validation score is :', accuracy_score(y_test, predGB) - score.mean())

score: [0.81005587 0.80898876 0.83146067 0.80898876 0.85393258]

 mean: 0.8226853304877284

 difference between Accuracy score and Cross validation score is : -0.024177867801161268


In [34]:
#Checking cv score for AdaBoost classifier 
score= cross_val_score(ABC,x,y)
print('score:',score)
print('\n mean:',score.mean())
print('\n difference between Accuracy score and Cross validation score is :', accuracy_score(y_test, predABC) - score.mean())

score: [0.75418994 0.82022472 0.82022472 0.8258427  0.84831461]

 mean: 0.8137593371414225

 difference between Accuracy score and Cross validation score is : -0.02271456102201952


In [35]:
#Checking cv score for ExtraTreesClassifier  
score= cross_val_score(ET,x,y)
print('score:',score)
print('\n mean:',score.mean())
print('\n difference between Accuracy score and Cross validation score is :', accuracy_score(y_test, predET) - score.mean())

score: [0.78212291 0.78089888 0.84269663 0.7752809  0.80898876]

 mean: 0.7979976147134517

 difference between Accuracy score and Cross validation score is : -0.04053492814628745


In [36]:
#Checking cv score for BaggingClassifier  
score= cross_val_score(BC,x,y)
print('score:',score)
print('\n mean:',score.mean())
print('\n difference between Accuracy score and Cross validation score is :', accuracy_score(y_test, predBC) - score.mean())

score: [0.75977654 0.80337079 0.83707865 0.76966292 0.80898876]

 mean: 0.7957755319816711

 difference between Accuracy score and Cross validation score is : -0.004730755862268121


In [38]:

# Hyperparameter tuning using GridSearchCV
# Define the parameter grid for BaggingClassifier
param_grid_bagging = {
    'n_estimators': [50, 100, 200],
    'max_samples': [1.0, 0.8, 0.6],
    'max_features': [1.0, 0.8, 0.6],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
}

In [39]:
# Initialize BaggingClassifier
bagging_classifier = BaggingClassifier(base_estimator=None, random_state=42)


In [40]:
# Initialize GridSearchCV for BaggingClassifier
grid_search_bagging = GridSearchCV(bagging_classifier, param_grid_bagging, cv=5, scoring='accuracy', n_jobs=-1)



In [43]:
grid_search_bagging.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=BaggingClassifier(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'bootstrap_features': [True, False],
                         'max_features': [1.0, 0.8, 0.6],
                         'max_samples': [1.0, 0.8, 0.6],
                         'n_estimators': [50, 100, 200]},
             scoring='accuracy')

In [45]:
# Get the best BaggingClassifier model
grid_search_bagging.best_estimator_


BaggingClassifier(bootstrap_features=True, max_features=0.8, max_samples=0.8,
                  n_estimators=100, random_state=42)

In [46]:
# Make predictions on the test set with the best model
y_pred_bagging_best = grid_search_bagging.best_estimator_.predict(x_test)


In [47]:
# Print evaluation metrics for the best BaggingClassifier model
print("\nBest BaggingClassifier Model - Accuracy:", accuracy_score(y_test, y_pred_bagging_best))
print("\nBest BaggingClassifier Model - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bagging_best))
print("\nBest BaggingClassifier Model - Classification Report:\n", classification_report(y_test, y_pred_bagging_best))


Best BaggingClassifier Model - Accuracy: 0.8097014925373134

Best BaggingClassifier Model - Confusion Matrix:
 [[150  14]
 [ 37  67]]

Best BaggingClassifier Model - Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.91      0.85       164
           1       0.83      0.64      0.72       104

    accuracy                           0.81       268
   macro avg       0.81      0.78      0.79       268
weighted avg       0.81      0.81      0.80       268

