# Setup

First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20.

In [50]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
from sklearn.model_selection import GridSearchCV
# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [51]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix,classification_report

### **Tackle** the Titanic dataset

---



The goal is to predict whether or not a passenger survived based on attributes such as their age, sex, passenger class, where they embarked and so on.

Let's fetch the data and load it:

In [3]:
import os
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print("Downloading", filename)
            urllib.request.urlretrieve(url + filename, filepath)

fetch_titanic_data()    

Downloading train.csv
Downloading test.csv


In [4]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [5]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

The data is already split into a training set and a test set. However, the test data does *not* contain the labels: your goal is to train the best model you can using the training data, then make your predictions on the test data and upload them to Kaggle to see your final score.

Let's take a peek at the top few rows of the training set:

In [6]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The attributes have the following meaning:
* **PassengerId**: a unique identifier for each passenger
* **Survived**: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
* **Pclass**: passenger class.
* **Name**, **Sex**, **Age**: self-explanatory
* **SibSp**: how many siblings & spouses of the passenger aboard the Titanic.
* **Parch**: how many children & parents of the passenger aboard the Titanic.
* **Ticket**: ticket id
* **Fare**: price paid (in pounds)
* **Cabin**: passenger's cabin number
* **Embarked**: where the passenger embarked the Titanic

Let's explicitly set the `PassengerId` column as the index column:

In [7]:
#train_data = train_data.set_index("PassengerId")
#test_data = test_data.set_index("PassengerId")

Let's get more info to see how much data is missing:

In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
train_data[train_data["Sex"]=="female"]["Age"].median()

27.0

Okay, the **Age**, **Cabin** and **Embarked** attributes are sometimes null (less than 891 non-null), especially the **Cabin** (77% are null). We will ignore the **Cabin** for now and focus on the rest. The **Age** attribute has about 19% null values, so we will need to decide what to do with them. Replacing null values with the median age seems reasonable. We could be a bit smarter by predicting the age based on the other columns (for example, the median age is 37 in 1st class, 29 in 2nd class and 24 in 3rd class), but we'll keep things simple and just use the overall median age.

The **Name** and **Ticket** attributes may have some value, but they will be a bit tricky to convert into useful numbers that a model can consume. So for now, we will ignore them.

Let's take a look at the numerical attributes:

In [10]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699113,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526507,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.4167,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


* Yikes, only 38% **Survived**! 😭 That's close enough to 40%, so accuracy will be a reasonable metric to evaluate our model.
* The mean **Fare** was £32.20, which does not seem so expensive (but it was probably a lot of money back then).
* The mean **Age** was less than 30 years old.

Let's check that the target is indeed 0 or 1:

In [11]:
train_data["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

Now let's take a quick look at all the categorical attributes:

In [12]:
train_data["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [13]:
train_data["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

The Embarked attribute tells us where the passenger embarked: C=Cherbourg, Q=Queenstown, S=Southampton.

In [14]:
train_data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

### **Feature Engineering**

Try to convert numerical attributes to categorical attributes: for example, different age groups had very different survival rates (see below), so it may help to create an age bucket category and use it instead of the age. Similarly, it may be useful to have a special category for people traveling alone since only 30% of them survived (see below).

In [15]:
train_data["AgeBucket"] = train_data["Age"] // 15 * 15
train_data[["AgeBucket", "Survived"]].groupby(['AgeBucket']).mean()

test_data["AgeBucket"] = test_data["Age"] // 15 * 15


In [16]:
train_data.shape

(891, 13)

In [17]:
train_data["RelativesOnboard"] = train_data["SibSp"] + train_data["Parch"]
train_data[["RelativesOnboard", "Survived"]].groupby(['RelativesOnboard']).mean()
test_data["RelativesOnboard"] = test_data["SibSp"] + test_data["Parch"]

Replace **SibSp** and **Parch** with their sum.

In [18]:
train_data['Family'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['Family'] = test_data['SibSp'] + test_data['Parch'] + 1

In [19]:
train_data['Single']=train_data.Family<2
train_data['Small']=(train_data.Family>1)&(train_data.Family<5)
train_data['Medium']=(train_data.Family>4)&(train_data.Family<7)
train_data['Large']=train_data.Family>6

test_data['Single']=test_data.Family<2
test_data['Small']=(test_data.Family>1)&(test_data.Family<5)
test_data['Medium']=(test_data.Family>4)&(test_data.Family<7)
test_data['Large']=test_data.Family>6

  * Try to identify parts of names that correlate well with the **Survived** attribute.

In [20]:
train_data["Title"] = train_data["Name"].str.extract("([A-Z][a-z]*)\.", expand=False)
train_data['Title']=train_data['Title'].replace(['Ms','Mlle'],'Miss')
train_data['Title']=train_data['Title'].replace(['Mme','Countess','Lady','Dona'],'Mrs')
train_data['Title']=train_data['Title'].replace(['Dr','Major','Col','Sir','Rev','Jonkheer','Capt','Don'],'Mr')
train_data['Senior']=train_data['Age'].map(lambda s:1 if s>70 else 0)

test_data["Title"] = test_data["Name"].str.extract("([A-Z][a-z]*)\.", expand=False)
test_data['Title']=test_data['Title'].replace(['Ms','Mlle'],'Miss')
test_data['Title']=test_data['Title'].replace(['Mme','Countess','Lady','Dona'],'Mrs')
test_data['Title']=test_data['Title'].replace(['Dr','Major','Col','Sir','Rev','Jonkheer','Capt','Don'],'Mr')
test_data['Senior']=test_data['Age'].map(lambda s:1 if s>70 else 0)

In [21]:
train_data.Cabin = train_data.Cabin.fillna('U')
# map first letter of cabin to itself
train_data.Cabin = train_data.Cabin.map(lambda x: x[0])
train_data

test_data.Cabin = test_data.Cabin.fillna('U')
# map first letter of cabin to itself
test_data.Cabin = test_data.Cabin.map(lambda x: x[0])
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBucket,RelativesOnboard,Family,Single,Small,Medium,Large,Title,Senior
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,U,Q,30.0,0,1,True,False,False,False,Mr,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,U,S,45.0,1,2,False,True,False,False,Mrs,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,U,Q,60.0,0,1,True,False,False,False,Mr,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,U,S,15.0,0,1,True,False,False,False,Mr,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,U,S,15.0,2,3,False,True,False,False,Mrs,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,U,S,,0,1,True,False,False,False,Mr,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C,C,30.0,0,1,True,False,False,False,Mrs,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,U,S,30.0,0,1,True,False,False,False,Mr,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,U,S,,0,1,True,False,False,False,Mr,0


### **Preprocessing Pipeline**

Now let's build our preprocessing pipelines, starting with the pipeline for numerical attributes:

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

Now we can build the pipeline for the categorical attributes:

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

Finally, let's join the numerical and categorical pipelines:

In [25]:
from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare",  "Single","Small", "Medium", "Large"]
cat_attribs = ["Pclass", "Sex", "Embarked", "Cabin", "Title"]

preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

Cool! Now we have a nice preprocessing pipeline that takes the raw data and outputs numerical input features that we can feed to any Machine Learning model we want.

In [26]:
X_train = preprocess_pipeline.fit_transform(
    train_data[num_attribs + cat_attribs])
X_train

array([[-0.56573582,  0.43279337, -0.47367361, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.6638609 ,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.10463705,  0.43279337,  2.00893337, ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.20276213, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ]])

Let's not forget to get the labels:

In [27]:
y_train = train_data["Survived"]

We are now ready to train a classifier. Let's start with a `RandomForestClassifier`:

In [28]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

Great, our model is trained, let's use it to make predictions on the test set:

In [29]:
X_test = preprocess_pipeline.transform(test_data[num_attribs + cat_attribs])
y_pred = forest_clf.predict(X_test)

And now we could just build a CSV file with these predictions (respecting the format excepted by Kaggle), then upload it and hope for the best. But wait! We can do better than hope. Why don't we use cross-validation to have an idea of how good our model is?

In [30]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.809238451935081

In [31]:
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBucket,RelativesOnboard,Family,Single,Small,Medium,Large,Title,Senior
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,U,Q,30.0,0,1,True,False,False,False,Mr,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,U,S,45.0,1,2,False,True,False,False,Mrs,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,U,Q,60.0,0,1,True,False,False,False,Mr,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,U,S,15.0,0,1,True,False,False,False,Mr,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,U,S,15.0,2,3,False,True,False,False,Mrs,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,U,S,,0,1,True,False,False,False,Mr,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C,C,30.0,0,1,True,False,False,False,Mrs,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,U,S,30.0,0,1,True,False,False,False,Mr,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,U,S,,0,1,True,False,False,False,Mr,0


In [32]:
result=test_data[["PassengerId"]]
result["Survived"]=y_pred
result.to_csv("submission_randomforest.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [33]:
# create param grid object 
forrest_params = dict(     
    max_depth = [n for n in range(9, 14)],     
    min_samples_split = [n for n in range(4, 11)], 
    min_samples_leaf = [n for n in range(2, 5)],     
    n_estimators = [n for n in range(10, 60, 10)],
)
# instantiate Random Forest model
forrest = RandomForestClassifier()
# build and fit model 
forest_cv = GridSearchCV(estimator=forrest,param_grid=forrest_params, cv=5) 
forest_cv.fit(X_train, y_train)
print("Best score: {}".format(forest_cv.best_score_))
print("Optimal params: {}".format(forest_cv.best_estimator_))

Best score: 0.8439897056054235
Optimal params: RandomForestClassifier(max_depth=13, min_samples_leaf=2, min_samples_split=4,
                       n_estimators=40)


In [34]:
result=test_data[["PassengerId"]]
result["Survived"]=y_pred
result.to_csv("submission_randomforest_gridsearch.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Okay, not too bad! Looking at the [leaderboard](https://www.kaggle.com/c/titanic/leaderboard) for the Titanic competition on Kaggle, you can see that our score is in the top 2%, woohoo! Some Kagglers reached 100% accuracy, but since you can easily find the [list of victims](https://www.encyclopedia-titanica.org/titanic-victims/) of the Titanic, it seems likely that there was little Machine Learning involved in their performance! 😆

In [35]:
    g=GaussianNB()
    b=BernoulliNB()
    k=KNeighborsClassifier()
    svc=SVC()
    d=DecisionTreeClassifier()
    log=LogisticRegression()
    gbc=GradientBoostingClassifier()
    mn=MultinomialNB()
    r=RandomForestClassifier()
    ab=AdaBoostClassifier()
    sgdc = SGDClassifier()
    dc=DummyClassifier()
    xgbc=XGBClassifier()

algorithms = [g,b,k,svc,log,gbc,r,d,sgdc,dc,xgbc]
names=['GaussianNB','BernoulliNB','K Nearest','Support Vector Machine','Logistic Regression','GradientBoosting','RandomForest','Decision Tree','SGDClassifier', 'DummyClassifier','XGBClassifier()']

In [36]:
def algo_test(X,y,algorithms=algorithms, names=names):
    
    for i in range(len(algorithms)):
        algorithms[i]=algorithms[i].fit(X,y)
    accuracy=[]
    precision=[]
    recall=[]
    f1=[]

    for i in range(len(algorithms)):
        accuracy.append(accuracy_score(y,algorithms[i].predict(X)))
        precision.append(precision_score(y,algorithms[i].predict(X)))
        recall.append(recall_score(y,algorithms[i].predict(X)))
        f1.append(f1_score(y,algorithms[i].predict(X)))
    metrics = pd.DataFrame(columns=["Accuracy","Precision","Recall","F1"],index=names)
    metrics['Accuracy']=accuracy
    metrics['Precision']=precision
    metrics['Recall']=recall
    metrics['F1']=f1
    return metrics.sort_values('F1', ascending=False)

In [37]:
algo_test(X_train,y_train)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
RandomForest,0.983165,0.98806,0.967836,0.977843
Decision Tree,0.983165,0.99696,0.959064,0.977645
GradientBoosting,0.905724,0.913462,0.833333,0.87156
XGBClassifier(),0.877666,0.877023,0.792398,0.832565
K Nearest,0.868687,0.85489,0.792398,0.822458
Logistic Regression,0.836139,0.79697,0.769006,0.782738
Support Vector Machine,0.833895,0.812903,0.736842,0.773006
GaussianNB,0.791246,0.683962,0.847953,0.75718
SGDClassifier,0.808081,0.749271,0.751462,0.750365
BernoulliNB,0.79349,0.71123,0.777778,0.743017


In [38]:
result=test_data[["PassengerId"]]
y_pred=algorithms[6].predict(X_test)
result["Survived"]=y_pred
result.to_csv("submission_bestmodel.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
#another way to create submission file
submission = pd.DataFrame({'PassengerId':test_data['PassengerId'],'Survived':y_pred})
submission
filename = 'Titanic_Predictions.csv'
submission.to_csv(filename, index=False)
print('Saved')

Saved


In [40]:
y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

### Performance Measure

In [41]:
y_test= algorithms[6].predict(X_train)
print(classification_report(y_test,y_train))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       556
           1       0.97      0.99      0.98       335

    accuracy                           0.98       891
   macro avg       0.98      0.98      0.98       891
weighted avg       0.98      0.98      0.98       891



In [42]:
confusion_matrix(y_test,y_train)

array([[545,  11],
       [  4, 331]])

In [43]:
expected=y_test
prediction = y_train
tn, fp, fn, tp = confusion_matrix(y_test, y_train).ravel()
specificity = tn / (tn+fp)
sensitivity = tp/(tp+fn)

print ('Accuracy:', accuracy_score(expected, prediction))
print ('F1 score: ',f1_score(expected, prediction))
print ('Recall:', recall_score(expected, prediction))
print ('Precision:', precision_score(expected, prediction))
print('Sensitivity : ', sensitivity )
print('Specificity : ', specificity)
print("Classification Error",1 - accuracy_score(expected, prediction))
print ('\n clasification report:\n', classification_report(expected,prediction))
print ('\n confussion matrix:\n', confusion_matrix(expected, prediction))
print("---------------------------")
print("++++++++++++++++++++++++++++")

Accuracy: 0.9831649831649831
F1 score:  0.9778434268833088
Recall: 0.9880597014925373
Precision: 0.9678362573099415
Sensitivity :  0.9880597014925373
Specificity :  0.9802158273381295
Classification Error 0.01683501683501687

 clasification report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       556
           1       0.97      0.99      0.98       335

    accuracy                           0.98       891
   macro avg       0.98      0.98      0.98       891
weighted avg       0.98      0.98      0.98       891


 confussion matrix:
 [[545  11]
 [  4 331]]
---------------------------
++++++++++++++++++++++++++++


**### Gradient Boosting Classifier with Parameter Tuning**

In [44]:
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, zero_one_loss, accuracy_score
import sklearn as skl
from sklearn import tree
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [45]:
gbc = GradientBoostingClassifier()
parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}

In [46]:
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(gbc,parameters,cv=5)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

print("Train Size = {} | Test Size = {} ".format(len(X_train),len(X_test) ))
cv.fit(X_train,y_train.values.ravel())

Train Size = 712 | Test Size = 179 


GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.01, 0.1, 1, 10, 100],
                         'max_depth': [1, 3, 5, 7, 9],
                         'n_estimators': [5, 50, 250, 500]})

In [47]:
def display(results):
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    mean_score = results.cv_results_['mean_test_score']
    std_score = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    for mean,std,params in zip(mean_score,std_score,params):
        print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')

In [48]:
display(cv)

Best parameters are: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 500}


0.617 + or -0.003 for the {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5}
0.782 + or -0.021 for the {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
0.799 + or -0.02 for the {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 250}
0.812 + or -0.021 for the {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500}
0.617 + or -0.003 for the {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
0.795 + or -0.031 for the {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.808 + or -0.031 for the {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
0.81 + or -0.031 for the {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
0.617 + or -0.003 for the {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}
0.799 + or -0.033 for the {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
0.808 + or -0.033 for the {'learning_rate': 0.01, 'max_dep

In [49]:
#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, stratify=y, random_state=42)

print("Train Size = {} | Test Size = {} ".format(len(X_train),len(X_test) ))

import time 
print("\nNow testing algorithms")

# old Best param {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 250}
clf = GradientBoostingClassifier(learning_rate= 1, max_depth= 1, n_estimators= 500)
#ccp_alpha=0.0,
                                                  #criterion='friedman_mse',
                                                  #init=None, learning_rate=0.1,
                                                  #loss='deviance', max_depth=3,
                                                  #max_features=None,
                                                  #max_leaf_nodes=None,
                                                  #min_impurity_decrease=0.0,
                                                  #min_impurity_split=None,
                                                  #min_samples_leaf=1,
                                                  #min_samples_split=2,
                                                  #min_weight_fraction_leaf=0.0,
                                                  #n_estimators=100,
                                                  #n_iter_no_change=None,
                                                  #presort='deprecated',
                                                  #random_state=None,
                                                  #subsample=1.0, tol=0.0001,
                                                  #validation_fraction=0.1,
start = time.time()                                                 #verbose=0, warm_start=False
clf.fit(X_train, y_train)#fit may be called as 'trained'
stop= time.time()
print(f"Training time: {stop - start}s")
score = clf.score(X_test, y_test)
print("%s : %f %%" % ("GradientBoostingClassifier", score*100))
expected = y_test
prediction = clf.predict(X_test)

Train Size = 712 | Test Size = 179 

Now testing algorithms
Training time: 0.35951828956604004s
GradientBoostingClassifier : 81.564246 %
