# Exercise - Ensemble

In this exercise, we will focus on underage drinking. The data set contains data about high school students. Each row represents a single student. The columns include the characteristics of deidentified students. This is a binary classification task: predict whether a student drinks alcohol or not (this is the **alc** column: 1=Yes, 0=No). This is an important prediction task to detect underage drinking and deploy intervention techniques. 

## Description of Variables

The description of variables are provided in "Alcohol - Data Dictionary.docx"

## Goal

Use the **alcohol.csv** data set and build a model to predict **alc**. 

# Read and Prepare the Data

In [5]:
# Common imports

import pandas as pd
import numpy as np

np.random.seed(42)

# Get the data

In [6]:
#We will predict the "price" value in the data set:

alcohol = pd.read_csv("alcohol.csv")
alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,gender,alc
0,18,2,1,4,2,0,5,4,2,5,2,M,1
1,18,4,3,1,0,0,4,4,2,3,9,M,1
2,15,4,3,2,3,0,5,3,4,5,0,F,0
3,15,3,3,1,4,0,4,3,3,3,10,F,0
4,17,3,2,1,2,0,5,3,5,5,2,M,1


# Split data (train/test)

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(alcohol, test_size=0.3)

# Data Prep

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Separate the target variable 

In [9]:
train_target = train['alc']
test_target = test['alc']

train_inputs = train.drop(['alc'], axis=1)
test_inputs = test.drop(['alc'], axis=1)

## Feature Engineering: Derive a new column

Examples:
- Ratio of study time to travel time
- Student is younger than 18 or not
- Average of father's and mother's level of education
- (etc.)

In [10]:
def new_col(df):
    #Create a copy so that we don't overwrite the existing dataframe
    df1 = df.copy()
    
    df1['parent_avg_edu'] = round((df1['Medu']+df1['Fedu'])/2)
    
    
    return df1[['parent_avg_edu']]
    # You can use this to check whether the calculation is made correctly:
    #return df1

In [11]:
new_col(train)

Unnamed: 0,parent_avg_edu
12759,4.0
4374,2.0
8561,4.0
10697,4.0
19424,5.0
...,...
16850,2.0
6265,2.0
11284,0.0
860,2.0


##  Identify the numeric, binary, and categorical columns

In [12]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [13]:
numeric_columns

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences']

In [14]:
categorical_columns

['gender']

In [15]:
feat_eng_columns = ['Medu','Fedu']

# Pipeline

In [16]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [17]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [18]:
# Create a pipeline for the transformed column here

parent_avg_edu = Pipeline(steps=[('parent_avg_edu', FunctionTransformer(new_col)),
                               ('scaler', StandardScaler())])


In [19]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('trans', parent_avg_edu, feat_eng_columns)
    
        ],
        remainder='passthrough')



# Transform: fit_transform() for TRAIN

In [20]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[ 0.66643886,  0.96597412,  0.90362635, ...,  1.        ,
         0.        ,  0.59309811],
       [ 0.66643886, -0.93881619, -1.68666277, ...,  0.        ,
         1.        , -0.91274091],
       [ 0.66643886,  0.33104402,  0.04019664, ...,  0.        ,
         1.        ,  0.59309811],
       ...,
       [ 0.66643886, -2.20867639, -2.55009248, ...,  1.        ,
         0.        , -2.41857993],
       [ 1.6195814 , -0.30388608, -1.68666277, ...,  0.        ,
         1.        , -0.91274091],
       [ 1.6195814 , -0.30388608, -2.55009248, ...,  0.        ,
         1.        , -0.91274091]])

In [21]:
train_x.shape

(23800, 14)

# Tranform: transform() for TEST

In [22]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[-1.23984621,  0.33104402,  1.76705606, ...,  1.        ,
         0.        ,  0.59309811],
       [-1.23984621, -0.30388608,  0.04019664, ...,  0.        ,
         1.        , -0.1598214 ],
       [-0.28670367,  0.33104402,  0.04019664, ...,  0.        ,
         1.        ,  0.59309811],
       ...,
       [ 0.66643886, -0.30388608,  0.04019664, ...,  1.        ,
         0.        , -0.1598214 ],
       [-1.23984621, -0.93881619,  0.04019664, ...,  0.        ,
         1.        , -0.91274091],
       [-1.23984621,  0.96597412,  0.04019664, ...,  1.        ,
         0.        ,  0.59309811]])

In [23]:
test_x.shape

(10200, 14)

# Calculate the Baseline

In [24]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_target)

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
#Baseline Train Accuracy
dummy_train_pred = dummy_clf.predict(train_x)

baseline_train_acc = accuracy_score(train_target, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.5234873949579832


In [27]:
#Baseline Test Accuracy
dummy_test_pred = dummy_clf.predict(test_x)

baseline_test_acc = accuracy_score(test_target, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.5194117647058824


# Train a voting classifier 

In [28]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier


dtree_clf = DecisionTreeClassifier(max_depth=5)
log_clf = LogisticRegression(multi_class='multinomial', solver = 'lbfgs', C=1, max_iter=1000)
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3)

voting_clf = VotingClassifier(
            estimators=[('dt', dtree_clf), 
                        ('lr', log_clf), 
                        ('sgd', sgd_clf)],
            voting='hard')

voting_clf.fit(train_x, train_target)

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
#Train accuracy

train_target_pred = voting_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_target_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8214705882352941


In [31]:
#Test accuracy - to fix over fitting go back to hard voting parameters - max depth and c

test_target_pred = voting_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_target_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8187254901960784


In [32]:
from sklearn.metrics import confusion_matrix

In [33]:
confusion_matrix(test_target, test_target_pred)

array([[4450,  848],
       [1001, 3901]])

# Train a bagging classifier

In [34]:
from sklearn.ensemble import BaggingClassifier 


bag_clf = BaggingClassifier( 
            SGDClassifier(), n_estimators=50, 
            max_samples=1000, bootstrap=True, n_jobs=-1) 

bag_clf.fit(train_x, train_target)



In [35]:
#Train accuracy

train_target_pred = bag_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_target_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8218487394957983


In [36]:
#Test accuracy

test_target_pred = bag_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_target_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8226470588235294


In [52]:
confusion_matrix(test_target, test_target_pred)

array([[4439,  859],
       [ 918, 3984]])

# Train a random forest classifier

In [37]:
from sklearn.ensemble import RandomForestClassifier 

rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=-1) 

rnd_clf.fit(train_x, train_target)

In [38]:
#Train accuracy

train_target_pred = rnd_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_target_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8426890756302521


In [39]:
#Test accuracy

test_target_pred = rnd_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_target_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8122549019607843


In [53]:
confusion_matrix(test_target, test_target_pred)

array([[4439,  859],
       [ 918, 3984]])

# Train an adaboost classifier

In [43]:
from sklearn.ensemble import AdaBoostClassifier 


ada_clf = AdaBoostClassifier( 
            DecisionTreeClassifier(max_depth=4), n_estimators=500, 
            learning_rate=0.1) 


ada_clf.fit(train_x, train_target)



In [44]:
#Train accuracy

train_target_pred = ada_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_target_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8594537815126051


In [51]:
#Test accuracy

test_target_pred = ada_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_target_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8257843137254902


In [54]:
confusion_matrix(test_target, test_target_pred)

array([[4439,  859],
       [ 918, 3984]])

# Train a gradient boosting classifier

In [47]:
#Use GradientBoosting

from sklearn.ensemble import GradientBoostingClassifier

gbclf = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=0.1) 

gbclf.fit(train_x, train_target)

In [50]:
#Train accuracy

train_target_pred = gbclf.predict(train_x)

train_acc = accuracy_score(train_target, train_target_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8188655462184874


In [49]:
#Test accuracy

test_target_pred = gbclf.predict(test_x)

test_acc = accuracy_score(test_target, test_target_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8119607843137255


In [55]:
confusion_matrix(test_target, test_target_pred)

array([[4439,  859],
       [ 918, 3984]])