# Basic Machine Learning Example

###  This example reflects what a simple manual process for comming up witn an effective model for a classificaiton problem



In [1]:
# import dependencies

import numpy as np
import pandas as pd
import pickle
from sqlalchemy import create_engine
import sqlite3 
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
import tqdm

## EVALUATE FUNCTION

Since we will be trying lots of different models, it would be nice to have a single function that will evaluate all our models and provide a standardized reporting format.

This will allow us to easily pick out the model we want to move forward with.

This function takes in a model ( pipeline ) and our train test split data. From there it simply performes predictions and generates results

In [2]:
def evaluate(pipeline, X_train, X_test, y_train, y_test):
    '''
    Evaluate a pipeline on training and test datasets
    '''    
    pipeline.fit(X_train, y_train)
    y_train_hat = pipeline.predict(X_train)
    y_test_hat = pipeline.predict(X_test)
    train_f1 = f1_score(y_train_hat, y_train)
    train_acc = accuracy_score(y_train_hat, y_train)
    test_f1 = f1_score(y_test_hat, y_test)
    test_acc = accuracy_score(y_test_hat, y_test)

    print(f"========== Predictor: {type(pipeline).__name__} ==========")
    print(f"Training result: f1: {train_f1:.3f}, acc: {train_acc:.3f}")
    print(f"Test result: f1: {test_f1:.3f}, acc: {test_acc:.3f}")
    print()


## DATA

In this case we are reading in transfusion data.  With this data we are trying to predict in an individual has given blood on Marth 2007 based on specific features.

#### The features are:
- Recency  ->   How long since the individual last gave blood
- Frequency -> How many times has the indivuaul give blood
- Monetary -> Amount of usable blood given
- Time -> How many months have they been given blood

In [3]:
# load dataset
engine = create_engine('sqlite:///../dataBase/Are_You_Hot.db')
df = pd.read_sql('select * from hot', engine)
df.head(10)

Unnamed: 0,Name,Sex,DOB,Birth Year,Eye Color,Hair Color,Distinctive Features,Height(ft),Weight(lbs),Zodiac Sign,Tattoo Body Art,Hot Test,Ht/Wt Ratio,Age
0,Angelina Jolie,Female,06/04/1975,1975,Gray,Blonde,Lips,5.6,119.0,Gemini,Yes,0,21.25,45
1,Johnny Depp,Male,06/09/1963,1963,Brown,Brown,Cheekbones,5.1,171.0,Gemini,Yes,0,33.53,57
2,Charlize Theron,Female,08/07/1975,1975,Green,Brown,Attractive,5.9,121.0,Leo,Yes,0,20.51,45
3,Brad Pitt,Male,12/18/1963,1963,Blue,Blonde,Jaw,5.11,172.0,Sagittarius,Yes,0,33.66,57
4,Amber Heard,Female,04/22/1986,1986,Blue,Blonde,Slim,5.7,137.0,Taurus,Yes,0,24.04,34
5,Jared Leto,Male,12/26/1971,1971,Blue,Brown,Eyes,5.9,152.0,Capricorn,Yes,0,25.76,49
6,Natalie Portman,Female,06/09/1981,1981,Hazel,Brown,Moles,5.3,110.0,Gemini,No,0,20.75,39
7,Channing Tatum,Male,04/26/1980,1980,Green,Brown,Eyes,6.1,196.0,Taurus,Yes,0,32.13,40
8,Mila Kunis,Female,08/14/1983,1983,Green and Blue,Brown,Sexy,5.4,115.0,Leo,No,0,21.3,37
9,Chris Hemsworth,Male,08/11/1983,1983,Blue,Blonde,Voice,6.3,201.0,Leo,Yes,0,31.9,37


In [4]:
x = df.drop(['Name', 'DOB', 'Birth Year', 'Height(ft)', 'Weight(lbs)'], axis=1)
x.tail()

Unnamed: 0,Sex,Eye Color,Hair Color,Distinctive Features,Zodiac Sign,Tattoo Body Art,Hot Test,Ht/Wt Ratio,Age
196,Male,Brown,Brown,Smile,Sagittarius,No,1,42.07,68
197,Male,Black,Bald,Face,Cancer,No,1,32.35,57
198,Male,Brown,Brown,Hair,Sagittarius,Yes,1,30.0,55
199,Female,Black,Black,Smile,Cancer,No,1,56.86,37
200,Male,Blue,Brown,Hair,Cancer,No,1,26.95,38


In [5]:
cat_col = x.select_dtypes(include=['object']).columns
dummies = pd.get_dummies(x[cat_col],drop_first=True)
without_dummies = x.drop(cat_col,axis=1)
data = pd.concat([dummies,without_dummies],axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 67 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Sex_MAle                          201 non-null    uint8  
 1   Sex_Male                          201 non-null    uint8  
 2   Sex_male                          201 non-null    uint8  
 3   Eye Color_Blue                    201 non-null    uint8  
 4   Eye Color_Brown                   201 non-null    uint8  
 5   Eye Color_Gray                    201 non-null    uint8  
 6   Eye Color_Green                   201 non-null    uint8  
 7   Eye Color_Green and Blue          201 non-null    uint8  
 8   Eye Color_Hazel                   201 non-null    uint8  
 9   Hair Color_Bald                   201 non-null    uint8  
 10  Hair Color_Black                  201 non-null    uint8  
 11  Hair Color_Blond                  201 non-null    uint8  
 12  Hair Col

## Extract Features from Results

# build X and y matrices
X = df.drop(['Name', 'DOB', 'Hot Test'], axis=1)
y = df[['Hot Test']].values.reshape(-1)
# Xv = X.values
# yv = y.values.reshape(-1)
y
# enc = LabelEncoder()
# cat_cols = ['Sex', 'Birth Year', 'Eye Color', 'Hair Color', 'Distinctive Features', 'Height(ft)', 'Weight(lbs)', 'Zodiac Sign']
# for col in cat_cols:
#     X[col] = X[col].astype('str')
#     y[col] = y[col].astype('str')
#     X[col] = enc.fit_transform(X[col])
#     y[col] = enc.transform(y[col])



## Preliminary Data Analysis

In [6]:
# make sure there is no nan
# if there is nan, you need to deal with it, either by imputing or discarding
data.isnull().sum(axis = 0)

Sex_MAle               0
Sex_Male               0
Sex_male               0
Eye Color_Blue         0
Eye Color_Brown        0
                      ..
Zodiac Sign_Virgo      0
Tattoo Body Art_Yes    0
Hot Test               0
Ht/Wt Ratio            0
Age                    0
Length: 67, dtype: int64

## Data Cleanup

Had the above test ( or any others wey may want to add ) had encountered issues we need to address, a lot more code could be required here....

## Train Test Split

The stratify argument is used to make sure the train test split data has similar populations

cat_cols = ['Sex', 'Eye Color', 'Hair Color', 'Distinctive Features', 'Zodiac Sign']
enc = LabelEncoder()

for col in cat_cols:
    Xv[col] = Xv[col].astype('str')
    yv[col] = yv[col].astype('str')
    Xv[col] = enc.fit_transform(Xv[col])
    yv[col] = enc.transform(yv[col])

In [7]:
X = data.drop('Hot Test', axis=1).values 
y = df['Hot Test'].values.reshape(-1)


In [8]:
# split to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Pick A Model For A Base Point To Evaluate Other Models Against

In this case we are choosing Logistric Regression

In [9]:
# try LogisticRegression to establish a baseline performance
pipeline = Pipeline([
    ('scale', StandardScaler()), # remember to scale first before feeding data into lgr
    ('lgr', LogisticRegression()),
])
evaluate(pipeline, X_train, X_test, y_train, y_test)

Training result: f1: 0.904, acc: 0.900
Test result: f1: 0.615, acc: 0.634



## Now Let's Try A Few More...

In [10]:
# try other predictors
evaluate(XGBClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(LGBMClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(RandomForestClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)
evaluate(GradientBoostingClassifier(), X_train, X_test, y_train, y_test)

Training result: f1: 1.000, acc: 1.000
Test result: f1: 0.703, acc: 0.732

Training result: f1: 0.945, acc: 0.944
Test result: f1: 0.667, acc: 0.683

Training result: f1: 1.000, acc: 1.000
Test result: f1: 0.722, acc: 0.756

Training result: f1: 1.000, acc: 1.000
Test result: f1: 0.636, acc: 0.610

Training result: f1: 0.994, acc: 0.994
Test result: f1: 0.611, acc: 0.659



## Let's Pick a Final Model To Move Forward With

From the above evaluations, it looks like XGBClassifier is a very promising candidate

We will then hypertune the classifier model to come up with the best model we can.

## Let's Create Our Tuning Object

In [15]:
# RandomizedSearchCV on XGB
xgb_param_grid = {
    'n_estimators': [10, 20, 50, 100, 200, 300, 400],
    'max_depth': np.arange(5, 20),
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': np.arange(0.5, 1.0, 0.05),
    'min_child_weight': np.arange(1, 10),
    'colsample_bytree': np.arange(0.2, 1.0, 0.1),
    'gamma': [0, 0.001, 0.002, 0.003, 0.004, 0.005, 1e-2],
    'n_jobs': [-1]
}


## Let's find The Best Model We Can

The RandomizedSearchCV function will try all our combinations above and select the most accurate model.  

That best model is found in the best_estimator_ property of the RandomizedSerachCV object. 

In [16]:
predictor = XGBClassifier()
rs = RandomizedSearchCV(predictor, xgb_param_grid, cv=5, scoring='f1', n_jobs=-1, n_iter=100, verbose=1)
rs.fit(X_train, y_train)
evaluate(rs.best_estimator_, X_train, X_test, y_train, y_test)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.0s
Training result: f1: 0.500, acc: 0.820
Test result: f1: 0.000, acc: 0.840

[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   17.0s finished


# Evaluate Our Model Further

Now we are going to shuffle the data over and over and apply our new model to the results to further determine if we want to use this model.

In [17]:
# evaluate model with kfold
kfold = KFold(n_splits=10)
results = cross_val_score(rs.best_estimator_, X, y, cv=kfold, n_jobs=-1)
print("Results: %.2f (%.2f) accuracy" % (results.mean(), results.std()))

Results: 0.70 (0.36) accuracy


## Save The Model For Future Use

In [18]:
# save model
with open(f'best_xgb_model.pickle', 'wb') as f:
    pickle.dump(rs.best_estimator_, f)

## Confirm File

Make sure the operating system you are NOT using is commented out below

In [19]:
# windows
! dir best_xgb*
# mac / linux / Unix
# ! ls -a best_xgb*

 Volume in drive C is TI10653400C
 Volume Serial Number is 24D7-5A9C

 Directory of c:\Users\tahir\Desktop\Codes\project3\00_Project_3_Type_Example

12/12/2020  12:28 AM            53,829 best_xgb_model.pickle
               1 File(s)         53,829 bytes
               0 Dir(s)  30,749,179,904 bytes free
