# Hot Project Machine Learning Model

###  This model reflects manual process for comming up witn an effective model for a classificaiton Project



In [1]:
# import dependencies needed to build our model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sqlalchemy import create_engine
import sqlite3 
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, plot_roc_curve
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
import tqdm

## EVALUATE FUNCTION

Since we will be trying lots of different models, we built a single function that will evaluate all our models and provide a standardized reporting format.

This will allow us to easily pick out the best model we want to move forward with.

This function takes in a model ( pipeline ) and our train test split data. From there it simply performes predictions and generates results

In [2]:
def evaluate(pipeline, X_train, X_test, y_train, y_test):
    '''
    Evaluate a pipeline on training and test datasets
    '''    
    pipeline.fit(X_train, y_train)
    y_train_hat = pipeline.predict(X_train)
    y_test_hat = pipeline.predict(X_test)
    train_f1 = f1_score(y_train_hat, y_train)
    train_acc = accuracy_score(y_train_hat, y_train)
    test_f1 = f1_score(y_test_hat, y_test)
    test_acc = accuracy_score(y_test_hat, y_test)

    print(f"========== Predictor: {type(pipeline).__name__} ==========")
    print(f"Training result: f1: {train_f1:.3f}, acc: {train_acc:.3f}")
    print(f"Test result: f1: {test_f1:.3f}, acc: {test_acc:.3f}")
    print()


## DATA
In this case we are reading in top 200 hot and not hot people data.  With this data we are trying to predict if an individual is hot or not based on specific features.

#### The features are:
- Sex  ->   Male or Female.
- Age -> How old is an indiviual.
- Eye Color -> Variation of eyes colors.
- Hair Color -> Different hair colors effect looks.
- Distinctive Features -> Mainly related to how an individual look like.
- Height -> How tall are they.
- Weight -> Body mass effect looks.
- Zodiac Sign -> Is your star lucky.
- Tattoo Body Art -> Do they have any inks in their body.

In [3]:
# load dataset from sqlite dataBase. Creat an engine and then use pandas to read and convert the sql table into dataframe

engine = create_engine('sqlite:///../dataBase/Are_You_Hot.db')
hot_df = pd.read_sql('select * from hot', engine)
hot_df.head(10)

Unnamed: 0,name,sex,dob,birth_year,eye_color,hair_color,distinctive_features,height(ft),weight(lbs),zodiac_sign,tattoo_body_art,hot_test,ratio(wt/ht),age
0,Angelina Jolie,Female,06/04/1975,1975,Gray,Blonde,Lips,5.5,119.0,Gemini,Yes,0,21.64,45
1,Johnny Depp,Male,06/09/1963,1963,Brown,Brown,Cheekbones,5.83,171.0,Gemini,Yes,0,29.33,57
2,Charlize Theron,Female,08/07/1975,1975,Green,Brown,Attractive,5.75,121.0,Leo,Yes,0,21.04,45
3,Brad Pitt,Male,12/18/1963,1963,Blue,Blonde,Jaw,5.91,172.0,Sagittarius,Yes,0,29.1,57
4,Amber Heard,Female,04/22/1986,1986,Blue,Blonde,Body,5.58,137.0,Taurus,Yes,0,24.55,34
5,Jared Leto,Male,12/26/1971,1971,Blue,Brown,Eyes,5.75,152.0,Capricorn,Yes,0,26.43,49
6,Natalie Portman,Female,06/09/1981,1981,Hazel,Brown,Moles,5.25,110.0,Gemini,No,0,20.95,39
7,Channing Tatum,Male,04/26/1980,1980,Green,Brown,Eyes,6.08,196.0,Taurus,Yes,0,32.24,40
8,Mila Kunis,Female,08/14/1983,1983,Green,Brown,Body,5.33,115.0,Leo,No,0,21.58,37
9,Chris Hemsworth,Male,08/11/1983,1983,Blue,Blonde,Voice,6.25,201.0,Leo,Yes,0,32.16,37


## DataFrame cleanup 

In [4]:
# Data clean up and droping columns we are not going to use to train our model.

clean_df = hot_df.drop(['name', 'dob', 'birth_year', 'height(ft)', 'weight(lbs)', 'hair_color', 'tattoo_body_art'], axis=1)
clean_df.tail()

Unnamed: 0,sex,eye_color,distinctive_features,zodiac_sign,hot_test,ratio(wt/ht),age
196,Male,Brown,Smile,Sagittarius,1,36.38,68
197,Male,Black,Face,Cancer,1,32.48,57
198,Male,Brown,Hair,Sagittarius,1,30.0,55
199,Female,Black,Smile,Cancer,1,57.09,37
200,Male,Blue,Hair,Cancer,1,27.65,38


In [88]:
y = hot_df['Eye Color'].value_counts()
y

Blue     91
Brown    55
Green    29
Hazel    19
Black     5
Gray      2
Name: Eye Color, dtype: int64

In [7]:
clean_df['distinctive_features'] = clean_df['distinctive_features'].replace({'Bald': 'Other', 'Goatee': 'Other', 'Moles': 'Other', 'Skin': 'Other', 'Height': 'Other', 'Forehead': 'Other', 'Style': 'Other', 'Cheeks': 'Other', 'Jaw': 'Other', 'Legs': 'Other', 'Stubble': 'Other'})
# clean_df['Eye Color'] = clean_df['Eye Color'].replace({'Black': 'Others', 'Gray': 'Others'})
clean_df['zodiac_sign'] = clean_df['zodiac_sign'].replace({'Pisces': 'Others', 'Aries': 'Others', 'Virgo': 'Others', 'Libra': 'Others', 'Aquarius': 'Others', 'Gemini': 'Others'})
# , 'Green and Blue': 'Others'
model_df = clean_df
model_df.head()

Unnamed: 0,sex,eye_color,distinctive_features,zodiac_sign,hot_test,ratio(wt/ht),age
0,Female,Gray,Lips,Others,0,21.64,45
1,Male,Brown,Cheekbones,Others,0,29.33,57
2,Female,Green,Attractive,Leo,0,21.04,45
3,Male,Blue,Other,Sagittarius,0,29.1,57
4,Female,Blue,Body,Taurus,0,24.55,34


In [None]:
model_df.dtypes

In [8]:
# We need to convert any string/object column (categorical variables) to integer for training our model
# We use pandas get_dummies to create OneHotEncoder variables. 

cat_col = model_df.select_dtypes(include=['object']).columns
dummies = pd.get_dummies(model_df[cat_col],drop_first=True)
without_dummies = model_df.drop(cat_col,axis=1)
model_data = pd.concat([dummies,without_dummies],axis=1)
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 32 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   sex_Male                         201 non-null    uint8  
 1   eye_color_Blue                   201 non-null    uint8  
 2   eye_color_Brown                  201 non-null    uint8  
 3   eye_color_Gray                   201 non-null    uint8  
 4   eye_color_Green                  201 non-null    uint8  
 5   eye_color_Hazel                  201 non-null    uint8  
 6   distinctive_features_Body        201 non-null    uint8  
 7   distinctive_features_Cheekbones  201 non-null    uint8  
 8   distinctive_features_Chin        201 non-null    uint8  
 9   distinctive_features_Ears        201 non-null    uint8  
 10  distinctive_features_Eyebrows    201 non-null    uint8  
 11  distinctive_features_Eyes        201 non-null    uint8  
 12  distinctive_features_F

## Extract Features from Results

## Preliminary Data Analysis

In [9]:
# make sure there is no nan
# if there is nan, we need to deal with it, either by imputing or discarding
model_data.isnull().sum(axis = 0)

sex_Male                           0
eye_color_Blue                     0
eye_color_Brown                    0
eye_color_Gray                     0
eye_color_Green                    0
eye_color_Hazel                    0
distinctive_features_Body          0
distinctive_features_Cheekbones    0
distinctive_features_Chin          0
distinctive_features_Ears          0
distinctive_features_Eyebrows      0
distinctive_features_Eyes          0
distinctive_features_Face          0
distinctive_features_Hair          0
distinctive_features_Jawline       0
distinctive_features_Lips          0
distinctive_features_Muscular      0
distinctive_features_Nose          0
distinctive_features_Other         0
distinctive_features_Smile         0
distinctive_features_Teeth         0
distinctive_features_Voice         0
zodiac_sign_Cancer                 0
zodiac_sign_Capricorn              0
zodiac_sign_Leo                    0
zodiac_sign_Others                 0
zodiac_sign_Sagittarius            0
z

## Train Test Split

The stratify argument is used to make sure the train test split data has similar populations

In [11]:
# define our X and y values for model training and test
# , 'Distinctive Features_Bald'
X = model_data.drop('hot_test', axis=1)
y = hot_df['hot_test'].values.reshape(-1)


In [12]:
# split to training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Pick A Model For A Base Point To Evaluate Other Models Against

In this case we are choosing Logistric Regression

In [13]:
# Do LogisticRegression first to establish a baseline performance

pipeline = Pipeline([
    ('scale', StandardScaler()), # scale first before feeding data into lgr
    ('lgr', LogisticRegression()),
])
evaluate(pipeline, X_train, X_test, y_train, y_test)

Training result: f1: 0.820, acc: 0.819
Test result: f1: 0.650, acc: 0.659



## Try A Few More Models...

In [14]:
# try other predictors

evaluate(XGBClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(LGBMClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(RandomForestClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)
evaluate(GradientBoostingClassifier(), X_train, X_test, y_train, y_test)

Training result: f1: 1.000, acc: 1.000
Test result: f1: 0.727, acc: 0.707

Training result: f1: 0.883, acc: 0.887
Test result: f1: 0.744, acc: 0.732

Training result: f1: 1.000, acc: 1.000
Test result: f1: 0.773, acc: 0.756

Training result: f1: 1.000, acc: 1.000
Test result: f1: 0.595, acc: 0.634

Training result: f1: 0.975, acc: 0.975
Test result: f1: 0.732, acc: 0.732



In [None]:
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
ax = plt.gca()
gbc_disp = plot_roc_curve(gbc, X_test, y_test, ax=ax, alpha=0.8)
plt.savefig('../Images/gbc_ROC.png')
plt.show()

In [None]:
xgbc = XGBClassifier(random_state=42)
xgbc.fit(X_train, y_train)
ax = plt.gca()
xgbc_disp = plot_roc_curve(xgbc, X_test, y_test, ax=ax, alpha=0.8)
plt.savefig('../Images/xgbc_ROC.png')
plt.show()

In [None]:
lgbc = LGBMClassifier(random_state=42)
lgbc.fit(X_train, y_train)
ax = plt.gca()
lgbc_disp = plot_roc_curve(lgbc, X_test, y_test, ax=ax, alpha=0.8)
plt.savefig('../Images/lgbc_ROC2.png')
plt.show()

In [None]:
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
ax = plt.gca()
dtc_disp = plot_roc_curve(dtc, X_test, y_test, ax=ax, alpha=0.8)
plt.savefig('../Images/dtc_ROC.png')
plt.show()

In [None]:
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(X_train, y_train)
ax = plt.gca()
rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=ax, alpha=0.8)
dtc_disp.plot(ax=ax, alpha=0.8)
lgbc_disp.plot(ax=ax, alpha=0.8)
xgbc_disp.plot(ax=ax, alpha=0.8)
gbc_disp.plot(ax=ax, alpha=0.8)
plt.savefig('../Images/rfc_ROC.png')
plt.show()

## Let's Pick a Final Model To Move Forward With

From the above evaluations, it looks like LGBMClassifier is a very promising candidate

We will then hypertune the classifier model to come up with the best model we can.

## Let's Create Our Tuning Object

In [15]:
# RandomizedSearchCV on LGBM

lgbm_param_grid = {
    'n_estimators': [10, 20, 50, 100, 200, 300, 400],
    'max_depth': np.arange(5, 20),
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': np.arange(0.5, 1.0, 0.05),
    'min_child_weight': np.arange(1, 10),
    'colsample_bytree': np.arange(0.2, 1.0, 0.1),
    'gamma': [0, 0.001, 0.002, 0.003, 0.004, 0.005, 1e-2],
    'n_jobs': [-1]
}

# lgbm_param_grid = {
#     'learning_rate': [ 0.1],
#     'num_leaves': [31],
#     'boosting_type' : ['gbdt'],
#     'objective' : ['binary']
# }

# lgbm_param_grid = {

#         # 'bagging_fraction': (0.5, 0.8),
#         # 'bagging_frequency': (5, 8),
#         'n_jobs': [-1]
#         'feature_fraction': (0.5, 0.8),
#         'max_depth': (10, 13),
#         'min_data_in_leaf': (90, 120),
#         'num_leaves': (1200, 1550)

# }

## Let's find The Best Model We Can

The RandomizedSearchCV function will try all our combinations above and select the most accurate model.  

That best model is found in the best_estimator_ property of the RandomizedSerachCV object. 

In [16]:
predictor = LGBMClassifier()
# predictor = XGBClassifier()
rs = RandomizedSearchCV(predictor, lgbm_param_grid, cv=5, scoring='f1', n_jobs=-1, n_iter=100, verbose=1)
rs.fit(X_train, y_train)
evaluate(rs.best_estimator_, X_train, X_test, y_train, y_test)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s
Training result: f1: 0.774, acc: 0.781
Test result: f1: 0.773, acc: 0.756

[Parallel(n_jobs=-1)]: Done 485 out of 500 | elapsed:    6.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    6.8s finished


# Evaluate Our Model Further

Now we are going to shuffle the data over and over and apply our new model to the results to further determine if we want to use this model.

In [17]:
# evaluate model with kfold

kfold = KFold(n_splits=10)
results = cross_val_score(rs.best_estimator_, X, y, cv=kfold, n_jobs=-1)
print("Results: %.2f (%.2f) accuracy" % (results.mean(), results.std()))

Results: 0.69 (0.10) accuracy


## Save The Model For Future Use

In [19]:
model_columns = list(X.columns)
with open(f'model_columns.pickle', 'wb') as f:
    pickle.dump(model_columns, f)

In [18]:
# save model

with open(f'lgbm_model.pickle', 'wb') as f:
    pickle.dump(rs.best_estimator_, f)
# with open(f'xgb_model.pickle', 'wb') as f:
#     pickle.dump(rs.best_estimator_, f)

## Confirm File

Make sure the operating system you are NOT using is commented out below

In [20]:
# windows
# ! dir best_xgb*
! dir lgbm_model*
# ! dir xgb_model*
! dir model_columns*
# mac / linux / Unix
# ! ls -a best_xgb*

 Volume in drive C is TI10653400C
 Volume Serial Number is 24D7-5A9C

 Directory of c:\Users\tahir\Desktop\repos\are_you_hot\Project_3_Hot_Model

12/17/2020  06:01 PM           113,587 lgbm_model.pickle
               1 File(s)        113,587 bytes
               0 Dir(s)  33,010,659,328 bytes free
 Volume in drive C is TI10653400C
 Volume Serial Number is 24D7-5A9C

 Directory of c:\Users\tahir\Desktop\repos\are_you_hot\Project_3_Hot_Model

12/17/2020  06:02 PM               874 model_columns.pickle
               1 File(s)            874 bytes
               0 Dir(s)  33,010,663,424 bytes free
