# ML Model Training

Note: Install the **libomp.dylib** library using a package manager like Homebrew.

`brew install libomp`

This library is required for LightGBM to function properly.

### Import Dependencies

In [1]:
## Data analysis packages
import os
import sys
import numpy as np
import pandas as pd
import joblib

## Machine learning packages
import sklearn
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_curve, auc
from category_encoders import OrdinalEncoder

import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Record the library versions used for this run so results can be reproduced.
version_lines = [
    f"Pandas Version: {pd.__version__}",
    f"Numpy Version: {np.__version__}",
    f"scikit-learn Version: {sklearn.__version__}",
    f"joblib Version: {joblib.__version__}",
    f"lightgbm Version: {lgb.__version__}",
]
print("\n".join(version_lines))

Pandas Version: 2.0.3
Numpy Version: 1.24.4
scikit-learn Version: 1.3.2
joblib Version: 1.4.2
lightgbm Version: 4.5.0


### Load Dataset

In [3]:

# Path to the placement dataset (CSV)
data_file = './data/Placement_Data_Full_Class.csv'

# Load the placement dataset.
# Only the errors expected from a missing, empty or malformed file are caught;
# a bare `except:` would also swallow KeyboardInterrupt and unrelated bugs and
# hide the real cause of the failure.
try:
    data = pd.read_csv(data_file)
    print("The dataset has {} samples with {} features.".format(*data.shape))
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as load_error:
    print("The dataset could not be loaded. Is the dataset missing?")
    # Surface the underlying cause so the failure can be diagnosed.
    print(f"Reason: {load_error}")

The dataset has 215 samples with 15 features.


In [4]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [5]:
# Summarise column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


### Data Preparation

##### Define target column

In [6]:
# Encode the placement outcome as a binary label.
# Note that "Not Placed" is the positive class (1).
status_to_label = {"Placed": 0, "Not Placed": 1}
target = data['status'].map(status_to_label)

##### Define numeric and categorical features

In [7]:
# Columns that must not be used as model inputs: the row id, the target itself,
# and salary (null for a third of the rows; it appears to be recorded only for
# placed students, so it would leak the target).
exclude_feature = ['sl_no', 'salary', 'status']


def _without_excluded(columns):
    """Return the given column names minus those listed in `exclude_feature`."""
    return [name for name in columns if name not in exclude_feature]


# Split the columns by dtype, then drop the excluded ones from each group.
numeric_columns = list(data.select_dtypes(include=['int64', 'float64']).columns)
categorical_columns = list(data.select_dtypes(include=['object']).columns)

numeric_features = _without_excluded(numeric_columns)
categorical_features = _without_excluded(categorical_columns)

##### Define final feature list for training and validation

In [8]:
# Model input columns: numeric features first, then categorical ones.
features = [*numeric_features, *categorical_features]

##### Final data for training and validation

In [9]:
# Keep only the model inputs and replace any missing values with 0.
# NOTE: this rebinds `data`; cells above that read dropped columns (e.g.
# `status`) need the dataset to be reloaded before they can be re-run.
data = data.loc[:, features].fillna(0)

##### Split data in train and validation


In [10]:
# Hold out 15% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(data, target, test_size=0.15, random_state=10)
X_train, X_valid, y_train, y_valid = split_parts

# Persist the validation rows (still un-encoded at this point) as JSON Lines.
X_valid.to_json('./data/valid.json', orient='records', lines=True)

##### Perform ordinal encoding for the categorical variables

In [11]:
# Fit the encoder on the training split only, then apply the same fitted
# mapping to both splits.
le = OrdinalEncoder(cols=categorical_features).fit(X_train[categorical_features])
for frame in (X_train, X_valid):
    frame[categorical_features] = le.transform(frame[categorical_features])

### Train model

##### Perform model training

In [12]:
# LightGBM classifier with library defaults; only the random seed is pinned.
lgbm_params = {"random_state": 10}
clf = LGBMClassifier(**lgbm_params)
clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 53, number of negative: 129
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 238
[LightGBM] [Info] Number of data points in the train set: 182, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291209 -> initscore=-0.889520
[LightGBM] [Info] Start training from score -0.889520


### Evaluate model

##### Validation AUC

In [13]:
# Predicted probability of class 1 (the second column of predict_proba).
valid_prediction = clf.predict_proba(X_valid)[:, 1]

# Build the ROC curve and integrate it to obtain the area under the curve.
fpr, tpr, thresholds = roc_curve(y_valid, valid_prediction)
roc_auc = auc(fpr, tpr)

separator = "====================================="
print(separator, "Validation AUC:{}".format(roc_auc), separator, sep="\n")

Validation AUC:0.9135338345864662


##### Perform model evaluation 

In [14]:
# Hard class predictions on the validation split, scored per class.
valid_labels = clf.predict(X_valid)
print(classification_report(y_valid, valid_labels))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84        19
           1       0.89      0.57      0.70        14

    accuracy                           0.79        33
   macro avg       0.82      0.76      0.77        33
weighted avg       0.81      0.79      0.78        33



### Save model

In [15]:
# joblib.dump does not create missing directories, so make sure the output
# folder exists; otherwise this cell fails with FileNotFoundError.
os.makedirs('./model', exist_ok=True)

# Persist the fitted encoder, the trained classifier and the feature lists.
joblib.dump(le, './model/label_encoder.joblib')
joblib.dump(clf, './model/lgb_model.joblib')
joblib.dump(features, './model/features.joblib')
joblib.dump(categorical_features, './model/categorical_features.joblib')

['./model/categorical_features.joblib']

# Local API

Open a terminal, go to the project folder and activate the conda environment. Run the following commands to test the local deployment server:

1. `conda activate conda-env-name`

2. `cd app/`

3. `python main.py`

Expected response:

~~~
INFO:     Will watch for changes in these directories: ['/root/project-folder/app']
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO:     Started reloader process [1294] using StatReload
INFO:     Started server process [1297]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     127.0.0.1:64944 - "GET / HTTP/1.1" 200 OK
~~~

In [21]:
import requests

# Example student record sent to the local prediction endpoint.
# A separate name is used so the `data` DataFrame from the training section
# is not overwritten by this dict.
payload = {
  "sl_no": 112,
  "ssc_p": 84.0,
  "hsc_p": 90.9,
  "degree_p": 64.5,
  "etest_p": 86.04,
  "mba_p": 59.42,
  "gender": "M",
  "ssc_b": "Others",
  "hsc_b": "Others",
  "hsc_s": "Science",
  "degree_t": "Sci&Tech",
  "workex": "No",
  "specialisation": "Mkt&Fin"
}

# The server binds to 0.0.0.0, but that is a listen address rather than a
# portable destination (connecting to it fails on some platforms), so the
# client targets the loopback address. The timeout keeps the cell from hanging
# forever when the server is not running.
response = requests.post("http://127.0.0.1:8000/predict", json=payload, timeout=10)
print(response.text)

{"prediction":["Placed"]}


In [22]:
# Second example student record sent to the local prediction endpoint.
# A separate name is used so the `data` DataFrame from the training section
# is not overwritten by this dict.
payload = {
  "sl_no": 113,
  "ssc_p": 52.0,
  "hsc_p": 57.0,
  "degree_p": 50.8,
  "etest_p": 67.0,
  "mba_p": 62.79,
  "gender": "M",
  "ssc_b": "Central",
  "hsc_b": "Central",
  "hsc_s": "Commerce",
  "degree_t": "Comm&Mgmt",
  "workex": "No",
  "specialisation": "Mkt&HR"
}

# The server binds to 0.0.0.0, but that is a listen address rather than a
# portable destination (connecting to it fails on some platforms), so the
# client targets the loopback address. The timeout keeps the cell from hanging
# forever when the server is not running.
response = requests.post("http://127.0.0.1:8000/predict", json=payload, timeout=10)
print(response.text)

{"prediction":["Not Placed"]}
