# Create a Logistic Regression to predict absenteeism

## 1. Import the relevant libraries

In [None]:
import numpy as np
import pandas as pd

## 2. Load the data

In [None]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [None]:
data_preprocessed

## 3. Create the Target

In [None]:
data_preprocessed['Absenteeism Time in Hours'].median()

# What are these classes?
# Moderately absent (<= 3 hours)
# Excessively absent (>= 4 hours)

In [None]:
# Label a row 1 when its absenteeism hours exceed the column median, else 0.
# The median is computed from the data rather than hard-coded as a literal.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)

In [None]:
data_preprocessed['Excessive Absenteeism'] = targets

In [None]:
data_preprocessed.head(4)

In [None]:
### A comment on the targets

In [None]:
targets.sum() / targets.shape[0]

# Around 46% of the targets are 1s and 54% of the targets are 0s
# A balance of 45-55 is almost always sufficient

In [None]:
# Let's drop 'Absenteeism Time in Hours' since we don't need it anymore,
# and drop the 'Date' column as well.
columns_to_drop = ['Absenteeism Time in Hours', 'Date']
data_with_targets = data_preprocessed.drop(columns=columns_to_drop)



In [None]:
data_with_targets is data_preprocessed

In [None]:
data_with_targets.head()

## 4. Select the inputs for the regression

In [None]:
data_with_targets.shape

In [None]:
data_with_targets.iloc[:, 0:14]

In [None]:
# Or
data_with_targets.iloc[:,:-1]

In [None]:
# Let's create a new variable for the inputs
unscaled_inputs = data_with_targets.iloc[:,:-1]

## 5. Standardize the data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Create the scaler and fit it on the unscaled inputs.
# NOTE(review): the fit uses every row of unscaled_inputs, and the train/test
# split only happens later in this notebook, so rows that end up in the test
# set also contribute to the scaler's fit -- consider fitting on the training
# split only.
absenteeism_scaler = StandardScaler()
absenteeism_scaler.fit(unscaled_inputs) 

In [None]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [None]:
scaled_inputs

In [None]:
scaled_inputs.shape

## 6. Splitting the Data for Training and Testing

In [None]:
# Split the data into train & test and shuffle

from sklearn.model_selection import train_test_split

In [None]:
# Split
train_test_split(scaled_inputs, targets)

In [None]:
# Hold out 20% of the rows for testing (train_size=0.8 keeps 80% for training).
# random_state pins the shuffle so the split -- and therefore every score and
# the model saved further down -- is the same on every run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs, targets, train_size=0.8, random_state=20
)

In [None]:
print(x_train.shape, y_train.shape)

In [None]:
print(x_test.shape, y_test.shape)

In [None]:
# Report the shape of each split, one labelled line per array.
for split_label, split_array in (
    (" x_train: ", x_train),
    (" y_train: ", y_train),
    (" x_test: ", x_test),
    (" y_test: ", y_test),
):
    print(split_label, split_array.shape)

# Machine Learning Models

## 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### 1.1 Training the logistic model

In [None]:
reg = LogisticRegression()

In [None]:
reg.fit(x_train, y_train)

In [None]:
reg.score(x_train, y_train)

### 1.2 Manually check the accuracy

In [None]:
model_outputs = reg.predict(x_train)
model_outputs
# These are the model's predictions (its outputs)

In [None]:
y_train
# These are our targets from the data

In [None]:
model_outputs == y_train
# True = 1
# False = 0

In [None]:
np.sum(model_outputs==y_train)
# is the total number of correct predictions (True entries)

In [None]:
# Observations
model_outputs.shape[0]

In [None]:
# Accuracy = correct predictions / number of observations
correct_predictions = np.sum(model_outputs == y_train)
n_observations = model_outputs.shape[0]
Accuracy = correct_predictions / n_observations

In [None]:
Accuracy

## 2. Logistic Regression with Hyperparameter Optimization

In [None]:
import joblib
from sklearn.model_selection import GridSearchCV
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
def print_results(results):
    """Print the best parameters and per-candidate scores of a fitted search.

    Args:
        results: A fitted search object exposing ``best_params_`` and a
            ``cv_results_`` mapping with ``mean_test_score``,
            ``std_test_score`` and ``params`` entries (for example the
            ``GridSearchCV`` instance fitted in this notebook).

    Prints the best parameters first, then one line per candidate: the mean
    test score, a +/- band of two standard deviations, and the parameters
    that produced it. Both numbers are rounded to three decimals.
    """
    print("BEST PARAMS: {}\n".format(results.best_params_))

    cv_results = results.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']

    for mean, std, params in zip(means, stds, cv_results['params']):
        # The band is reported as +/- two standard deviations.
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [None]:
# Candidate values of C that the grid search will try
parameters = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Base estimator handed to the search
lr = LogisticRegression()

# Grid search with 5-fold cross-validation on the training split
cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=5)
cv.fit(x_train, y_train)

print_results(cv)

In [None]:
# Check the best parameter
cv.best_estimator_

In [None]:
# Write out the selected model
joblib.dump(cv.best_estimator_, 'LR_model.pkl')