In [1]:
import pandas as pd
import numpy as np

In [2]:
example_folder = "C:\\Ricardo\\Ricardo\\curso - The Data Science Course 2019 Complete Data Science Bootcamp\\Proyectos\\56 - files\\";
data_preprocessed = pd.read_csv(example_folder + 'Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [4]:
data_preprocessed['Absenteeism Time in Hours'].describe()

count    700.000000
mean       6.761429
std       12.670082
min        0.000000
25%        2.000000
50%        3.000000
75%        8.000000
max      120.000000
Name: Absenteeism Time in Hours, dtype: float64

In [5]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

#### The absenteeism data median seems to be a great threshold to assign someone as absented

In [6]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0) 

In [7]:
print(len(targets))
targets[:5]

700


array([1, 0, 0, 1, 0])

#### Creating the targets

In [8]:
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [10]:
targets.sum()/ targets.shape[0]

0.45571428571428574

In [11]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week', 'Daily Work Load Average', 'Distance to Work'], axis=1)

In [12]:
data_with_targets is data_preprocessed #if False then checkpoint done

False

#### Select input for regression model

In [13]:
data_with_targets.shape

(700, 12)

In [14]:
data_with_targets.iloc[:,:14].head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [15]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

#### Standardizing data

In [16]:
from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [18]:
class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns, copy=True,with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_= None
        self.var_= None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [19]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [20]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [21]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [22]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pet'],
             copy=None, with_mean=None, with_std=None)

In [23]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [24]:
print(scaled_inputs.shape)
scaled_inputs[:2]

(700, 11)


Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,0,-0.01928,-0.58969


#### Splitting data into training and test

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size=0.2, random_state=20)

In [27]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [29]:
reg = LogisticRegression()

In [30]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
reg.score(x_train, y_train)

0.775

#### Checking accuracy of model

In [32]:
model_outputs = reg.predict(x_train)
model_outputs[:5]

array([0, 1, 1, 1, 1])

In [33]:
(model_outputs == y_train)[:5]

array([ True,  True,  True,  True,  True])

In [34]:
np.sum((model_outputs == y_train))/model_outputs.shape[0]

0.775

In [35]:
feature_name = unscaled_inputs.columns.values

In [36]:
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)

In [37]:
summary_table['Coefficient'] = np.transpose(reg.coef_)
#summary_table['Coefficient'] = reg.coef_

In [38]:
summary_table.head()

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.602372
1,Reason_2,0.8435
2,Reason_3,2.940787
3,Reason_4,0.637234
4,Month Value,0.005651


In [39]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table.head()

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.431381
1,Reason_1,2.602372
2,Reason_2,0.8435
3,Reason_3,2.940787
4,Reason_4,0.637234


#### It is necessary to apply exponential to these results to interpret them because of the coefficients are logs of model results  

In [40]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [41]:
summary_table.head()

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.431381,0.238979
1,Reason_1,2.602372,13.495716
2,Reason_2,0.8435,2.324489
3,Reason_3,2.940787,18.930743
4,Reason_4,0.637234,1.891243


In [42]:
summary_table.sort_values('Odds_ratio',ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.940787,18.930743
1,Reason_1,2.602372,13.495716
2,Reason_2,0.8435,2.324489
4,Reason_4,0.637234,1.891243
6,Transportation Expense,0.619534,1.858062
10,Children,0.35195,1.421838
8,Body Mass Index,0.284103,1.32857
5,Month Value,0.005651,1.005667
7,Age,-0.176355,0.83832
9,Education,-0.263725,0.768185


#### Testing

In [43]:
reg.score(x_test,y_test)

0.7357142857142858

In [45]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba[:5]

array([[0.75308922, 0.24691078],
       [0.60926091, 0.39073909],
       [0.4859575 , 0.5140425 ],
       [0.7552847 , 0.2447153 ],
       [0.0839675 , 0.9160325 ]])

In [46]:
predicted_proba.shape

(140, 2)

In [48]:
predicted_proba[:5,1]

array([0.24691078, 0.39073909, 0.5140425 , 0.2447153 , 0.9160325 ])

#### Saving the model

In [49]:
import pickle

In [50]:
with open('model','wb') as file:
    pickle.dump(reg, file)

In [51]:
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)