# creating a logistic regression to predict absenteesim

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,1,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,1,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,1,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,1,5,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2


### creating the targets

logistic regression is a form of classification. so we will essentially be categorizing the people into classes. 

the classes can be up to us. we will target 2 classes. one for the group of people who have been excessively absent. and one group for people who have been moderately absent. 

the method we will use for this is a bit naive, but numerically stable. we will take the average absenteesim time and use it as a cutoff point 

In [3]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

anyone absent for more than 3 hours will be thought of as excesively absent

In [4]:
#we can use np.where(condition, value if True, value if False) to filter data 
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [5]:
data_preprocessed['Excsessive Abensteesim'] = targets
data_preprocessed.head()

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excsessive Abensteesim
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,1,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### a comment on the targets

using the median is numerically stable. this is because, by using the median, we have balanced the dataset. roughly half the targets are 1s, and half are 0s. this will prevent prior biases

In [6]:
targets.sum() / targets.shape[0]   #see how many targets are 1s. 

#we see that arounf 46% of the targets are 1s. which roughly halved., so the data is balanced 

0.45571428571428574

In [7]:
#drop the absenteesim column 
#// data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis = 1)
#// data_with_targets

#after running with this config. we removed other features also based on thier low contribution to the model

data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of Week', 'Daily Work Load Average',
                                            'Distance to Work'], axis = 1)
data_with_targets

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excsessive Abensteesim
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,1,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,1,5,179,40,22,1,2,0,1
696,1,0,0,1,5,225,28,24,0,1,2,0
697,1,0,0,1,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


### select inputs for regression

In [8]:
data_with_targets.shape

(700, 12)

In [9]:
# DataFram.iloc[row_indices,column_indices] selects (slices) data by position when givem rows and columns wanted

unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,1,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,1,5,179,40,22,1,2,0
696,1,0,0,1,5,225,28,24,0,1,2
697,1,0,0,1,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


### standardizing the data

we proceeded with the original standardization and found that even the dummies had been standardized. this is a problem as the data was categorical. so no we have to modify the standardization procedure so the dummies remain unchanged

old code will be commented out with a #// 

In [10]:
#// from sklearn.preprocessing import StandardScaler

In [11]:
#used to scale data

#// absenteesim_scalar = StandardScaler()  

scaling is done by subtracting the mean and dividing by the standard deviation for each feature 

In [12]:
#this is the new scaling code
#this is a custom scaler based on the sklearn scaler
#this has an extra argument, columns to scale, so the custom scaler will only standardized the columns we choose
#in this way we can avoid standardizing the dummies 

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean_  = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X , y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [13]:
unscaled_inputs.columns.values

array(['reason_1', 'reason_2', 'reason_3', 'reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [14]:
#modifying this after backward elimination
#//columns_to_scale = ['Month Value','Day of Week', 'Transportation Expense', 'Distance to Work', 'Age',
#//       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']  #remove dummy variables

columns_to_omit = ['reason_1', 'reason_2', 'reason_3', 'reason_4','Education']

In [15]:
#use List Comprehension. this is a python syntax construct which allows us to create a list from existing lists based on 
#loops, conditionals etc

columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]  

In [16]:
absenteesim_scaler = CustomScaler(columns_to_scale)

In [17]:
absenteesim_scaler.fit(unscaled_inputs) #prepare scaling mechanism for unscaled_inputs

CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'])

In [18]:
#// absenteesim_scalar.fit(unscaled_inputs)  

In [19]:
scaled_inputs = absenteesim_scaler.transform(unscaled_inputs)  #scale the unscaled innputs

In [20]:
scaled_inputs  #all input data has been standardized, except dummys 

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,1,0.030796,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,1,-0.568019,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,1,-0.568019,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,1,-0.568019,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.568019,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [21]:
scaled_inputs.shape

(700, 11)

### split data into test  / train . then shuffle 

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
train_test_split(scaled_inputs, targets)  #creates 4 arrays, train inputs/targets and test inputs/targets

[     reason_1  reason_2  reason_3  reason_4  Month Value  \
 625         0         0         0         1     0.330204   
 331         1         0         0         1     1.228426   
 306         0         0         0         1     0.929019   
 472         0         0         0         1     0.030796   
 319         1         0         0         1     0.330204   
 ..        ...       ...       ...       ...          ...   
 386         0         0         0         1    -1.466241   
 288         1         0         0         1     0.629611   
 679         1         0         0         1     0.929019   
 320         0         0         0         1     1.228426   
 521         1         0         0         1     0.929019   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 625               -0.654143  0.248310         1.002633          0 -0.919030   
 331                1.036026  0.562059        -0.408580          0 -0.019280   
 306               -1.5746

In [24]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, shuffle=True, random_state=20) 
#x is inputs, y is targets 
#train size 0.8 is 80/20 split 
#shuffle = True is on by default. but will shuffle everytime the code is run.
#you can use random_state to a fixed number so it will always shuffle in the same way 

In [25]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [26]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic Regression With sklearn

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### training the model

In [28]:
reg = LogisticRegression()
reg.fit(x_train, y_train)  #fit regression using training data

LogisticRegression()

In [29]:
reg.score(x_train, y_train)  #get model accuracy, seems to be 78%

0.7928571428571428

### Manually checking the accuracy

In [30]:
model_outputs = reg.predict(x_train)  #store predicted outputs for training data
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [31]:
y_train  #actual ouputs 

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [32]:
model_outputs == y_train  #get bool check for values so we can compare

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [33]:
np.sum((model_outputs == y_train)) #total number of matches

444

In [34]:
model_outputs.shape[0]

560

In [35]:
438/560  #calculated accuracy. same as one given by score() method

0.7821428571428571

### finding the intercepts and coefficients 

In [36]:
reg.intercept_   #bias 

array([-3.18168165])

In [37]:
reg.coef_  #weights

array([[ 1.97959176,  0.46747251,  2.37879175,  2.42832953,  0.02188835,
         0.68342679, -0.18127352,  0.31390575, -0.27732153,  0.37876225,
        -0.29636369]])

In [38]:
feature_names = unscaled_inputs.columns.values  #store column names 

In [39]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_names)  #create summary table with one column

In [40]:
summary_table['Coefficent'] = np.transpose(reg.coef_)  #create column with coeficient
#use tranpose to convert nd array to rows 

In [41]:
summary_table

Unnamed: 0,Feature name,Coefficent
0,reason_1,1.979592
1,reason_2,0.467473
2,reason_3,2.378792
3,reason_4,2.42833
4,Month Value,0.021888
5,Transportation Expense,0.683427
6,Age,-0.181274
7,Body Mass Index,0.313906
8,Education,-0.277322
9,Children,0.378762


we want to add the coefficents in the first row, however by default, new values will be added to the bottom. so we m ove all rows down by 1, leaving index 0 empty. we can then add the data to index 0

In [42]:
summary_table.index = summary_table.index + 1  #shift all indices by 1 

In [43]:
summary_table

Unnamed: 0,Feature name,Coefficent
1,reason_1,1.979592
2,reason_2,0.467473
3,reason_3,2.378792
4,reason_4,2.42833
5,Month Value,0.021888
6,Transportation Expense,0.683427
7,Age,-0.181274
8,Body Mass Index,0.313906
9,Education,-0.277322
10,Children,0.378762


In [44]:
summary_table.loc[0]=['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index() 
summary_table  #full summary table

Unnamed: 0,Feature name,Coefficent
0,Intercept,-3.181682
1,reason_1,1.979592
2,reason_2,0.467473
3,reason_3,2.378792
4,reason_4,2.42833
5,Month Value,0.021888
6,Transportation Expense,0.683427
7,Age,-0.181274
8,Body Mass Index,0.313906
9,Education,-0.277322


### interpreting the coefficients

the closer the coefficients are to 0, the smaller their impact on the target<br>
-this is true for our model since it is standardized, but may not be the case otherwise 



In [45]:
#the coeficents above are for the log() function since we did logistic regression
#we can also get the normal coefficents by calcuating the exponent 
summary_table['Odds Ratio'] = np.exp(summary_table.Coefficent)
summary_table

Unnamed: 0,Feature name,Coefficent,Odds Ratio
0,Intercept,-3.181682,0.041516
1,reason_1,1.979592,7.239787
2,reason_2,0.467473,1.595955
3,reason_3,2.378792,10.791856
4,reason_4,2.42833,11.339923
5,Month Value,0.021888,1.02213
6,Transportation Expense,0.683427,1.980653
7,Age,-0.181274,0.834207
8,Body Mass Index,0.313906,1.368761
9,Education,-0.277322,0.757811


In [46]:
summary_table.sort_values('Odds Ratio',ascending=False)  #sort table by odds ratio in desc order 

Unnamed: 0,Feature name,Coefficent,Odds Ratio
4,reason_4,2.42833,11.339923
3,reason_3,2.378792,10.791856
1,reason_1,1.979592,7.239787
6,Transportation Expense,0.683427,1.980653
2,reason_2,0.467473,1.595955
10,Children,0.378762,1.460476
8,Body Mass Index,0.313906,1.368761
5,Month Value,0.021888,1.02213
7,Age,-0.181274,0.834207
9,Education,-0.277322,0.757811


#### a feature is not important: <br>
        - if its coefficent is around 0 <br>
        - if its odds ratio is around 1 <br>
        
        
--After modifying the scaler to ignore dummy variables, odds ratio increased for the reason_?? rows 


!!keys-- <br>
     - reason_1 = various diseases <br>
     - reason_2 = pregnancy related <br>
     - reason_3 = poisoning <br>
     - reason_4 = light diseases



standardized models (almost) always yield higher accuracy

machine learnin engineers prefer models with higher accuracy, so they normally go for standardization

econometricians and statisticians prefer less accurate but more interpretable models because they care about the underlying reasons behind different phenomena 

Data scientists may be in either position. sometimes they need higher accuracy, and otehr times they need to find the main drivers of a problem

the bias (intercept) calibrates the model

### Backward Elimination

the idea is that we can simplify our model by removing all the features which have close to no contribution to the model. 

when we have p-values, we get rid of all coefficients with p>0.05

if the weight is small enough, it wont make a difference anyway

we will drop day of week, distance to work, and daily work load average  from the dfirst checkpoint and rerun the code. 

-- after dropping, our accuracy improved slightly. confirming that the dropped variables were useless

### Test model

In [47]:
reg.score(x_test, y_test)  #actual model accuracy on test data 

0.7357142857142858

In [48]:
#get probability of output being 0 or 1

predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.74719107, 0.25280893],
       [0.58626284, 0.41373716],
       [0.44624742, 0.55375258],
       [0.755758  , 0.244242  ],
       [0.07303193, 0.92696807],
       [0.29942655, 0.70057345],
       [0.29911945, 0.70088055],
       [0.09862839, 0.90137161],
       [0.7367704 , 0.2632296 ],
       [0.75088681, 0.24911319],
       [0.47519136, 0.52480864],
       [0.16052665, 0.83947335],
       [0.05114643, 0.94885357],
       [0.70309849, 0.29690151],
       [0.2765338 , 0.7234662 ],
       [0.49548863, 0.50451137],
       [0.47355728, 0.52644272],
       [0.48173264, 0.51826736],
       [0.36145045, 0.63854955],
       [0.05029259, 0.94970741],
       [0.73165498, 0.26834502],
       [0.755758  , 0.244242  ],
       [0.48648656, 0.51351344],
       [0.47994051, 0.52005949],
       [0.20917897, 0.79082103],
       [0.73803941, 0.26196059],
       [0.48392466, 0.51607534],
       [0.88417698, 0.11582302],
       [0.21832672, 0.78167328],
       [0.755758  , 0.244242  ],
       [0.

In [49]:
predicted_proba.shape  #first columns is probability of being 0. second is probability of being 1

(140, 2)

In [50]:
#we need probability of excessive absenteesim. so probability of getting 1 
predicted_proba[:,1]

array([0.25280893, 0.41373716, 0.55375258, 0.244242  , 0.92696807,
       0.70057345, 0.70088055, 0.90137161, 0.2632296 , 0.24911319,
       0.52480864, 0.83947335, 0.94885357, 0.29690151, 0.7234662 ,
       0.50451137, 0.52644272, 0.51826736, 0.63854955, 0.94970741,
       0.26834502, 0.244242  , 0.51351344, 0.52005949, 0.79082103,
       0.26196059, 0.51607534, 0.11582302, 0.78167328, 0.244242  ,
       0.4073934 , 0.71588246, 0.71010722, 0.52807624, 0.244242  ,
       0.63633787, 0.26450256, 0.79692052, 0.46831451, 0.63678348,
       0.24303433, 0.48020627, 0.25943442, 0.13390095, 0.82423384,
       0.61141569, 0.72208959, 0.24666951, 0.25566032, 0.24183071,
       0.49330339, 0.08439558, 0.70057345, 0.24947477, 0.83631403,
       0.41691994, 0.92398523, 0.26283315, 0.09747573, 0.09805379,
       0.70909733, 0.70468133, 0.26666015, 0.79618992, 0.24938722,
       0.251573  , 0.01555862, 0.26577946, 0.78533587, 0.30240271,
       0.25692413, 0.09557931, 0.90042805, 0.4650523 , 0.63025

### Save the model 

saving the model means saving the reg object 

we can do this using pickle[module]. this is a python module used to convert a python object into a character stream

In [51]:
import pickle

In [52]:
#model is the file name   
#wb is write bytes. when we unpickle (extract model), we use rb (read bytes)
#dump is saving the info in a file, when we unpickle, we laod it
#reg is the object to be dumped 
### we must save the absenteesim scaler too. this contains the columns to scale and the mean and sd of the features. 
#the info in the absenteesim scalar is used to preprocess new data
# so we must pickle the scaler too

with open('model', 'wb') as file:  
    pickle.dump(reg, file)
    
with open('scaler', 'wb') as file:
    pickle.dump(absenteesim_scaler, file)

the second step of the deployment is about creating a mechanism to load the saved model and make predictions 