In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

In [2]:
df = pd.read_csv('Absenteeism_at_work6.csv')
df.head()

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average,Hit target,Disciplinary failure,Education,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239.0,97,0,1,30,4
1,36,0,7,3,1,118,13,18,50,239.0,97,1,1,31,0
2,3,23,7,4,1,179,51,18,38,239.0,97,0,1,31,2
3,7,7,7,5,1,279,5,14,39,239.0,97,0,1,24,4
4,11,23,7,5,1,289,36,13,33,239.0,97,0,1,30,2


In [3]:
X_train = df.values
X_scaled = preprocessing.scale(X_train)
X_scaled

array([[-0.63716146,  0.80493838,  0.19676258, ..., -0.43385741,
         0.77593223, -0.21951111],
       [ 1.63271857, -2.28012426,  0.19676258, ..., -0.43385741,
         1.00943766, -0.51976661],
       [-1.36352307,  0.44896961,  0.19676258, ..., -0.43385741,
         1.00943766, -0.36963886],
       ...,
       [-1.27272787, -2.28012426, -1.84169773, ..., -0.43385741,
         1.70995395, -0.51976661],
       [-0.90954706, -2.28012426, -1.84169773, ..., -0.43385741,
         1.94345938, -0.51976661],
       [ 1.54192337, -2.28012426, -1.84169773, ..., -0.43385741,
        -0.39159492, -0.51976661]])

In [4]:
# Standardize dataset
dataset = X_scaled
print(dataset)

[[-0.63716146  0.80493838  0.19676258 ... -0.43385741  0.77593223
  -0.21951111]
 [ 1.63271857 -2.28012426  0.19676258 ... -0.43385741  1.00943766
  -0.51976661]
 [-1.36352307  0.44896961  0.19676258 ... -0.43385741  1.00943766
  -0.36963886]
 ...
 [-1.27272787 -2.28012426 -1.84169773 ... -0.43385741  1.70995395
  -0.51976661]
 [-0.90954706 -2.28012426 -1.84169773 ... -0.43385741  1.94345938
  -0.51976661]
 [ 1.54192337 -2.28012426 -1.84169773 ... -0.43385741 -0.39159492
  -0.51976661]]


In [5]:
# Example of separating data by class value
# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
            separated[class_value].append(vector)
    return separated

In [6]:
# Test separating data by class
#dataset = array
separated = separate_by_class(dataset)
for label in separated:
    print(label)
    for row in separated[label]:
        print(row)

-0.21951111250831784
[-0.63716146  0.80493838  0.19676258 -0.64394734 -1.3901746   1.01140838
  0.42955567  0.10176977 -0.53286845 -0.89110796  0.63868581 -0.23904572
 -0.43385741  0.77593223 -0.21951111]
-0.5197666083607304
[ 1.63271857 -2.28012426  0.19676258 -0.64394734 -1.3901746  -1.54437915
 -1.12169354  1.24282479  2.09286015 -0.89110796  0.63868581  4.18330013
 -0.43385741  1.00943766 -0.51976661]
2.4827883501633954
[-0.36477586 -0.02565541  0.19676258 -1.34781887 -1.3901746  -0.99137249
 -1.18913915  0.32998078 -0.37841383 -0.89110796  0.63868581 -0.23904572
 -0.43385741 -0.39159492  2.48278835]
3.683810333573046
[ 1.45112817 -0.02565541  1.65280566 -0.64394734  1.30990205 -1.54437915
 -1.32403039 -0.58286324  0.08495004 -0.36826795  0.63868581 -0.23904572
 -0.43385741  0.30892137  3.68381033]
-0.1444472385452147
[-0.27398065  0.44896961  1.07038843  0.76379573  1.30990205  1.04130063
  0.09232758 -0.12644123  0.54831391 -0.55839159 -0.42042343 -0.23904572
 -0.43385741 -0.3915

# Summarizing a dataset

In [7]:
# Example of summarizing a dataset
from math import sqrt
# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]

    del(summaries[-1])
    
    return summaries

In [8]:
summary = summarize_dataset(dataset)
print(summary)

[(1.200241107702872e-18, 1.0006763612541973, 740), (-2.100421938480026e-16, 1.000676361254196, 740), (-9.067821568695198e-16, 1.000676361254196, 740), (8.881784197001253e-17, 1.000676361254198, 740), (2.3029626254048853e-16, 1.0006763612541898, 740), (6.601326092365796e-17, 1.0006763612541985, 740), (-5.911187455436644e-17, 1.0006763612541965, 740), (-3.6607353784937596e-17, 1.0006763612541962, 740), (-4.626929470194571e-16, 1.0006763612541942, 740), (nan, nan, 740), (-8.523587241421177e-16, 1.0006763612541951, 740), (-4.485150989347169e-16, 1.000676361254188, 740), (1.3780268217813598e-15, 1.0006763612541987, 740), (-1.2152441215491577e-16, 1.000676361254194, 740)]


# Summarize by class

In [9]:
# Example of summarizing data by class value
from math import sqrt

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

In [10]:
separate_by_class(dataset)

{-0.5197666083607304: [array([ 1.63271857, -2.28012426,  0.19676258, -0.64394734, -1.3901746 ,
         -1.54437915, -1.12169354,  1.24282479,  2.09286015, -0.89110796,
          0.63868581,  4.18330013, -0.43385741,  1.00943766, -0.51976661]),
  array([ 0.17999535, -2.28012426,  0.77917981, -1.34781887,  1.30990205,
          0.57797073,  1.37379432, -0.35465224, -0.06950458,  1.17648664,
         -0.68520074,  4.18330013, -0.43385741, -0.85860577, -0.51976661]),
  array([ 0.99715216, -2.28012426,  0.77917981, -1.34781887,  1.30990205,
          0.05485632, -0.24490051, -0.81107424, -1.30514157,  1.17648664,
         -0.68520074,  4.18330013, -0.43385741, -0.62510035, -0.51976661]),
  array([-0.63716146, -2.28012426,  0.77917981, -0.64394734,  1.30990205,
          1.01140838,  0.42955567,  0.10176977, -0.53286845,  1.17648664,
         -0.68520074,  4.18330013, -0.43385741,  0.77593223, -0.51976661]),
  array([ 1.63271857, -2.28012426,  0.77917981, -0.64394734,  1.30990205,
         

# Calculating class probabilities

In [11]:
# Example of calculating class probabilities
from math import sqrt
from math import pi
from math import exp

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

In [12]:
# Test calculating class probabilities

summaries = summarize_by_class(dataset)
probabilities = calculate_class_probabilities(summaries, dataset[0])
print(probabilities)



ZeroDivisionError: float division by zero