In [1]:
#imports
%matplotlib inline

import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import subprocess
from collections import defaultdict
from scipy.stats.stats import pearsonr

import fim
from fim import apriori

# Global matplotlib text settings shared by every figure in the notebook.
plt.rcParams.update({
    "font.family": 'serif',
    "font.size": '14',
})

In [34]:
# Load the raw HR table; the relative path is resolved against the current
# working directory of the kernel.
df = pd.read_csv('HR.csv')

In [35]:
#discretize and rename the values in the DB
#
#Every source column is replaced by a short-labelled column: the new column is
#appended at the end of the frame and the source column is dropped, so the
#columns end up in the order in which they are processed below.

def _bin_column(frame, source, target, bins, labels):
    """Replace column `source` of `frame` by an equal-width binned column.

    Parameters
    ----------
    frame : pandas.DataFrame containing the column `source`.
    source : name of the column to discretize; it is dropped from the result.
    target : name of the new categorical column, appended as the last column.
    bins : number of equal-width bins passed to `pd.cut`.
    labels : one label per bin, lowest bin first.

    Returns
    -------
    (new_frame, binned) : the updated frame and the categorical Series that
    was stored under `target`.
    """
    binned = pd.cut(frame[source], bins, labels = labels)
    new_frame = frame.drop(labels = source, axis = 1)
    new_frame[target] = binned
    return new_frame, binned

#create a list of indexes (0 .. n_rows-1); sized from the frame itself instead
#of a hard-coded 14999 so the cell keeps working if HR.csv changes length
index = range(len(df))
df = df.assign(indexes = index)

#satisfaction level
df, SL = _bin_column(df, 'satisfaction_level', 'SL', 5,
                     ['SL1', 'SL2', 'SL3', 'SL4', 'SL5'])

#last evaluation
df, LE = _bin_column(df, 'last_evaluation', 'LE', 5,
                     ['LE1', 'LE2', 'LE3', 'LE4', 'LE5'])

#number project
#NOTE(review): the label NPk matches a project count of k only if the column
#spans exactly 2..7 (six equal-width bins) - confirm against the data
df, NP = _bin_column(df, 'number_project', 'NP', 6,
                     ['NP2', 'NP3', 'NP4', 'NP5', 'NP6', 'NP7'])

#average monthly hours
df, AMH = _bin_column(df, 'average_montly_hours', 'AMH', 5,
                      ['AMH1', 'AMH2', 'AMH3', 'AMH4', 'AMH5'])

#time spent company
#NOTE(review): as for NP, TSCk matches k years only if the column spans
#exactly 2..10 (nine equal-width bins) - confirm against the data
df, TSC = _bin_column(df, 'time_spend_company', 'TSC', 9,
                      ['TSC2', 'TSC3', 'TSC4', 'TSC5', 'TSC6', 'TSC7', 'TSC8', 'TSC9', 'TSC10'])

#work accident (two bins: lower -> WA0, upper -> WA1)
df, WA = _bin_column(df, 'Work_accident', 'WA', 2, ['WA0', 'WA1'])

#left (two bins: lower -> L0, upper -> L1)
df, L = _bin_column(df, 'left', 'L', 2, ['L0', 'L1'])

#promotion last 5 years (two bins: lower -> PL5Y0, upper -> PL5Y1)
df, PL5Y = _bin_column(df, 'promotion_last_5years', 'PL5Y', 2, ['PL5Y0', 'PL5Y1'])

# department: values are kept as they are, the column is only renamed and
# moved to the end of the frame
DEP = df['sales']
df['DEP'] = DEP
df = df.drop(labels = 'sales', axis = 1)

#salary: values are kept as they are, the column is only renamed and moved
#to the end of the frame
S = df['salary']
df['S'] = S
df = df.drop(labels = 'salary', axis = 1)

#create new csv file with new DB
#header=False leaves out the column names; the row index is still written
#(pandas default), so every line starts with the row label followed by the
#value of the 'indexes' column
df.to_csv('HR_association.csv', sep=',', header=False)

In [9]:
#transform dataframe into a transaction like dictionary: index -> list of items
#
#The frame is walked once. Filtering the whole frame for every single row
#(`df[df['indexes'] == i]`) is quadratic in the number of rows and, through the
#label lookup `item[i]`, only works while the frame has its default 0..n-1 index.

#columns that contribute one item each, in the order the items are appended
item_columns = ['SL', 'LE', 'NP', 'AMH', 'TSC', 'WA', 'L', 'PL5Y', 'DEP', 'S']

baskets = defaultdict(list)

#itertuples(index=False, name=None) yields one plain tuple of cell values per
#row; the key is the row position, which assumes the 'indexes' column numbers
#the rows 0..n-1 in frame order
for i, items in enumerate(df[item_columns].itertuples(index = False, name = None)):
    baskets[i].extend(items)

In [10]:
# Keep only the item lists (one per transaction, in dictionary insertion
# order); the row keys are no longer needed for mining.
baskets_lists = list(baskets.values())

In [37]:
# Original label: "top 10 frequent item sets (which coincide with the closed ones)".
# NOTE(review): with supp=3 the listing below holds many more than 10 entries -
# confirm that the "top 10" label is still accurate.
#
# Parameter cheat sheet (full text: help(fim.apriori)):
#   target  s or a = all, c = closed, m = maximal, r = rules
#   supp    min support, default 10
#   conf    minimum confidence of an assoc. rule (default: 80%) - not passed here
# With report='as' each result is (items, absolute support, relative support),
# as can be seen in the output of this cell.
itemsets = apriori(
    baskets_lists,
    target='c',
    supp=3,
    zmin=2,
    report='as',
)
itemsets

[(('management', 'L0'), 539, 0.03593572904860324),
 (('management', 'L0', 'PL5Y0'), 473, 0.03153543569571305),
 (('management', 'WA0'), 527, 0.035135675711714116),
 (('management', 'WA0', 'PL5Y0'), 475, 0.0316687779185279),
 (('management', 'PL5Y0'), 561, 0.03740249349956664),
 (('TSC6', 'L0'), 509, 0.033935595706380425),
 (('TSC6', 'L0', 'PL5Y0'), 492, 0.032802186812454164),
 (('TSC6', 'WA0'), 611, 0.040736049069937996),
 (('TSC6', 'WA0', 'PL5Y0'), 603, 0.040202680178678576),
 (('TSC6', 'PL5Y0'), 701, 0.04673644909660644),
 (('hr', 'L0'), 524, 0.034935662377491836),
 (('hr', 'L0', 'PL5Y0'), 509, 0.033935595706380425),
 (('hr', 'WA0'), 650, 0.04333622241482765),
 (('hr', 'WA0', 'PL5Y0'), 640, 0.04266951130075338),
 (('hr', 'PL5Y0'), 724, 0.04826988465897727),
 (('accounting', 'L0'), 563, 0.03753583572238149),
 (('accounting', 'L0', 'WA0'), 476, 0.03173544902993533),
 (('accounting', 'L0', 'WA0', 'PL5Y0'), 463, 0.030868724581638776),
 (('accounting', 'L0', 'PL5Y0'), 549, 0.0366024401626

In [295]:
# Original label: "maximal item sets among the top 10".
#
# Parameter cheat sheet (full text: help(fim.apriori)):
#   target  s or a = all, c = closed, m = maximal, r = rules
#   supp    min support, default 10
#   conf    minimum confidence of an assoc. rule (default: 80%) - not passed here
# With report='as' each result is (items, absolute support, relative support),
# as can be seen in the output of this cell.
itemsets = apriori(
    baskets_lists,
    target='m',
    supp=36.9,
    zmin=2,
    report='as',
)
itemsets

[(('TSC3', 'WA0'), 5548, 0.3698913260884059),
 (('TSC3', 'PL5Y0'), 6309, 0.42062804186945796),
 (('medium', 'PL5Y0'), 6265, 0.4176945129675312),
 (('low', 'WA0', 'PL5Y0'), 6228, 0.4152276818454564),
 (('L0', 'WA0', 'PL5Y0'), 9200, 0.6133742249483299)]

In [349]:
# Original label: "top 20 frequent item sets (which coincide with the closed ones)".
# zmin=1 lets single-item sets into the result (see e.g. ('NP4',) in the output).
#
# Parameter cheat sheet (full text: help(fim.apriori)):
#   target  s or a = all, c = closed, m = maximal, r = rules
#   supp    min support, default 10
#   conf    minimum confidence of an assoc. rule (default: 80%) - not passed here
# With report='s' each result is (items, relative support), as can be seen in
# the output of this cell.
itemsets = apriori(
    baskets_lists,
    target='c',
    supp=29,
    zmin=1,
    report='s',
)
itemsets

[(('NP4',), 0.2910194012934196),
 (('AMH2',), 0.297019801320088),
 (('AMH2', 'PL5Y0'), 0.29081938795919726),
 (('TSC3',), 0.4295619707980532),
 (('TSC3', 'L0'), 0.32382158810587375),
 (('TSC3', 'L0', 'PL5Y0'), 0.31588772584838987),
 (('TSC3', 'WA0'), 0.3698913260884059),
 (('TSC3', 'WA0', 'PL5Y0'), 0.3630908727248483),
 (('TSC3', 'PL5Y0'), 0.42062804186945796),
 (('medium',), 0.4297619841322755),
 (('medium', 'L0'), 0.34195613040869394),
 (('medium', 'L0', 'PL5Y0'), 0.33022201480098673),
 (('medium', 'WA0'), 0.36729115274351626),
 (('medium', 'WA0', 'PL5Y0'), 0.3580905393692913),
 (('medium', 'PL5Y0'), 0.4176945129675312),
 (('low',), 0.48776585105673714),
 (('low', 'L0'), 0.34295619707980535),
 (('low', 'L0', 'PL5Y0'), 0.3394892992866191),
 (('low', 'WA0'), 0.4184278951930129),
 (('low', 'WA0', 'PL5Y0'), 0.4152276818454564),
 (('low', 'PL5Y0'), 0.4833655577038469),
 (('L0',), 0.7619174611640777),
 (('L0', 'WA0'), 0.6285752383492232),
 (('L0', 'WA0', 'PL5Y0'), 0.6133742249483299),
 (('

In [355]:
# Original label: "maximal item sets among the top 20".
#
# Parameter cheat sheet (full text: help(fim.apriori)):
#   target  s or a = all, c = closed, m = maximal, r = rules
#   supp    min support, default 10
#   conf    minimum confidence of an assoc. rule (default: 80%) - not passed here
# With report='s' each result is (items, relative support), as can be seen in
# the output of this cell.
itemsets = apriori(
    baskets_lists,
    target='m',
    supp=29,
    zmin=2,
    report='s',
)
itemsets

[(('AMH2', 'PL5Y0'), 0.29081938795919726),
 (('TSC3', 'L0', 'PL5Y0'), 0.31588772584838987),
 (('TSC3', 'WA0', 'PL5Y0'), 0.3630908727248483),
 (('medium', 'L0', 'PL5Y0'), 0.33022201480098673),
 (('medium', 'WA0', 'PL5Y0'), 0.3580905393692913),
 (('low', 'L0', 'PL5Y0'), 0.3394892992866191),
 (('low', 'WA0', 'PL5Y0'), 0.4152276818454564),
 (('L0', 'WA0', 'PL5Y0'), 0.6133742249483299)]

In [393]:
# Original label: "top 700 frequent item sets (which coincide with the closed ones)".
#
# Parameter cheat sheet (full text: help(fim.apriori)):
#   target  s or a = all, c = closed, m = maximal, r = rules
#   supp    min support, default 10
#   conf    minimum confidence of an assoc. rule (default: 80%) - not passed here
# With report='s' each result is (items, relative support), as can be seen in
# the output of this cell.
itemsets = apriori(
    baskets_lists,
    target='c',
    supp=7,
    zmin=2,
    report='s',
)
itemsets

[(('NP6', 'PL5Y0'), 0.07693846256417095),
 (('IT', 'WA0'), 0.07087139142609507),
 (('IT', 'WA0', 'PL5Y0'), 0.0706713780918728),
 (('IT', 'PL5Y0'), 0.08160544036269085),
 (('LE1', 'WA0'), 0.07073804920328022),
 (('LE1', 'PL5Y0'), 0.08007200480032002),
 (('high', 'L0'), 0.07700513367557837),
 (('high', 'L0', 'PL5Y0'), 0.07220481365424361),
 (('high', 'PL5Y0'), 0.07767184478965264),
 (('AMH5', 'WA0'), 0.08600573371558104),
 (('AMH5', 'WA0', 'PL5Y0'), 0.08527235149009935),
 (('AMH5', 'PL5Y0'), 0.09553970264684312),
 (('TSC5', 'WA0'), 0.08680578705247016),
 (('TSC5', 'WA0', 'PL5Y0'), 0.08607240482698847),
 (('TSC5', 'PL5Y0'), 0.09707313820921394),
 (('AMH1', 'WA0'), 0.09380625375025002),
 (('AMH1', 'WA0', 'PL5Y0'), 0.09187279151943463),
 (('AMH1', 'PL5Y0'), 0.10487365824388292),
 (('SL1', 'WA0'), 0.108007200480032),
 (('SL1', 'WA0', 'PL5Y0'), 0.10687379158610574),
 (('SL1', 'PL5Y0'), 0.12080805387025802),
 (('WA1', 'L0'), 0.13334222281485433),
 (('WA1', 'L0', 'PL5Y0'), 0.12854190279351957),

In [394]:
# Original label: "maximal item sets among the top 700".
#
# Parameter cheat sheet (full text: help(fim.apriori)):
#   target  s or a = all, c = closed, m = maximal, r = rules
#   supp    min support, default 10
#   conf    minimum confidence of an assoc. rule (default: 80%) - not passed here
# With report='s' each result is (items, relative support), as can be seen in
# the output of this cell.
itemsets = apriori(
    baskets_lists,
    target='m',
    supp=7,
    zmin=2,
    report='s',
)
itemsets

[(('NP6', 'PL5Y0'), 0.07693846256417095),
 (('IT', 'WA0', 'PL5Y0'), 0.0706713780918728),
 (('LE1', 'WA0'), 0.07073804920328022),
 (('LE1', 'PL5Y0'), 0.08007200480032002),
 (('high', 'L0', 'PL5Y0'), 0.07220481365424361),
 (('AMH5', 'WA0', 'PL5Y0'), 0.08527235149009935),
 (('TSC5', 'WA0', 'PL5Y0'), 0.08607240482698847),
 (('AMH1', 'WA0', 'PL5Y0'), 0.09187279151943463),
 (('SL1', 'WA0', 'PL5Y0'), 0.10687379158610574),
 (('WA1', 'L0', 'PL5Y0'), 0.12854190279351957),
 (('support', 'low', 'PL5Y0'), 0.07593839589305953),
 (('support', 'L0', 'WA0', 'PL5Y0'), 0.0898059870658044),
 (('SL2', 'NP2', 'L1', 'TSC3', 'WA0', 'PL5Y0'), 0.0915394359623975),
 (('SL2', 'NP2', 'LE2'), 0.07027135142342823),
 (('SL2', 'LE2', 'TSC3', 'PL5Y0'), 0.07087139142609507),
 (('SL2', 'LE2', 'WA0', 'PL5Y0'), 0.07360490699379958),
 (('SL2', 'AMH2', 'WA0', 'PL5Y0'), 0.0716714447629842),
 (('SL2', 'low', 'WA0', 'PL5Y0'), 0.07687179145276352),
 (('NP2', 'L1', 'LE2', 'TSC3', 'PL5Y0'), 0.07013800920061337),
 (('NP2', 'LE2', '

In [313]:
# Collect the second field of every mined entry, in the order apriori returned
# them (deliberately left unsorted).
# NOTE(review): field 1 is the relative support only when `itemsets` came from
# a cell that used report='s'; with report='as' it is the absolute count.
support_list = [entry[1] for entry in itemsets]

In [314]:
# Display the collected support values (the whole list is printed).
support_list

[0.29081938795919726,
 0.32382158810587375,
 0.31588772584838987,
 0.3698913260884059,
 0.3630908727248483,
 0.42062804186945796,
 0.34195613040869394,
 0.33022201480098673,
 0.36729115274351626,
 0.3580905393692913,
 0.4176945129675312,
 0.34295619707980535,
 0.3394892992866191,
 0.4184278951930129,
 0.4152276818454564,
 0.4833655577038469,
 0.6285752383492232,
 0.6133742249483299,
 0.7419161277418495,
 0.8391892792852856]

In [401]:
# Number of mined association rules.
# NOTE(review): `rules` is not assigned in any cell visible here; it presumably
# comes from an apriori(..., target='r', ...) call - confirm the defining cell
# still exists, otherwise this fails on a fresh kernel.
len(rules)

1176

In [402]:
# Gather field 5 of every rule and order the values ascending.
# NOTE(review): the name suggests field 5 is the lift, which depends on the
# report string used when `rules` was mined - confirm against that cell.
lift_list = sorted(rule[5] for rule in rules)

In [398]:
# Display the sorted values (smallest first; the whole list is printed).
lift_list

[0.9580379702041827,
 0.9622600935285552,
 0.975129886871827,
 0.9787401928375709,
 0.9797413310440073,
 0.980158942306271,
 0.9824063937946045,
 0.9825216353694466,
 0.9826798222463158,
 0.9836059823498312,
 0.9844840722488482,
 0.9846246072724831,
 0.9846283089522476,
 0.9848170189212633,
 0.9849479564032697,
 0.9850329396372777,
 0.985361649183319,
 0.985523728330574,
 0.9859131333138944,
 0.9859296465051889,
 0.9859656224868258,
 0.9863190768432664,
 0.9865974690426966,
 0.9866698975691466,
 0.9873863714423099,
 0.9875368824429801,
 0.9877283112659077,
 0.987920508460211,
 0.9883286727079824,
 0.9883726676968393,
 0.9885618674753239,
 0.9888443660843084,
 0.989098690202919,
 0.9891910654471615,
 0.9892282227487098,
 0.9893286518310281,
 0.9893468157255606,
 0.9894069890167189,
 0.9894361487111735,
 0.9895348528888137,
 0.989636515071965,
 0.9896704967454862,
 0.9896899239525677,
 0.9898615324786368,
 0.9898818719598558,
 0.9899472366882639,
 0.9899591280653951,
 0.9900486872399299,

In [399]:
# Print the built-in documentation of fim.apriori; the parameter notes in the
# mining cells (target / supp / conf) are excerpts of this text.
help(fim.apriori)

Help on built-in function apriori in module fim:

apriori(...)
    apriori (tracts, target='s', supp=10, zmin=1, zmax=None, report='a',
             eval='x', agg='x', thresh=10, prune=None, algo='b', mode='',
             border=None)
    Find frequent item sets with the Apriori algorithm.
    tracts  transaction database to mine (mandatory)
            The database must be an iterable of transactions;
            each transaction must be an iterable of items;
            each item must be a hashable object.
            If the database is a dictionary, the transactions are
            the keys, the values their (integer) multiplicities.
    target  type of frequent item sets to find     (default: s)
            s/a   sets/all   all     frequent item sets
            c     closed     closed  frequent item sets
            m     maximal    maximal frequent item sets
            g     gens       generators
            r     rules      association rules
    supp    minimum support of an i