# Frequent Patterns & Association Rules

In [1]:
%matplotlib inline

import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr

In [2]:
import fim
from fim import apriori

In [3]:
# Load the HR dataset from a CSV file in the working directory and preview
# the first rows.
df = pd.read_csv("hr_COMMA_SEP.csv")
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [5]:
# Work on a copy so the binned columns added in the following cells do not
# alter the raw frame `df`.
df2 = df.copy()

In [6]:
# Discretise satisfaction_level into five left-closed intervals [lo, hi).
# The last edge is 1.01 rather than 1.0 so that a score of exactly 1.0 still
# falls inside the top bin when right=False.
bins_sl = [0, 0.2, 0.4, 0.6, 0.8, 1.01]
satisfaction_levels = [
    'Molto insoddisfatto',
    'Insoddisfatto',
    'Neutrale',
    'Soddisfatto',
    'Molto soddisfatto',
]
df2['satisfaction_level_cat'] = pd.cut(
    df2['satisfaction_level'],
    bins=bins_sl,
    labels=satisfaction_levels,
    right=False,
)

In [7]:
#last_evaluation_levels = ['Molto negativa', 'Negativa', 'Neutrale', 'Positiva', 'Molto positiva'] #5 bins
#bins_le = [0, 0.2, 0.4, 0.6, 0.8, 1.01]
#df2['last_evaluation_cat'] = pd.cut(df2['last_evaluation'], bins=bins_le, 
#                        right=False, labels=last_evaluation_levels)

In [8]:
# Discretise last_evaluation into five left-closed intervals [lo, hi) with
# hand-picked, unequal widths. The first edge (0.36) is the column minimum
# reported by describe(); the last edge is 1.01 so a score of 1.0 is included.
bins_le = [0.36, 0.56, 0.66, 0.76, 0.91, 1.01]
last_evaluation_levels = [
    'Insufficiente',
    'Sufficiente',
    'Discreto',
    'Buono',
    'Ottimo',
]
df2['last_evaluation_cat'] = pd.cut(
    df2['last_evaluation'],
    bins=bins_le,
    labels=last_evaluation_levels,
    right=False,
)

In [9]:
#number_project_labels = ['2/3', '4/5', '6/7'] #3 bins
#bins_np = [2, 4, 6, 8]
#df2['number_project_cat'] = pd.cut(df2['number_project'], bins=bins_np,
#                        right=False, labels=number_project_labels)

In [10]:
#average_monthly_hours_groups = ['96-138', '139-182', '183-225', '226-269', '270-310'] #5 bins
#bins_amh = [96, 139, 183, 226, 270, 311]
#df2['avg_monthly_hours_cat'] = pd.cut(df2['average_montly_hours'], bins=bins_amh,
#                         right=False, labels=average_monthly_hours_groups)

In [11]:
# Discretise average_montly_hours into ten left-closed intervals [lo, hi).
# Each label spells out the inclusive integer range of its bin, i.e.
# "lo-(hi - 1)"; the final edge is 311 so the maximum of 310 hours is included.
bins_amh = [96, 119, 141, 163, 185, 207, 229, 251, 273, 295, 311]
average_monthly_hours_groups = [
    '96-118', '119-140', '141-162', '163-184', '185-206',
    '207-228', '229-250', '251-272', '273-294', '295-310',
]
df2['avg_monthly_hours_cat'] = pd.cut(
    df2['average_montly_hours'],
    bins=bins_amh,
    labels=average_monthly_hours_groups,
    right=False,
)

In [12]:
#time_spend_company_labels = ['2/3/4', '5/6/7', '8/9/10'] #3 bins
#bins_tsc = [2, 5, 8, 11]
#df2['time_spend_company_cat'] = pd.cut(df2['time_spend_company'], bins = bins_tsc,
#                       right=False, labels=time_spend_company_labels) 

In [13]:
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,satisfaction_level_cat,last_evaluation_cat,avg_monthly_hours_cat
0,0.38,0.53,2,157,3,0,1,0,sales,low,Insoddisfatto,Insufficiente,141-162
1,0.8,0.86,5,262,6,0,1,0,sales,medium,Molto soddisfatto,Buono,251-272
2,0.11,0.88,7,272,4,0,1,0,sales,medium,Molto insoddisfatto,Buono,251-272
3,0.72,0.87,5,223,5,0,1,0,sales,low,Soddisfatto,Buono,207-228
4,0.37,0.52,2,159,3,0,1,0,sales,low,Insoddisfatto,Insufficiente,141-162


In [14]:
df2.tail()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,satisfaction_level_cat,last_evaluation_cat,avg_monthly_hours_cat
14994,0.4,0.57,2,151,3,0,1,0,support,low,Neutrale,Sufficiente,141-162
14995,0.37,0.48,2,160,3,0,1,0,support,low,Insoddisfatto,Insufficiente,141-162
14996,0.37,0.53,2,143,3,0,1,0,support,low,Insoddisfatto,Insufficiente,141-162
14997,0.11,0.96,6,280,4,0,1,0,support,low,Molto insoddisfatto,Ottimo,273-294
14998,0.37,0.52,2,158,3,0,1,0,support,low,Insoddisfatto,Insufficiente,141-162


In [15]:
df2.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [16]:
# Keep only the columns used for pattern mining, in presentation order.
# Selecting by name already leaves out the raw numeric columns
# (satisfaction_level, last_evaluation, average_montly_hours), so a separate
# in-place drop is unnecessary. Without it the cell is idempotent: re-running
# it no longer raises KeyError for columns that were already removed.
pattern_columns = [
    'satisfaction_level_cat',
    'last_evaluation_cat',
    'number_project',
    'avg_monthly_hours_cat',
    'time_spend_company',
    'Work_accident',
    'left',
    'promotion_last_5years',
    'sales',
    'salary',
]
df2 = df2[pattern_columns]
df2.head()

Unnamed: 0,satisfaction_level_cat,last_evaluation_cat,number_project,avg_monthly_hours_cat,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,Insoddisfatto,Insufficiente,2,141-162,3,0,1,0,sales,low
1,Molto soddisfatto,Buono,5,251-272,6,0,1,0,sales,medium
2,Molto insoddisfatto,Buono,7,251-272,4,0,1,0,sales,medium
3,Soddisfatto,Buono,5,207-228,5,0,1,0,sales,low
4,Insoddisfatto,Insufficiente,2,141-162,3,0,1,0,sales,low


In [17]:
#df3 = df2.copy()
#df3['satisfaction_level_cat'] = df2['satisfaction_level_cat'].astype(str) + '_SAT'
#df3['last_evaluation_cat'] = df2['last_evaluation_cat'].astype(str) + '_LE'
#df3['number_project_cat'] = df2['number_project_cat'].astype(str) + '_NP'
#df3['avg_monthly_hours_cat'] = df2['avg_monthly_hours_cat'].astype(str) + '_AMH'
#df3['time_spend_company_cat'] = df2['time_spend_company_cat'].astype(str) + '_TSC'
#df3['Work_accident'] = df2['Work_accident'].astype(str) + '_WA'
#df3['left'] = df2['left'].astype(str) + '_L'
#df3['promotion_last_5years'] = df2['promotion_last_5years'].astype(str) + '_P'
#df3['sales'] = df2['sales'].astype(str) + '_D'
#df3['salary'] = df2['salary'].astype(str) + '_SA'

In [18]:
# Turn every cell into a self-describing item by appending a per-column
# suffix, so that equal raw values from different attributes stay distinct
# once rows are flattened into baskets (e.g. '0_WA' vs '0_L' vs '0_P').
item_suffixes = [
    ('satisfaction_level_cat', '_SAT'),
    ('last_evaluation_cat', '_LE'),
    ('number_project', '_NP'),
    ('avg_monthly_hours_cat', '_AMH'),
    ('time_spend_company', '_TSC'),
    ('Work_accident', '_WA'),
    ('left', '_L'),
    ('promotion_last_5years', '_P'),
    ('sales', '_D'),
    ('salary', '_SA'),
]

df3 = df2.copy()
for column_name, suffix in item_suffixes:
    # Always read from df2 so each column is suffixed exactly once.
    df3[column_name] = df2[column_name].astype(str) + suffix

In [19]:
df3.head()

Unnamed: 0,satisfaction_level_cat,last_evaluation_cat,number_project,avg_monthly_hours_cat,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,Insoddisfatto_SAT,Insufficiente_LE,2_NP,141-162_AMH,3_TSC,0_WA,1_L,0_P,sales_D,low_SA
1,Molto soddisfatto_SAT,Buono_LE,5_NP,251-272_AMH,6_TSC,0_WA,1_L,0_P,sales_D,medium_SA
2,Molto insoddisfatto_SAT,Buono_LE,7_NP,251-272_AMH,4_TSC,0_WA,1_L,0_P,sales_D,medium_SA
3,Soddisfatto_SAT,Buono_LE,5_NP,207-228_AMH,5_TSC,0_WA,1_L,0_P,sales_D,low_SA
4,Insoddisfatto_SAT,Insufficiente_LE,2_NP,141-162_AMH,3_TSC,0_WA,1_L,0_P,sales_D,low_SA


In [20]:
df3.describe()

Unnamed: 0,satisfaction_level_cat,last_evaluation_cat,number_project,avg_monthly_hours_cat,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
count,14999,14999,14999,14999,14999,14999,14999,14999,14999,14999
unique,5,5,6,10,8,2,2,2,10,3
top,Soddisfatto_SAT,Buono_LE,4_NP,141-162_AMH,3_TSC,0_WA,0_L,0_P,sales_D,low_SA
freq,4239,3999,4365,2667,6443,12830,11428,14680,4140,7316


In [21]:
# Persist the item-encoded table for reuse outside this notebook.
# NOTE(review): `index` is not passed, so pandas' default applies, which
# presumably writes the row index as an extra first column -- confirm that
# this is wanted by whatever reads the file back.
df3.to_csv('hr_for_patterns.csv', sep=',', header=True)

In [22]:
# One basket (transaction) per employee: the row's ten suffixed items as a
# plain Python list, which is the input passed to apriori below.
hr_baskets_list = [list(record) for record in df3.values]

In [23]:
#hr_baskets_list

In [24]:
len(hr_baskets_list)

14999

In [25]:
#help(apriori)

# Patterns

In [44]:
# Mine frequent item sets with PyFIM's apriori.
# Per the PyFIM documentation: supp=20 is the minimum support as a percentage
# of the baskets, zmin=3 is the minimum number of items per set, and
# target='s' requests all frequent item sets.
# Each result is a pair (tuple of items, absolute support count).
frequent_patterns = apriori(hr_baskets_list, supp=20, zmin=3, target = 's')

In [45]:
len(frequent_patterns)

36

In [47]:
# Order the item sets by absolute support (second field of each pair),
# most frequent first, then display them.
frequent_patterns = sorted(
    frequent_patterns,
    key=lambda itemset: itemset[1],
    reverse=True,
)
frequent_patterns

[(('0_L', '0_WA', '0_P'), 9200),
 (('low_SA', '0_WA', '0_P'), 6228),
 (('3_TSC', '0_WA', '0_P'), 5446),
 (('medium_SA', '0_WA', '0_P'), 5371),
 (('low_SA', '0_L', '0_P'), 5092),
 (('medium_SA', '0_L', '0_P'), 4953),
 (('3_TSC', '0_L', '0_P'), 4738),
 (('medium_SA', '0_L', '0_WA'), 4266),
 (('low_SA', '0_L', '0_WA'), 4199),
 (('low_SA', '0_L', '0_WA', '0_P'), 4162),
 (('medium_SA', '0_L', '0_WA', '0_P'), 4132),
 (('3_TSC', '0_L', '0_WA'), 4036),
 (('3_TSC', '0_L', '0_WA', '0_P'), 3946),
 (('3_NP', '0_L', '0_P'), 3888),
 (('4_NP', '0_L', '0_P'), 3843),
 (('Soddisfatto_SAT', '0_L', '0_P'), 3750),
 (('4_NP', '0_WA', '0_P'), 3590),
 (('Molto soddisfatto_SAT', '0_L', '0_P'), 3549),
 (('Molto soddisfatto_SAT', '0_WA', '0_P'), 3483),
 (('sales_D', '0_WA', '0_P'), 3482),
 (('Soddisfatto_SAT', '0_WA', '0_P'), 3459),
 (('Buono_LE', '0_WA', '0_P'), 3389),
 (('1_L', '0_WA', '0_P'), 3387),
 (('4_NP', '0_L', '0_WA'), 3300),
 (('3_NP', '0_L', '0_WA'), 3274),
 (('3_NP', '0_WA', '0_P'), 3271),
 (('4_NP'

In [48]:
# Same thresholds as the frequent item sets (supp=20 percent, at least 3
# items), but target='c' restricts the result to closed item sets
# (per the PyFIM documentation).
closed_patterns = apriori(hr_baskets_list, supp=20, zmin=3, target = 'c')

In [49]:
len(closed_patterns)

36

In [51]:
# Order the closed item sets by absolute support (second field of each
# pair), most frequent first, then display them.
closed_patterns = sorted(
    closed_patterns,
    key=lambda itemset: itemset[1],
    reverse=True,
)
closed_patterns

[(('0_L', '0_WA', '0_P'), 9200),
 (('low_SA', '0_WA', '0_P'), 6228),
 (('3_TSC', '0_WA', '0_P'), 5446),
 (('medium_SA', '0_WA', '0_P'), 5371),
 (('low_SA', '0_L', '0_P'), 5092),
 (('medium_SA', '0_L', '0_P'), 4953),
 (('3_TSC', '0_L', '0_P'), 4738),
 (('medium_SA', '0_L', '0_WA'), 4266),
 (('low_SA', '0_L', '0_WA'), 4199),
 (('low_SA', '0_L', '0_WA', '0_P'), 4162),
 (('medium_SA', '0_L', '0_WA', '0_P'), 4132),
 (('3_TSC', '0_L', '0_WA'), 4036),
 (('3_TSC', '0_L', '0_WA', '0_P'), 3946),
 (('3_NP', '0_L', '0_P'), 3888),
 (('4_NP', '0_L', '0_P'), 3843),
 (('Soddisfatto_SAT', '0_L', '0_P'), 3750),
 (('4_NP', '0_WA', '0_P'), 3590),
 (('Molto soddisfatto_SAT', '0_L', '0_P'), 3549),
 (('Molto soddisfatto_SAT', '0_WA', '0_P'), 3483),
 (('sales_D', '0_WA', '0_P'), 3482),
 (('Soddisfatto_SAT', '0_WA', '0_P'), 3459),
 (('Buono_LE', '0_WA', '0_P'), 3389),
 (('1_L', '0_WA', '0_P'), 3387),
 (('4_NP', '0_L', '0_WA'), 3300),
 (('3_NP', '0_L', '0_WA'), 3274),
 (('3_NP', '0_WA', '0_P'), 3271),
 (('4_NP'

In [55]:
# Same thresholds again (supp=20 percent, at least 3 items), but target='m'
# restricts the result to maximal item sets (per the PyFIM documentation).
maximal_patterns = apriori(hr_baskets_list, supp=20, zmin=3, target = 'm')

In [56]:
len(maximal_patterns)

17

In [57]:
# Inspect a single result to see its layout: a pair of
# (tuple of items, absolute support count).
max_pat = maximal_patterns[0]
max_pat

(('2_TSC', '0_L', '0_P'), 3137)

In [58]:
max_pat[1]

3137

In [59]:
maximal_patterns

[(('2_TSC', '0_L', '0_P'), 3137),
 (('1_L', '0_WA', '0_P'), 3387),
 (('Insufficiente_LE', '0_WA', '0_P'), 3080),
 (('Neutrale_SAT', '0_WA', '0_P'), 3066),
 (('Buono_LE', '0_WA', '0_P'), 3389),
 (('3_NP', '0_L', '0_WA', '0_P'), 3200),
 (('sales_D', '0_L', '0_P'), 3033),
 (('sales_D', '0_WA', '0_P'), 3482),
 (('Molto soddisfatto_SAT', '0_L', '0_WA'), 3021),
 (('Molto soddisfatto_SAT', '0_L', '0_P'), 3549),
 (('Molto soddisfatto_SAT', '0_WA', '0_P'), 3483),
 (('Soddisfatto_SAT', '0_L', '0_WA', '0_P'), 3077),
 (('4_NP', '0_L', '0_WA', '0_P'), 3211),
 (('3_TSC', 'low_SA', '0_P'), 3173),
 (('3_TSC', '0_L', '0_WA', '0_P'), 3946),
 (('medium_SA', '0_L', '0_WA', '0_P'), 4132),
 (('low_SA', '0_L', '0_WA', '0_P'), 4162)]

In [60]:
# Order the maximal item sets by absolute support (second field of each
# pair), most frequent first, then display them.
maximal_patterns = sorted(
    maximal_patterns,
    key=lambda itemset: itemset[1],
    reverse=True,
)
maximal_patterns

[(('low_SA', '0_L', '0_WA', '0_P'), 4162),
 (('medium_SA', '0_L', '0_WA', '0_P'), 4132),
 (('3_TSC', '0_L', '0_WA', '0_P'), 3946),
 (('Molto soddisfatto_SAT', '0_L', '0_P'), 3549),
 (('Molto soddisfatto_SAT', '0_WA', '0_P'), 3483),
 (('sales_D', '0_WA', '0_P'), 3482),
 (('Buono_LE', '0_WA', '0_P'), 3389),
 (('1_L', '0_WA', '0_P'), 3387),
 (('4_NP', '0_L', '0_WA', '0_P'), 3211),
 (('3_NP', '0_L', '0_WA', '0_P'), 3200),
 (('3_TSC', 'low_SA', '0_P'), 3173),
 (('2_TSC', '0_L', '0_P'), 3137),
 (('Insufficiente_LE', '0_WA', '0_P'), 3080),
 (('Soddisfatto_SAT', '0_L', '0_WA', '0_P'), 3077),
 (('Neutrale_SAT', '0_WA', '0_P'), 3066),
 (('sales_D', '0_L', '0_P'), 3033),
 (('Molto soddisfatto_SAT', '0_L', '0_WA'), 3021)]

# Rules

In [73]:
# Mine association rules (target='r') with at least 2 items, a minimum
# support of 10 percent and a minimum confidence of 80 percent.
# report='ascl' appends four measures to each rule, so a rule is the tuple
#   (head item, body items tuple, absolute support, relative support,
#    confidence, lift)
# i.e. rule[0] is the consequent and rule[5] is the lift.
# NOTE(review): the displayed rules include relative supports below 0.10, so
# the supp threshold appears to be applied to the rule body rather than to
# body + head -- confirm against the PyFIM documentation.
rules = apriori(hr_baskets_list, supp=10, zmin=2, target='r', conf=80, report='ascl')

In [74]:
# Look at one rule to see the tuple layout:
# (head, body, absolute support, relative support, confidence, lift).
first_rule = rules[0]
first_rule

('0_WA',
 ('Insoddisfatto_SAT',),
 1337,
 0.08913927595173012,
 0.8877822045152722,
 1.0378679100175034)

In [75]:
len(rules)

484

In [76]:
# Split the rules by their head (rule[0]): rules that predict leaving
# ('1_L') versus rules that predict staying ('0_L'). Rules with any other
# head are ignored.
rules_left = [rule for rule in rules if rule[0] == '1_L']
rules_not_left = [rule for rule in rules if rule[0] == '0_L']
len(rules_left), len(rules_not_left)

(6, 83)

In [79]:
# Keep only rules whose lift (rule[5]) is strictly greater than min_lift.
min_lift = 1.0

interesting_left_rules = [
    rule for rule in rules_left if rule[5] > min_lift
]
interesting_not_left_rules = [
    rule for rule in rules_not_left if rule[5] > min_lift
]

len(interesting_left_rules), len(interesting_not_left_rules)

(6, 83)

In [81]:
# Show the "left" rules ordered by lift (rule[5]), strongest first.
interesting_left_rules.sort(key=lambda rule: rule[5], reverse=True)
for int_rule in interesting_left_rules:
    # Single-argument print(...) produces the same text under Python 2 and
    # Python 3, whereas the bare `print x` statement is a SyntaxError on 3.
    print(int_rule)

('1_L', ('2_NP', '3_TSC', '0_WA', '0_P'), 1446, 0.09640642709513968, 0.8431486880466472, 3.5414133777685977)
('1_L', ('2_NP', 'Insufficiente_LE', '0_P'), 1292, 0.0861390759383959, 0.8427919112850619, 3.5399148354423535)
('1_L', ('2_NP', '3_TSC', '0_WA'), 1455, 0.09700646709780653, 0.842501447596989, 3.5386948228807724)
('1_L', ('2_NP', 'Insufficiente_LE'), 1301, 0.08673911594106273, 0.8350449293966624, 3.5073757759788684)
('1_L', ('2_NP', '3_TSC', '0_P'), 1516, 0.10107340489365958, 0.8243610657966286, 3.4625011553860636)
('1_L', ('2_NP', '3_TSC'), 1528, 0.1018734582305487, 0.8241639697950378, 3.461673308024579)


In [82]:
# Show the "not left" rules ordered by absolute support (rule[2]), largest
# first. Note this differs from the "left" rules, which are ordered by lift.
interesting_not_left_rules.sort(key=lambda rule: rule[2], reverse=True)
for int_rule in interesting_not_left_rules:
    # Single-argument print(...) produces the same text under Python 2 and
    # Python 3, whereas the bare `print x` statement is a SyntaxError on 3.
    print(int_rule)

('0_L', ('3_NP',), 3983, 0.2655510367357824, 0.9822441430332922, 1.2891739500661838)
('0_L', ('4_NP',), 3956, 0.26375091672778184, 0.9063001145475372, 1.1894990740373215)
('0_L', ('3_NP', '0_P'), 3888, 0.2592172811520768, 0.9818181818181818, 1.288614885289719)
('0_L', ('4_NP', '0_P'), 3843, 0.2562170811387426, 0.9046610169491526, 1.1873477942964945)
('0_L', ('Soddisfatto_SAT',), 3841, 0.2560837389159277, 0.9061099315876386, 1.189249463062915)
('0_L', ('Soddisfatto_SAT', '0_P'), 3750, 0.25001666777785186, 0.9047044632086851, 1.1874048165616966)
('0_L', ('Molto soddisfatto_SAT',), 3647, 0.24314954330288685, 0.8633996212121212, 1.1331931150298045)
('0_L', ('Molto soddisfatto_SAT', '0_P'), 3549, 0.236615774384959, 0.8603636363636363, 1.1292084513316576)
('0_L', ('4_NP', '0_WA'), 3300, 0.22001466764450964, 0.8962520369364476, 1.1763111919854548)
('0_L', ('3_NP', '0_WA'), 3274, 0.21828121874791653, 0.9787742899850523, 1.2846198438471998)
('0_L', ('4_NP', '0_WA', '0_P'), 3211, 0.2140809387292

In [79]:
# Pick the first "left" rule for a closer look; which rule this is depends on
# the current order of interesting_left_rules.
first_int_rule = interesting_left_rules[0]
first_int_rule

('1_L',
 ('2_NP', '3_TSC', '0_WA', '0_P'),
 1446,
 0.09640642709513968,
 0.8431486880466472,
 3.5414133777685977)

In [80]:
hr_baskets_list[0]

['Insoddisfatto_SAT',
 'Insufficiente_LE',
 '2_NP',
 '141-162_AMH',
 '3_TSC',
 '0_WA',
 '1_L',
 '0_P',
 'sales_D',
 'low_SA']

In [84]:
from __future__ import division

In [85]:
# Manually verify the rule ('2_NP', '3_TSC') -> '1_L' by counting baskets.
antecedent = ('2_NP', '3_TSC')

counter = 0      # baskets containing every antecedent item
c_left = 0       # ... of which also contain '1_L'
c_not_left = 0   # ... of which also contain '0_L'
for row in hr_baskets_list:
    if all(item in row for item in antecedent):
        counter += 1
        if '1_L' in row:
            c_left += 1
        if '0_L' in row:
            c_not_left += 1

# float(...) makes this a true division on its own, so the result does not
# depend on `from __future__ import division` having been run in another
# cell (under Python 2 integer division it would silently be 0).
# If no basket matches the antecedent, report NaN instead of raising
# ZeroDivisionError.
accuracy = (float(c_left) / counter) * 100 if counter else float('nan')

# '%s' formatting with a single print(...) argument yields the same text
# under Python 2 as the original print statements, and is valid Python 3.
print('counter %s' % counter)
print('c_left %s' % c_left)
print('c_not_left %s' % c_not_left)
print('accuracy %s' % accuracy)

counter 1854
c_left 1528
c_not_left 326
accuracy 82.4163969795


In [83]:
# Manually verify the rule ('3_NP',) -> '0_L' by counting baskets.
antecedent = ('3_NP',)

counter = 0      # baskets containing every antecedent item
c_left = 0       # ... of which also contain '1_L'
c_not_left = 0   # ... of which also contain '0_L'
for row in hr_baskets_list:
    if all(item in row for item in antecedent):
        counter += 1
        if '1_L' in row:
            c_left += 1
        if '0_L' in row:
            c_not_left += 1

# float(...) makes this a true division on its own, so the result does not
# depend on `from __future__ import division` having been run in another
# cell (under Python 2 integer division it would silently be 0).
# If no basket matches the antecedent, report NaN instead of raising
# ZeroDivisionError.
accuracy = (float(c_not_left) / counter) * 100 if counter else float('nan')

# '%s' formatting with a single print(...) argument yields the same text
# under Python 2 as the original print statements, and is valid Python 3.
print('counter %s' % counter)
print('c_left %s' % c_left)
print('c_not_left %s' % c_not_left)
print('accuracy %s' % accuracy)

counter 4055
c_left 72
c_not_left 3983
accuracy 98.2244143033
