In [8]:
%matplotlib inline

import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr

In [9]:
import subprocess

In [10]:
def call_apriori(fileinput, fileoutput, delimiter=',', target_type = 's',
                min_nbr_items = 1, min_sup = 2, min_conf = 2):
    # apriori
    # -t# {m: maximal, c: closed, s: frequent, r: association rules}
    # -m# minimum number of items per item set/association rule
    # -s# minimum support of an item set, positive: percentage, negative: absolute
    # -c# minimum confidence rule percentage
    # -b# line delimiter (,)
    # The default additional information output format for rules is " (%X, %C)"
    # %X relative body set support as a percentage
    # %C rule confidence as a percentage
    # %L lift

    if target_type == 'r':
        call_cmd = ['apriori.exe', '-b%s' % delimiter, '-t%s' % target_type, '-m%s' % min_nbr_items, '-s%s' % min_sup, '-c%s' % min_conf, '-v (%X, %C, %L)', fileinput, fileoutput] 
                    
                   
    else:
        call_cmd = ['apriori.exe', '-b%s' % delimiter, '-t%s' % target_type, '-m%s' % min_nbr_items, '-s%s' % min_sup, fileinput, fileoutput]
                           
    
    ret = subprocess.call(call_cmd,  stdout=open('apriori_stdout.txt', 'w'), stderr=open('apriori_stderr.txt', 'w'),shell = True)
    
    return ret

In [11]:
import re
def read_rules(filename):
    data = open(filename, 'r')
    rules = list()
    # Creating the new text file without the break spaces inside a row
    with open('rules_cleaned.txt', 'w') as file:
        file.write(data.read().replace('\n ',' '))
    data_new = open('rules_cleaned.txt', 'r')
    for row in data_new:
        fileds = row.split(' <- ')
        cons = fileds[0]
        other = fileds[1].split(' (')
        ant = other[0].split(' ')
        other2 = other[1].split(', ')
        sup = float(other2[0])
        conf = float(other2[1])
        lift = float(other2[2].replace(')', ''))
        rule = {
            'ant': ant,
            'cons': cons,
            'sup': sup,
            'conf': conf,
            'lift': lift
        }
        rules.append(rule)
    data.close()
    data_new.close()
    return rules

In [12]:
#Function for reading frequent,maximal and closed patterns
def read_pattern(filename):
    data = open(filename, 'r')
    patterns = list()
    with open('pattern_cleaned.txt', 'w') as file:
         file.write(data.read().replace('\n ',' '))
    data_new = open('pattern_cleaned.txt', 'r')

    for row in data_new :
        fileds = row.split(' ')
        n=len(fileds)
        support = fileds[-1]
        support = support.replace('(','')
        support = support.replace(')','')
        support = support.replace('\n','')
        support = float(support)
        pattern = {
                    'support':support,
                    'pattern' :fileds[:(n-1)]
                }
        patterns.append(pattern)
    data.close()
    data_new.close()
    return patterns

In [13]:
df = pd.read_csv("HR_comma_sep.csv")

In [14]:
min(df["average_montly_hours"])

96

In [15]:
max(df["average_montly_hours"])

310

In [16]:
#Mappatura dei valori continui di Satisfaction Level in intervalli discreti di 0.1 tra 0.0 e 1.0
df["satisfactionGroups"] = pd.cut(df["satisfaction_level"], bins = [x * 0.1 for x in range(0, 12)],right=False, labels=[x * 0.1 for x in range(0, 11)])

In [17]:
#Mappatura dei valori continui di Satisfaction Level in intervalli discreti di 10 tra 90(il valore minimo e' 96) e 310(il valore massimo e'310)
df["A.M.H.groups"] = pd.cut(df["average_montly_hours"],bins=range(90,330,10),right = False,labels=range(90,320,10))

In [18]:
#Mappatura dei valori continui di Last Evaluation.(Come Satisfaction Level)

df["L.E.groups"] = pd.cut(df["last_evaluation"], bins = [x * 0.1 for x in range(0, 12)],right=False, labels=[x * 0.1 for x in range(0, 11)])

In [19]:
df.drop(['satisfaction_level','last_evaluation','average_montly_hours'],axis = 1, inplace=True)

In [20]:
df.head()

Unnamed: 0,number_project,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,satisfactionGroups,A.M.H.groups,L.E.groups
0,2,3,0,1,0,sales,low,0.3,150,0.5
1,5,6,0,1,0,sales,medium,0.8,260,0.8
2,7,4,0,1,0,sales,medium,0.1,270,0.8
3,5,5,0,1,0,sales,low,0.7,220,0.8
4,2,3,0,1,0,sales,low,0.3,150,0.5


In [21]:
#trasformo in stringa i miei valori ed inserisco una lettera per identificare gli attributi numerici

df2 = df
df2['NumberProject'] = df['number_project'].astype(str) + '_NoP'
df2['TimeSpendCompany'] = df['time_spend_company'].astype(str) + '_TSC'
df2['WorkAccident'] = df['Work_accident'].map({0:'NoAcc' , 1:'YesAcc'}).astype(str)
df2['Left'] = df['left'].map({0:'NoLeft',1:'YesLeft'}).astype(str) 
df2['Promotion'] = df['promotion_last_5years'].map({0:'NoProm',1:'YesProm'}).astype(str)
df2['Deparment'] = df['sales'].astype(str)
df2['Salary'] = df['salary'].astype(str)
df2['Satisfaction'] = df['satisfactionGroups'].astype(str) + '_Sat'
df2['AverageHours'] = df['A.M.H.groups'].astype(str) + '_AH'
df2['LastEvaluation'] = df['L.E.groups'].astype(str) + '_LE'

In [22]:
#N.B,non confondere _NoP con NoProm
#Non capisco perche' in SatisfactionGroups,una volta trasformati in stringa 
# 0.3 e 0.7 vengono scritti con una marea di cifre significative.

In [23]:
df2.drop(['satisfactionGroups','A.M.H.groups','L.E.groups'],axis = 1, inplace=True)

In [24]:
df2.drop([ 'number_project', 'time_spend_company' ,'Work_accident' ,'left' ,'promotion_last_5years' ,'sales' ,'salary'],axis = 1, inplace= True)

In [25]:
df2["AverageHours"].value_counts()

150_AH    1259
140_AH    1218
250_AH    1034
130_AH    1016
260_AH    1012
240_AH     999
160_AH     893
220_AH     868
230_AH     862
180_AH     800
170_AH     799
190_AH     770
210_AH     767
200_AH     735
270_AH     665
120_AH     322
280_AH     286
100_AH     179
110_AH     156
300_AH     152
290_AH     135
90_AH       54
310_AH      18
Name: AverageHours, dtype: int64

In [26]:
#Ora abbiamo il nostro dataset pronto per l'Apriori
#La funzione ha bisogno di prendere un .csv come parametro

In [27]:
df2.to_csv('HR_FrequentPatterns.csv', sep=',', header=False)

In [28]:

delimiter=','
target_type='s'
min_nbr_items=2
min_sup=4
min_conf=2

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_FrequentPatterns.txt', 
                       delimiter, target_type, min_nbr_items, min_sup, min_conf)

In [29]:
delimiter=','
target_type='r'
min_nbr_items=2
min_sup=10
min_conf=25

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_Rules.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)

In [30]:
rules = read_rules('HR_Rules.txt')
for r in rules[:20]:
    print(r['ant'],'-->',r['cons'],' Lift',r['lift'],' Conf.',r['conf'])

['2_NoP', 'YesLeft', '3_TSC', 'NoProm'] --> 0.30000000000000004_Sat  Lift 576.969  Conf. 37.467
['2_NoP', 'YesLeft', '3_TSC'] --> 0.30000000000000004_Sat  Lift 572.438  Conf. 37.1728
['2_NoP', 'YesLeft', 'NoProm'] --> 0.30000000000000004_Sat  Lift 571.411  Conf. 37.1061
['2_NoP', 'YesLeft'] --> 0.30000000000000004_Sat  Lift 567.035  Conf. 36.822
['2_NoP', '3_TSC', 'NoAcc', 'NoProm'] --> 0.30000000000000004_Sat  Lift 526.183  Conf. 34.1691
['2_NoP', '3_TSC', 'NoAcc'] --> 0.30000000000000004_Sat  Lift 524.31  Conf. 34.0475
['2_NoP', '3_TSC', 'NoProm'] --> 0.30000000000000004_Sat  Lift 513.313  Conf. 33.3333
['2_NoP', '3_TSC'] --> 0.30000000000000004_Sat  Lift 510.821  Conf. 33.1715
['2_NoP', 'NoAcc', 'NoProm'] --> 0.30000000000000004_Sat  Lift 453.219  Conf. 29.431
['2_NoP', 'NoAcc'] --> 0.30000000000000004_Sat  Lift 450.748  Conf. 29.2705
['2_NoP', 'NoProm'] --> 0.30000000000000004_Sat  Lift 434.274  Conf. 28.2008
['2_NoP'] --> 0.30000000000000004_Sat  Lift 429.48  Conf. 27.8894
['YesLe

In [31]:
#Cafonata : prove su i maximal partendo da una minsupport di 20 incrementando di 10 finche' non mi ritorna una lista vuota
delimiter=','
target_type='m'
min_nbr_items=2
min_sup=20
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_maximal.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
MaxPatterns20 = read_pattern('HR_maximal.txt')
Maxsorted = sorted(MaxPatterns20, key=lambda k: k['support'],reverse = True)
for r in Maxsorted[:30]:
    print(r['pattern'],' Support : ',r['support'])


['low', 'NoLeft', 'NoAcc', 'NoProm']  Support :  27.7485
['medium', 'NoLeft', 'NoAcc', 'NoProm']  Support :  27.5485
['3_TSC', 'NoLeft', 'NoAcc', 'NoProm']  Support :  26.3084
['sales', 'NoAcc', 'NoProm']  Support :  23.2149
['YesLeft', 'NoAcc', 'NoProm']  Support :  22.5815
['0.5_LE', 'NoProm']  Support :  22.2148
['4_NoP', 'NoLeft', 'NoAcc', 'NoProm']  Support :  21.4081
['3_NoP', 'NoLeft', 'NoAcc', 'NoProm']  Support :  21.3348
['3_TSC', 'low', 'NoProm']  Support :  21.1547
['2_TSC', 'NoLeft', 'NoProm']  Support :  20.9147
['sales', 'NoLeft', 'NoProm']  Support :  20.2213


In [32]:
delimiter=','
target_type='m'
min_nbr_items=2
min_sup=30
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_maximal.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
MaxPatterns30 = read_pattern('HR_maximal.txt')
Maxsorted = sorted(MaxPatterns30, key=lambda k: k['support'],reverse = True)
for r in Maxsorted[:20]:
    print(r['pattern'],' Support : ',r['support'])

['NoLeft', 'NoAcc', 'NoProm']  Support :  61.3374
['low', 'NoAcc', 'NoProm']  Support :  41.5228
['3_TSC', 'NoAcc', 'NoProm']  Support :  36.3091
['medium', 'NoAcc', 'NoProm']  Support :  35.8091
['low', 'NoLeft', 'NoProm']  Support :  33.9489
['medium', 'NoLeft', 'NoProm']  Support :  33.0222
['3_TSC', 'NoLeft', 'NoProm']  Support :  31.5888


In [33]:
delimiter=','
target_type='m'
min_nbr_items=2
min_sup=50
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_maximal.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
MaxPatterns50 = read_pattern('HR_maximal.txt')
Maxsorted = sorted(MaxPatterns50, key=lambda k: k['support'],reverse = True)
for r in Maxsorted[:20]:
    print(r['pattern'],' Support : ',r['support'])

['NoLeft', 'NoAcc', 'NoProm']  Support :  61.3374


In [34]:
delimiter=','
target_type='m'
min_nbr_items=2
min_sup=60
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_maximal.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
MaxPatterns60 = read_pattern('HR_maximal.txt')
Maxsorted = sorted(MaxPatterns60, key=lambda k: k['support'],reverse = True)
for r in Maxsorted[:20]:
    print(r['pattern'],' Support : ',r['support'])

['NoLeft', 'NoAcc', 'NoProm']  Support :  61.3374


In [35]:
delimiter=','
target_type='m'
min_nbr_items=2
min_sup=70
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_maximal.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
MaxPatterns70 = read_pattern('HR_maximal.txt')
Maxsorted = sorted(MaxPatterns70, key=lambda k: k['support'],reverse = True)
for r in Maxsorted[:20]:
    print(r['pattern'],' Support : ',r['support'])


['NoAcc', 'NoProm']  Support :  83.9189
['NoLeft', 'NoProm']  Support :  74.1916


In [36]:
delimiter=','
target_type='m'
min_nbr_items=2
min_sup=80
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_maximal.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
MaxPatterns80 = read_pattern('HR_maximal.txt')
Maxsorted = sorted(MaxPatterns80, key=lambda k: k['support'],reverse = True)
for r in Maxsorted[:20]:
    print(r['pattern'],' Support : ',r['support'])


['NoAcc', 'NoProm']  Support :  83.9189


In [37]:
delimiter=','
target_type='m'
min_nbr_items=2
min_sup=90
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_maximal.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
MaxPatterns90 = read_pattern('HR_maximal.txt')
Maxsorted = sorted(MaxPatterns90, key=lambda k: k['support'],reverse = True)
for r in Maxsorted[:20]:
    print(r['pattern'],' Support : ',r['support'])

In [38]:
#DISCUSSIONE RISULTATI

In [39]:
#Stessa cosa con i closed
delimiter=','
target_type='c'
min_nbr_items=2
min_sup=20
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_closed.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
CloPattern20 = read_pattern('HR_closed.txt')
Closorted = sorted(CloPattern20, key=lambda k: k['support'],reverse = True)
for r in Closorted[:20]:
    print(r['pattern'],' Support : ',r['support'])

['NoAcc', 'NoProm']  Support :  83.9189
['NoLeft', 'NoProm']  Support :  74.1916
['NoLeft', 'NoAcc']  Support :  62.8575
['NoLeft', 'NoAcc', 'NoProm']  Support :  61.3374
['low', 'NoProm']  Support :  48.3366
['3_TSC', 'NoProm']  Support :  42.0628
['low', 'NoAcc']  Support :  41.8428
['medium', 'NoProm']  Support :  41.7695
['low', 'NoAcc', 'NoProm']  Support :  41.5228
['3_TSC', 'NoAcc']  Support :  36.9891
['medium', 'NoAcc']  Support :  36.7291
['3_TSC', 'NoAcc', 'NoProm']  Support :  36.3091
['medium', 'NoAcc', 'NoProm']  Support :  35.8091
['low', 'NoLeft']  Support :  34.2956
['medium', 'NoLeft']  Support :  34.1956
['low', 'NoLeft', 'NoProm']  Support :  33.9489
['medium', 'NoLeft', 'NoProm']  Support :  33.0222
['3_TSC', 'NoLeft']  Support :  32.3822
['3_TSC', 'NoLeft', 'NoProm']  Support :  31.5888
['medium', 'NoLeft', 'NoAcc']  Support :  28.4419


In [40]:
delimiter=','
target_type='c'
min_nbr_items=2
min_sup=50
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_closed.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
CloPattern50 = read_pattern('HR_closed.txt')
Closorted = sorted(CloPattern50, key=lambda k: k['support'],reverse = True)
for r in Closorted[:20]:
    print(r['pattern'],' Support : ',r['support'])

['NoAcc', 'NoProm']  Support :  83.9189
['NoLeft', 'NoProm']  Support :  74.1916
['NoLeft', 'NoAcc']  Support :  62.8575
['NoLeft', 'NoAcc', 'NoProm']  Support :  61.3374


In [41]:
delimiter=','
target_type='c'
min_nbr_items=2
min_sup=80
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_closed.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
CloPattern50 = read_pattern('HR_closed.txt')
Closorted = sorted(CloPattern50, key=lambda k: k['support'],reverse = True)
for r in Closorted[:20]:
    print(r['pattern'],' Support : ',r['support'])

['NoAcc', 'NoProm']  Support :  83.9189


In [42]:
#stessa cosa per i frequent
delimiter=','
target_type='s'
min_nbr_items=2
min_sup=20
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_frequent.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
FrePattern20 = read_pattern('HR_frequent.txt')
Fresorted = sorted(FrePattern20, key=lambda k: k['support'],reverse = True)
for r in Fresorted[:20]:
    print(r['pattern'],' Support : ',r['support'])
#Vengono uguali ai closed( plausibile )

['NoAcc', 'NoProm']  Support :  83.9189
['NoLeft', 'NoProm']  Support :  74.1916
['NoLeft', 'NoAcc']  Support :  62.8575
['NoLeft', 'NoAcc', 'NoProm']  Support :  61.3374
['low', 'NoProm']  Support :  48.3366
['3_TSC', 'NoProm']  Support :  42.0628
['low', 'NoAcc']  Support :  41.8428
['medium', 'NoProm']  Support :  41.7695
['low', 'NoAcc', 'NoProm']  Support :  41.5228
['3_TSC', 'NoAcc']  Support :  36.9891
['medium', 'NoAcc']  Support :  36.7291
['3_TSC', 'NoAcc', 'NoProm']  Support :  36.3091
['medium', 'NoAcc', 'NoProm']  Support :  35.8091
['low', 'NoLeft']  Support :  34.2956
['medium', 'NoLeft']  Support :  34.1956
['low', 'NoLeft', 'NoProm']  Support :  33.9489
['medium', 'NoLeft', 'NoProm']  Support :  33.0222
['3_TSC', 'NoLeft']  Support :  32.3822
['3_TSC', 'NoLeft', 'NoProm']  Support :  31.5888
['medium', 'NoLeft', 'NoAcc']  Support :  28.4419


In [43]:
delimiter=','
target_type='s'
min_nbr_items=2
min_sup=50
min_conf=10

ret_val = call_apriori('HR_FrequentPatterns.csv', 'HR_frequent.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)
FrePattern50 = read_pattern('HR_frequent.txt')
Fresorted = sorted(FrePattern50, key=lambda k: k['support'],reverse = True)
for r in Fresorted[:20]:
    print(r['pattern'],' Support : ',r['support'])

['NoAcc', 'NoProm']  Support :  83.9189
['NoLeft', 'NoProm']  Support :  74.1916
['NoLeft', 'NoAcc']  Support :  62.8575
['NoLeft', 'NoAcc', 'NoProm']  Support :  61.3374
