In [190]:
%matplotlib inline

import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import *

from collections import defaultdict
from scipy.stats.stats import pearsonr

# Importo il dataset

In [191]:
# Location of the raw HR dataset, relative to the notebook's working directory.
HR_DATA_PATH = 'HR_comma_sep.csv'
hr = pd.read_csv(HR_DATA_PATH)

In [192]:
# Preview the first rows of the freshly loaded frame.
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


# Trasformo le variabili stringa in variabili numeriche

In [193]:
# Encode the salary levels as integers (low -> 0, medium -> 1, high -> 2).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the `hr['salary']` selection is a chained in-place update, which leaves
# `hr` untouched when pandas Copy-on-Write is active.
salary_codes = {'low': 0, 'medium': 1, 'high': 2}
hr['salary'] = hr['salary'].replace(salary_codes)


In [194]:
# Encode each department name in the `sales` column as its position in this
# list (sales -> 0, technical -> 1, ..., management -> 9).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the `hr['sales']` selection is a chained in-place update, which leaves
# `hr` untouched when pandas Copy-on-Write is active.
department_names = ['sales', 'technical', 'support', 'IT', 'product_mng',
                    'marketing', 'RandD', 'accounting', 'hr', 'management']
department_codes = {name: code for code, name in enumerate(department_names)}
hr['sales'] = hr['sales'].replace(department_codes)

In [195]:
# Rescale the two fractional scores by 100 so they can later be cut into
# 10-wide bins, then remove the original columns to avoid duplicated data.
for source_col, scaled_col in (('satisfaction_level', 'SL_100'),
                               ('last_evaluation', 'LE_100')):
    hr[scaled_col] = hr[source_col] * 100
hr.drop(columns=['satisfaction_level', 'last_evaluation'], inplace=True)

Moltiplico per 100 i valori delle colonne last_evaluation e satisfaction_level per poterli poi raggruppare in bin.
A questo punto elimino satisfaction_level e last_evaluation perché non servono più all'analisi (altrimenti il dato sarebbe duplicato).

## Visualizzazione dei cambiamenti

In [196]:
# Preview the frame after the encoding and rescaling steps.
hr.head()

Unnamed: 0,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,SL_100,LE_100
0,2,157,3,0,1,0,0,0,38.0,53.0
1,5,262,6,0,1,0,0,1,80.0,86.0
2,7,272,4,0,1,0,0,1,11.0,88.0
3,5,223,5,0,1,0,0,0,72.0,87.0
4,2,159,3,0,1,0,0,0,37.0,52.0


# Raggruppamenti per variabili continue e average_montly_hours

In [197]:
# Work on an independent copy: a plain `hr2 = hr` only creates a second name
# for the same frame, so every change below would also land in `hr` and the
# cell could not be re-run once the source columns are dropped from `hr2`.
hr2 = hr.copy()

# Discretise the three numeric columns into left-closed bins of width 10,
# i.e. [0, 10), [10, 20), ..., [500, 510); each bin is labelled with its
# lower edge.
bin_edges = range(0, 520, 10)
bin_labels = range(0, 510, 10)
for source_col, group_col in (('average_montly_hours', 'AMHGroup'),
                              ('LE_100', 'LEGroup'),
                              ('SL_100', 'SLGroup')):
    hr2[group_col] = pd.cut(hr2[source_col], bins=bin_edges,
                            right=False, labels=bin_labels)

In [198]:
# The binned *Group columns replace these, so remove the raw numeric versions.
binned_source_cols = ['SL_100', 'LE_100', 'average_montly_hours']
hr2.drop(columns=binned_source_cols, inplace=True)

Come in precedenza elimino le colonne che porterebbero ad avere un dato doppio

## Visualizzazione dei cambiamenti

In [199]:
# Preview the frame after binning and dropping the raw numeric columns.
hr2.head()

Unnamed: 0,number_project,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,AMHGroup,LEGroup,SLGroup
0,2,3,0,1,0,0,0,150,50,30
1,5,6,0,1,0,0,1,260,80,80
2,7,4,0,1,0,0,1,270,80,10
3,5,5,0,1,0,0,0,220,80,70
4,2,3,0,1,0,0,0,150,50,30


# Rendo unici i vari valori

In [200]:
# Work on an independent copy: with a plain `hr3 = hr2` both names refer to the
# same frame, so re-running this cell would map the already-suffixed strings
# again (giving values such as 'nan_WA' or '0_D_D').
hr3 = hr2.copy()

# Binary flags become 'Y'/'N' followed by a column-specific suffix.
yes_no = {1: 'Y', 0: 'N'}
for column, suffix in (('Work_accident', '_WA'),
                       ('left', '_L'),
                       ('promotion_last_5years', '_P')):
    hr3[column] = hr2[column].map(yes_no).astype(str) + suffix

# Every other column keeps its value and gets a column-specific suffix, so the
# same raw value in two different columns yields two distinct items.
for column, suffix in (('sales', '_D'),
                       ('number_project', '_NP'),
                       ('SLGroup', '_SL'),
                       ('LEGroup', '_LE'),
                       ('time_spend_company', '_T'),
                       ('AMHGroup', '_H'),
                       ('salary', '_S')):
    hr3[column] = hr2[column].astype(str) + suffix

## Visualizzazione dei cambiamenti

In [201]:
# Preview the frame after every value has been given its column suffix.
hr3.head()

Unnamed: 0,number_project,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,AMHGroup,LEGroup,SLGroup
0,2_NP,3_T,N_WA,Y_L,N_P,0_D,0_S,150_H,50_LE,30_SL
1,5_NP,6_T,N_WA,Y_L,N_P,0_D,1_S,260_H,80_LE,80_SL
2,7_NP,4_T,N_WA,Y_L,N_P,0_D,1_S,270_H,80_LE,10_SL
3,5_NP,5_T,N_WA,Y_L,N_P,0_D,0_S,220_H,80_LE,70_SL
4,2_NP,3_T,N_WA,Y_L,N_P,0_D,0_S,150_H,50_LE,30_SL


# APRIORI ALGORITHM

In [218]:
import subprocess

In [219]:
def call_apriori(fileinput, fileoutput, delimiter=',', target_type='s', 
                 min_nbr_items=1, min_sup=2, min_conf=2):
    """Run the external ``./apriori`` executable on a transactions file.

    The executable is looked up in the current working directory. Its standard
    output and standard error are written to ``apriori_stdout.txt`` and
    ``apriori_stderr.txt`` (also in the working directory, overwritten on
    every call).

    Parameters
    ----------
    fileinput : str
        Path of the transactions file passed to apriori.
    fileoutput : str
        Path of the file apriori writes its results to.
    delimiter : str
        Value passed with ``-b`` (described as the line delimiter in the
        original notes of this notebook).
    target_type : str
        Value passed with ``-t``: ``m`` maximal, ``c`` closed, ``s`` frequent
        item sets, ``r`` association rules.
    min_nbr_items : int
        Value passed with ``-m``: minimum number of items per item set or
        association rule.
    min_sup : int or float
        Value passed with ``-s``: minimum support; positive means percentage,
        negative means absolute count.
    min_conf : int or float
        Value passed with ``-c``: minimum rule confidence as a percentage.
        Only used when ``target_type == 'r'``.

    Returns
    -------
    int
        The return code reported by ``subprocess.call``.

    Notes
    -----
    For rules the output format is set to `` (%X, %C, %L)``:
    %X relative body set support as a percentage, %C rule confidence as a
    percentage, %L lift.
    """
    # Options shared by every target type.
    call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type,
                '-m%s' % min_nbr_items, '-s%s' % min_sup]

    # Confidence threshold and the extended output format only apply to rules.
    if target_type == 'r':
        call_cmd += ['-c%s' % min_conf, '-v (%X, %C, %L)']

    call_cmd += [fileinput, fileoutput]

    # Context managers guarantee both log files are flushed and closed, even
    # if launching the executable fails.
    with open('apriori_stdout.txt', 'w') as stdout_log, \
            open('apriori_stderr.txt', 'w') as stderr_log:
        return subprocess.call(call_cmd, stdout=stdout_log, stderr=stderr_log)

In [220]:
def read_rules(filename):
    """Parse an association-rules file into a list of dictionaries.

    Each non-blank line is expected to look like::

        consequent <- item1 item2 (sup, conf, lift)

    Parameters
    ----------
    filename : str
        Path of the rules file to read.

    Returns
    -------
    list of dict
        One dictionary per rule with the keys ``'ant'`` (list of antecedent
        item strings), ``'cons'`` (consequent string), ``'sup'``, ``'conf'``
        and ``'lift'`` (floats, taken in that order from the parenthesised
        values).

    Raises
    ------
    ValueError
        If a non-blank line does not contain `` <- `` or `` (``, or if one of
        the three values is not a valid float.
    IndexError
        If fewer than three values appear inside the parentheses.
    """
    rules = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r') as data:
        for row in data:
            row = row.rstrip('\n\r')
            # Blank lines (e.g. a trailing newline) carry no rule.
            if not row.strip():
                continue
            cons, body = row.split(' <- ', 1)
            # Split on the LAST ' (' so item names containing ' (' stay intact.
            ant_part, values_part = body.rsplit(' (', 1)
            values = values_part.split(', ')
            rules.append({
                'ant': ant_part.split(' '),
                'cons': cons,
                'sup': float(values[0]),
                'conf': float(values[1]),
                'lift': float(values[2].replace(')', '')),
            })
    return rules

In [215]:
# Write one transaction per row for the apriori executable: no header line and
# no row index. Leaving the index in would add a unique row-number item to
# every transaction.
hr3.to_csv('hr_for_patterns.csv', sep=',', header=False, index=False)