## MODELING

In [1]:
import os
import sys
import pandas
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath("./util"))
from pymining import itemmining, assocrules
from fp_growth import find_frequent_itemsets as get_freq_itemset

%matplotlib inline
# Notebook-wide presentation settings: ggplot styling for matplotlib figures
# and a cap of 100 rows / 100 columns when pandas renders a frame.
plt.style.use('ggplot')
pandas.set_option('display.max_rows', 100)
pandas.set_option("display.max_columns", 100)

In [24]:
# Read the tab-separated crimes/census/POI table and discard its
# "Unnamed: 0" column (presumably a row index saved by an earlier export --
# TODO confirm), then show the resulting (rows, columns).
df = pandas.read_csv(
    "./dataset/crimes_census_5poi.csv", sep="\t"
).drop(["Unnamed: 0"], axis=1)
df.shape

(19106, 23)

In [4]:
# Summary statistics for every column except BLOCKID10; `df` itself is not
# modified because the drop result is only used for this display.
(df
 .drop(["BLOCKID10"], axis=1)
 .describe())

Unnamed: 0,First_POI,Second_POI,Third_POI,Fourth_POI,Fifth_POI,ARPCIncome,ARPerHEdu,ARPerWork,ARPerRMI5L,ARPer3MU,ARPopDen,ARPerAA,ARHeteInx,ARPerM1724,ARPerHOwn,ARPerSF,NIBRSclass,Clearance_,Place2,Location_T,Report_Dat,Street_Nam
count,19106,19106,19105,19106,19104,19106,19106,19106,19106,19106,19106,19106,19106,19106,19106,19106,19106,19106,19106,19106,19106,19106
unique,3814,3952,4080,4340,4404,7,7,7,9,9,7,9,9,5,9,7,52,10,67,5,36,1094
top,MARMI,hotels,BILLY REID,LENSCRAFTERS,CARLYLE & COMPANY,inc-1/7,edu-6/7,empl-1/7,lt5y-1/9,3mu-3/9,popden-5/7,afro-1/9,hetero-1/9,youngm-3/5,own-1/9,sinpar-6/7,Theft From Motor Vehicle,Open,Private Residence,Indoors,07-2x,TRADE
freq,215,312,215,215,215,2967,2828,3183,2274,2665,3219,2840,3629,4912,3287,3527,2523,11520,3950,8451,684,787


In [5]:
# One transaction per DataFrame row: the row's cell values as a plain list,
# in column order. Every column still present in `df` contributes an item.
transactions = [row.tolist() for i, row in df.iterrows()]


def to_tuple(trans_list):
    """Return ``trans_list`` as a tuple of tuples, one inner tuple per transaction."""
    return tuple(tuple(trans) for trans in trans_list)


def percentage(x, tot=len(transactions)):
    """Return ``x`` divided by ``tot`` as a float.

    ``tot`` defaults to the number of transactions; the default is evaluated
    once, when this cell runs, so re-run the cell if ``transactions`` changes.
    """
    return (x * 1.0) / tot

## Frequent Itemset generation

In [6]:
# Minimum support threshold handed to both miners below (Relim and FP-Growth).
# NOTE(review): presumably an absolute transaction count rather than a
# fraction -- 400 of the 19106 loaded rows is roughly 2.1%; confirm against
# the pymining / fp_growth APIs.
min_supp = 400

#### Relim
https://pdfs.semanticscholar.org/cb3e/76d1773d08545f21daf28cc87b051604aa95.pdf

In [10]:
%%time
# Mine frequent itemsets with pymining's Relim implementation. The
# transactions are converted to a tuple of tuples before being handed to
# get_relim_input. NOTE(review): relim_itemsets is not referenced by any later
# cell visible in this notebook (rule generation uses fp_itemsets).
relim_input = itemmining.get_relim_input(
    to_tuple(transactions)
)
relim_itemsets = itemmining.relim(
    relim_input,
    min_support=min_supp,
)

CPU times: user 36.1 s, sys: 199 ms, total: 36.3 s
Wall time: 37.1 s


#### FP-Growth

In [7]:
%%time
# Mine frequent itemsets with FP-Growth. With include_support=True each entry
# produced by the generator is indexed below as [0] = items, [1] = support.
freq_item_generator = get_freq_itemset(transactions, min_supp, include_support=True)

# Key each itemset by a frozenset of its items so it can serve as a dict key
# regardless of item order; the value is the entry's support.
fp_itemsets = {frozenset(itemset[0]): itemset[1] for itemset in freq_item_generator}

# Single-string print(...) parses on both Python 2 and Python 3 and emits the
# same text as the former Python 2 `print a, b, c` statement.
print("number of frequent itemsets %d \n" % len(fp_itemsets))

number of frequent itemsets 43133 

CPU times: user 44.4 s, sys: 470 ms, total: 44.9 s
Wall time: 46.9 s


## Rule generation

In [8]:
# Minimum confidence passed to assocrules.mine_assoc_rules in the next cell.
min_conf = 0.97

In [9]:
%%time
# Derive association rules from the FP-Growth itemsets, with min_conf as the
# minimum confidence given to the miner.
rules = assocrules.mine_assoc_rules(
    fp_itemsets,
    min_confidence=min_conf,
)
len(rules)

CPU times: user 2min 11s, sys: 3min 55s, total: 6min 7s
Wall time: 9min 12s


In [11]:
# Number of rules mined; the %%time cell that builds `rules` shows only its
# timing output, so the count is displayed here.
len(rules)

5005518

In [23]:
# Dump every mined rule to a text file, one rule per line, formatted as
#   [items] --> [items], supp: <rule[2]>, conf: <rule[3]>
# NOTE(review): rule[0]..rule[3] are presumably antecedent, consequent,
# support and confidence, going by the labels written here -- confirm against
# pymining's assocrules output.
# The context manager guarantees the file is flushed and closed even if a
# write fails; the trailing "\n" keeps consecutive rules from running together
# on a single line. Plain 'w' suffices because the handle is never read.
with open('./dataset/result_file.txt', 'w') as result_file:
    for rule in rules:
        result_file.write(
            "%s --> %s, supp: %s, conf: %s\n" % (
                list(rule[0]), list(rule[1]), rule[2], rule[3]))