## MODELING

In [1]:
import os
import sys
import numpy
import pandas
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath("./util"))
from pymining import itemmining, assocrules
from fp_growth import find_frequent_itemsets as get_freq_itemset

%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')
# Raise pandas' display limits to 100 rows and 100 columns.
pandas.set_option('display.max_rows', 100, "display.max_columns", 100)

In [2]:
# Read the tab-separated dataset, using its first column as the row index.
df = pandas.read_csv(
    "./dataset/crimes_census_5poi_sampled100.csv",
    sep="\t",
    index_col=0,
)
# Display (rows, columns) of the loaded frame.
df.shape

(2954, 21)

In [3]:
# Remove the columns that are not used for mining in a single call, so the
# cell either drops all of them or leaves df untouched if a label is missing.
df = df.drop(
    [
        # block / street identifiers and unused census attributes
        "BLOCKID10", "Street_Nam", "ARHeteInx", "ARPerM1724",
        "ARPerHOwn", "ARPerSF", "ARPerRMI5L", "ARPer3MU",
        # the five point-of-interest columns
        "First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI",
    ],
    axis=1,
)
# Summary statistics of the remaining columns.
df.describe()

Unnamed: 0,ARPCIncome,ARPerHEdu,ARPerWork,ARPopDen,ARPerAA,NIBRSclass,Place2,Report_Dat
count,2954,2954,2954,2954,2954,2954,2954,2954
unique,15,15,15,15,13,52,62,36
top,inc-2/15,edu-10/15,empl-2/15,popden-1/15,afro-1/13,Drug/Narcotic Violations,Private Residence,07-2x
freq,290,309,377,385,488,100,509,106


In [4]:
# df.drop(["BLOCKID10"], axis=1).describe() # OLD

In [4]:
# One transaction per DataFrame row: the list of that row's cell values, in
# column order. Taken straight from the underlying array instead of building
# a Series per row with iterrows().
transactions = df.values.tolist()
# utility functions
def to_tuple(trans_list):
    """Return ``trans_list`` as a tuple of tuples, one inner tuple per transaction."""
    return tuple(tuple(trans) for trans in trans_list)
def percentage(x, tot=len(transactions)):
    """Return ``x`` as a fraction of ``tot``.

    ``tot`` defaults to the number of transactions at the time this cell is
    executed (the default is evaluated once, at definition).
    """
    # Multiplying by 1.0 forces floating-point division.
    return (x * 1.0) / tot

def only_itemset_with(cls, freq_itemsets, data=None):
    """Keep the frequent itemsets that contain at least one value of column ``cls``.

    Parameters
    ----------
    cls : str
        Name of the column whose values must appear in an itemset.
    freq_itemsets : dict
        Mapping ``itemset -> support``; each itemset is an iterable of
        hashable items (e.g. a frozenset).
    data : pandas.DataFrame, optional
        Frame the column is read from. Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        The entries of ``freq_itemsets`` whose itemset shares at least one
        item with the values of the column.
    """
    frame = df if data is None else data
    column_set = set(frame[cls].values)
    # isdisjoint stops at the first shared item instead of materialising the
    # whole intersection just to compare it with the empty set.
    return {itemset: support
            for itemset, support in freq_itemsets.items()
            if not column_set.isdisjoint(itemset)}


# def only_rules_with(clss, rules):
#     classes_set = set(numpy.array([df[cls].values for cls in clss]).flat)
#     print classes_set
#     return [rule for rule in rules if ((rule[0] | rule[1]) & classes_set) != set()]

def only_rules_with(columns, rules, only_cons=False, data=None):
    """Keep the rules that contain at least one value from every given column.

    Parameters
    ----------
    columns : list of str
        Column names; a rule is kept only if, for each of them, it contains
        at least one of that column's values.
    rules : iterable
        Rules as indexable records where ``rule[0]`` and ``rule[1]`` are sets
        of items (``rule[0] -> rule[1]``).
    only_cons : bool, optional
        If True, only ``rule[1]`` is inspected; otherwise the union of
        ``rule[0]`` and ``rule[1]`` is.
    data : pandas.DataFrame, optional
        Frame the columns are read from. Defaults to the notebook-level ``df``.

    Returns
    -------
    list
        The matching rules, in their original order. With an empty
        ``columns`` every rule is returned.
    """
    frame = df if data is None else data
    list_of_sets = [set(frame[column].values) for column in columns]

    def covers_all_columns(rule):
        current_itemset = rule[1] if only_cons else rule[0] | rule[1]
        # any() short-circuits on the first column the rule does not touch.
        return not any(column_set.isdisjoint(current_itemset)
                       for column_set in list_of_sets)

    return [rule for rule in rules if covers_all_columns(rule)]


## Frequent Itemset generation

In [5]:
# Minimum support threshold shared by the itemset miners and the rule mining below.
# NOTE(review): appears to be an absolute transaction count rather than a
# fraction — confirm against the pymining / fp_growth APIs.
min_supp = 20

#### Relim
https://pdfs.semanticscholar.org/cb3e/76d1773d08545f21daf28cc87b051604aa95.pdf

In [6]:
%%time
# Convert the transactions to a tuple of tuples and build Relim's input structure.
relim_input = itemmining.get_relim_input(to_tuple(transactions))
# Mine the frequent itemsets with Relim at the min_supp threshold.
relim_itemsets = itemmining.relim(relim_input, min_support=min_supp)
# Report how many frequent itemsets were found.
print "number of frequent itemsets", len(relim_itemsets), "\n"

number of frequent itemsets 1961 

CPU times: user 173 ms, sys: 4.13 ms, total: 177 ms
Wall time: 178 ms


#### FP-Growth

In [42]:
%%time
# Mine the frequent itemsets with FP-Growth at the same min_supp threshold.
# NOTE(review): the saved output of this cell (49782 itemsets, execution
# count 42) does not match Relim's 1961 itemsets for the same threshold and
# is out of sequence — it presumably comes from a different kernel state;
# re-run on a fresh kernel to confirm.
freq_item_generator = get_freq_itemset(transactions, min_supp, include_support=True)

# Index the results as {frozenset(items): support}, taking element 0 of each
# yielded record as the items and element 1 as its support.
fp_itemsets = {frozenset(itemset[0]): itemset[1] for itemset in freq_item_generator}
print "number of frequent itemsets", len(fp_itemsets), "\n"

number of frequent itemsets 49782 

CPU times: user 3.86 s, sys: 64.4 ms, total: 3.93 s
Wall time: 3.91 s


In [7]:
# Keep only the Relim itemsets that contain at least one NIBRSclass value.
pruned_itemsets = only_itemset_with("NIBRSclass", relim_itemsets)
# Number of itemsets left after the filter.
len(pruned_itemsets)

276

## Rule generation

In [8]:
# Minimum confidence passed to assocrules.mine_assoc_rules below.
min_conf = 0.60

In [9]:
%%time
# Mine association rules from the Relim itemsets using the min_supp and
# min_conf thresholds defined in the cells above.
rules = assocrules.mine_assoc_rules(relim_itemsets, min_support=min_supp, min_confidence=min_conf)
# Number of rules found.
print len(rules)

6610
CPU times: user 104 ms, sys: 14.8 ms, total: 119 ms
Wall time: 113 ms


In [None]:
# Inspect the Python type of a single mined rule.
# NOTE(review): this cell has no execution count (In [None]), so it was not
# run in the saved notebook.
type(rules[0])

#### Exploration

In [10]:
# Keep only the rules that involve a crime class (a value of the NIBRSclass column):
#   rules_with_crimes           - the crime may appear on either side of the rule
#   rules_with_crimes_only_cons - the crime must appear in rule[1] (only_cons=True)
rules_with_crimes = only_rules_with(["NIBRSclass"], rules)
rules_with_crimes_only_cons = only_rules_with(["NIBRSclass"], rules, only_cons=True)
# Sizes of the two filtered rule lists.
print len(rules_with_crimes)
print len(rules_with_crimes_only_cons)

1677
165


In [13]:
# Preview only the first few crime-related rules; displaying the whole list
# floods the notebook output.
rules_with_crimes[:10]

[(frozenset({'Affray',
             'School - Primary or Secondary',
             'afro-13/13',
             'edu-8/15',
             'inc-6/15',
             'popden-3/15'}),
  frozenset({'empl-2/15'}),
  20,
  1.0),
 (frozenset({'Affray',
             'School - Primary or Secondary',
             'afro-13/13',
             'edu-8/15',
             'popden-3/15'}),
  frozenset({'empl-2/15', 'inc-6/15'}),
  20,
  1.0),
 (frozenset({'Affray',
             'School - Primary or Secondary',
             'afro-13/13',
             'edu-8/15'}),
  frozenset({'empl-2/15', 'inc-6/15', 'popden-3/15'}),
  20,
  1.0),
 (frozenset({'Affray', 'School - Primary or Secondary', 'afro-13/13'}),
  frozenset({'edu-8/15', 'empl-2/15', 'inc-6/15', 'popden-3/15'}),
  20,
  1.0),
 (frozenset({'Affray', 'afro-13/13'}),
  frozenset({'School - Primary or Secondary',
             'edu-8/15',
             'empl-2/15',
             'inc-6/15',
             'popden-3/15'}),
  20,
  0.8333333333333334),
 (frozenset(

0

In [89]:
# Display the rules that involve a crime class.
# NOTE(review): this cell's execution count (89) is out of sequence and its
# saved output lists 7 rules, whereas In [10] reports 1677 — the saved output
# presumably comes from an earlier kernel state; re-run to confirm.
rules_with_crimes

[(frozenset({'School - Primary or Secondary', 'edu-8/15'}),
  frozenset({'empl-2/15'}),
  63,
  0.9692307692307692),
 (frozenset({'School - Primary or Secondary', 'empl-2/15'}),
  frozenset({'edu-8/15'}),
  63,
  1.0),
 (frozenset({'School - Primary or Secondary', 'inc-6/15'}),
  frozenset({'sinpar-13/15'}),
  71,
  0.8987341772151899),
 (frozenset({'School - Primary or Secondary', 'sinpar-13/15'}),
  frozenset({'inc-6/15'}),
  71,
  0.922077922077922),
 (frozenset({'School - Primary or Secondary', 'inc-6/15'}),
  frozenset({'own-7/15'}),
  66,
  0.8354430379746836),
 (frozenset({'School - Primary or Secondary', 'own-7/15'}),
  frozenset({'inc-6/15'}),
  66,
  0.9041095890410958),
 (frozenset({'Hotel/Motel'}), frozenset({'hotels'}), 80, 0.9302325581395349)]

In [23]:
# result_file=open('./dataset/result_file.txt', 'w+')
# 
# for rule in rules:
#     result_file.write(
#         str([j for j in rule[0]]) + "  ->  " + str([z for z in rule[1]]) + 
#         ", supp: " + str(rule[2]) + 
#         ", conf: " + str(rule[3]))