## MODELING

In [1]:
import os
import sys
import numpy
import pandas
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath("./util"))
from pymining import itemmining, assocrules
from fp_growth import find_frequent_itemsets as get_freq_itemset

%matplotlib inline
plt.style.use('ggplot')
pandas.set_option('display.max_rows', 100, "display.max_columns", 100)

In [2]:
df = pandas.read_csv("./dataset/crimes_census_5poi_sampled100.csv", index_col=0, sep="\t")
df.shape

(2954, 21)

In [3]:
df = df.drop(["BLOCKID10", "Street_Nam", "ARHeteInx", "ARPerM1724", "ARPerHOwn", "ARPerSF", "ARPerRMI5L", "ARPer3MU"], axis=1)
df = df.drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)
df.describe()

Unnamed: 0,ARPCIncome,ARPerHEdu,ARPerWork,ARPopDen,ARPerAA,NIBRSclass,Place2,Report_Dat
count,2954,2954,2954,2954,2954,2954,2954,2954
unique,15,15,15,15,13,52,61,36
top,inc-2/15,edu-10/15,empl-2/15,popden-1/15,afro-1/13,Drug/Narcotic Violations,Private Residence,09-2x
freq,304,320,391,366,492,100,476,109


In [4]:
transactions = [row.tolist() for i, row in df.iterrows()]

to_tuple   = lambda trans_list: tuple([tuple(trans) for trans in trans_list])
percentage = lambda x, tot=len(transactions): (x * 1.0) / tot

def only_rules_with(columns, rules, only_cons=False):
    list_of_sets = [set(df[column].values) for column in columns]
    result = []
    for rule in rules:
        current_itemset = rule[1] if only_cons else rule[0] | rule[1]
        cond = True
        for column_set in list_of_sets:
            if (current_itemset & column_set) == set():
                cond = False
        if cond:
            result.append(rule)
    return result

def print_that(filepath, rules):
    with open(filepath, 'w+') as result_file:
        for rule in rules:
            result_file.write(rule_to_string(rule))
        
def rule_to_string(rule):
    return str([j for j in rule[0]]) + "  ->  " + str([z for z in rule[1]]) + \
    ", supp: " + str(rule[2]) + \
    ", conf: " + str(rule[3]) + \
    "\n"


## Frequent Itemset generation

In [5]:
min_supp = 10

#### Relim
[Paper 1](https://pdfs.semanticscholar.org/cb3e/76d1773d08545f21daf28cc87b051604aa95.pdf)
[Paper 2](http://www.borgelt.net/papers/relim.pdf)

In [6]:
%%time
relim_input = itemmining.get_relim_input(to_tuple(transactions))
relim_itemsets = itemmining.relim(relim_input, min_support=min_supp)
print "number of frequent itemsets", len(relim_itemsets), "\n"

number of frequent itemsets 5122 

CPU times: user 255 ms, sys: 5.7 ms, total: 261 ms
Wall time: 288 ms


#### FP-Growth

In [7]:
# %%time
# freq_item_generator = get_freq_itemset(transactions, min_supp, include_support=True)
# 
# fp_itemsets = {frozenset(itemset[0]): itemset[1] for itemset in freq_item_generator}
# print "number of frequent itemsets", len(fp_itemsets), "\n"

In [8]:
# pruned_itemsets = only_itemset_with("NIBRSclass", relim_itemsets)
# len(pruned_itemsets)

## Rule generation

In [9]:
min_conf = 0.60

In [10]:
%%time
rules = assocrules.mine_assoc_rules(relim_itemsets, min_support=min_supp, min_confidence=min_conf)
print len(rules)

17267
CPU times: user 251 ms, sys: 15.5 ms, total: 266 ms
Wall time: 299 ms


#### Rules exploration

In [11]:
# leaving only rules that have crimes
rules_with_crimes = only_rules_with(["NIBRSclass"], rules)
rules_with_crimes_in_cons = only_rules_with(["NIBRSclass"], rules, only_cons=True)
print len(rules_with_crimes)
print len(rules_with_crimes_in_cons)

5521
600


In [12]:
print_that('./dataset/result_file.txt', rules_with_crimes)
print_that('./dataset/result_file_only_cons.txt', rules_with_crimes_in_cons)

In [13]:
rules_with_crimes[:1]

[(frozenset({'Forcible Fondling',
             'Private Residence',
             'afro-11/13',
             'edu-15/15',
             'inc-2/15',
             'popden-3/15'}),
  frozenset({'empl-2/15'}),
  13,
  1.0)]

In [14]:
rules_with_crimes_in_cons[-20:]

[(frozenset({'School - Primary or Secondary', 'inc-1/15', 'popden-1/15'}),
  frozenset({'Disorderly Conduct'}),
  23,
  0.696969696969697),
 (frozenset({'Private Residence', 'afro-11/13', 'popden-3/15'}),
  frozenset({'Forcible Fondling'}),
  13,
  0.6842105263157895),
 (frozenset({'Gas Station', 'inc-8/15'}),
  frozenset({'Prostitution', 'afro-12/13'}),
  12,
  0.6),
 (frozenset({'Gas Station', 'afro-12/13', 'inc-8/15'}),
  frozenset({'Prostitution'}),
  12,
  0.6),
 (frozenset({'School - Primary or Secondary', 'edu-13/15'}),
  frozenset({'Disorderly Conduct', 'popden-1/15'}),
  23,
  0.696969696969697),
 (frozenset({'School - Primary or Secondary', 'edu-13/15', 'popden-1/15'}),
  frozenset({'Disorderly Conduct'}),
  23,
  0.696969696969697),
 (frozenset({'Department Store', 'inc-13/15'}),
  frozenset({'Shoplifting', 'empl-12/15'}),
  12,
  0.8),
 (frozenset({'Department Store', 'empl-12/15', 'inc-13/15'}),
  frozenset({'Shoplifting'}),
  12,
  0.8),
 (frozenset({'Department Store', '

---

### Entire dataset

In [16]:
df_entire.columns

Index([u'BLOCKID10', u'First_POI', u'Second_POI', u'Third_POI', u'Fourth_POI',
       u'Fifth_POI', u'ARPCIncome', u'ARPerHEdu', u'ARPerWork', u'ARPerRMI5L',
       u'ARPer3MU', u'ARPopDen', u'ARPerAA', u'ARHeteInx', u'ARPerM1724',
       u'ARPerHOwn', u'ARPerSF', u'NIBRSclass', u'Place2', u'Report_Dat',
       u'Street_Nam'],
      dtype='object')

In [15]:
df_entire = pandas.read_csv("./dataset/crimes_census_5poi.csv", index_col=0, sep="\t")
df_entire = df.drop(["BLOCKID10", "Street_Nam", "ARHeteInx", "ARPerM1724", "ARPerHOwn", 
                     "ARPerSF", "ARPerRMI5L", "ARPer3MU"], axis=1)
df_entire = df.drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)

transactions_entire = [row.tolist() for i, row in df_entire.iterrows()]

ValueError: labels ['BLOCKID10' 'Street_Nam' 'ARHeteInx' 'ARPerM1724' 'ARPerHOwn' 'ARPerSF'
 'ARPerRMI5L' 'ARPer3MU'] not contained in axis

In [None]:
%time
min_supp = 250
relim_input_entire = itemmining.get_relim_input(to_tuple(transactions_entire))
relim_itemsets_entire = itemmining.relim(relim_input_entire, min_support=min_supp)
print "number of frequent itemsets", len(relim_itemsets), "\n"

In [None]:
min_conf = 0.90
rules_entire = assocrules.mine_assoc_rules(relim_itemsets_entire, min_support=min_supp, min_confidence=min_conf)
print len(rules)

In [None]:
rules_with_crimes_entire = only_rules_with(["NIBRSclass"], rules_entire)
rules_with_crimes_in_cons_entire = only_rules_with(["NIBRSclass"], rules_entire, only_cons=True)
print len(rules_with_crimes_entire)
print len(rules_with_crimes_in_cons_entire)