## MODELING

In [1]:
import os
import sys
import numpy
import pandas
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath("./util"))
from pymining import itemmining, assocrules
from fp_growth import find_frequent_itemsets as get_freq_itemset

%matplotlib inline
plt.style.use('ggplot')
pandas.set_option('display.max_rows', 10000, "display.max_columns", 100)

In [2]:
# Load the sampled table: tab-separated, first column used as the index.
df = pandas.read_csv(
    "./dataset/crimes_census_5poi_sampled100.csv",
    sep="\t",
    index_col=0,
)
df.shape

(2954, 21)

In [3]:
## df = df.drop(["BLOCKID10", "Street_Nam", "ARHeteInx", "ARPerM1724", "ARPerHOwn", "ARPerSF", "ARPerRMI5L", "ARPer3MU"], axis=1)
# df = df.drop(["BLOCKID10", "Street_Nam", "ARPerRMI5L", "ARPer3MU", "ARHeteInx", "ARPerHOwn"], axis=1)
# df = df.drop(["ARPopDen", "ARPerAA", "ARPerM1724", "ARPerSF"], axis=1)
# df = df.drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)
# df.describe()

In [4]:
# One transaction per row of the sampled frame: the row's cell values are the items.
transactions = [row.tolist() for _row_label, row in df.iterrows()]


def to_tuple(trans_list):
    """Return the transactions as a tuple of tuples (an immutable copy)."""
    return tuple(tuple(trans) for trans in trans_list)


def percentage(x, tot=len(transactions)):
    """Return ``x`` as a fraction of ``tot``.

    ``tot`` defaults to the number of sampled transactions; the default is
    evaluated once, when this cell is executed.
    """
    return (x * 1.0) / tot

def only_rules_with(columns, rules, only_cons=False, data=None):
    """Keep the rules that mention at least one value from every given column.

    Args:
        columns: names of the columns of ``data`` whose values must appear
            in a rule. With an empty list every rule is kept.
        rules: iterable of rules; ``rule[0]`` is the antecedent item set and
            ``rule[1]`` the consequent item set.
        only_cons: when True only the consequent is inspected, otherwise the
            union of antecedent and consequent.
        data: DataFrame the column values are read from. Defaults to the
            notebook-level ``df`` (the sampled frame), which preserves the
            original behaviour. Pass the frame the rules were actually mined
            from (e.g. ``df_entire``) so that values absent from the sample
            are not silently ignored.

    Returns:
        A list with the rules that matched, in their original order.
    """
    source = df if data is None else data
    column_value_sets = [set(source[column].values) for column in columns]

    selected = []
    for rule in rules:
        itemset = rule[1] if only_cons else rule[0] | rule[1]
        # A rule qualifies only if it shares an item with *every* column.
        if all(itemset & values for values in column_value_sets):
            selected.append(rule)
    return selected

def print_that(filepath, rules):
    """Write every rule to ``filepath`` as one formatted line per rule.

    The file is created if missing and truncated if it already exists.
    """
    with open(filepath, 'w+') as out:
        out.writelines(rule_to_string(rule) for rule in rules)
        
def rule_to_string(rule):
    """Format a rule as ``[antecedent]  ->  [consequent], supp: S, conf: C``.

    ``rule`` is indexed as (antecedent, consequent, support, confidence) and
    the returned string ends with a newline.
    """
    antecedent = str(list(rule[0]))
    consequent = str(list(rule[1]))
    pieces = [
        antecedent, "  ->  ", consequent,
        ", supp: ", str(rule[2]),
        ", conf: ", str(rule[3]),
        "\n",
    ]
    return "".join(pieces)


## Frequent Itemset generation

In [5]:
# min_supp = 10

#### Relim
[Paper 1](https://pdfs.semanticscholar.org/cb3e/76d1773d08545f21daf28cc87b051604aa95.pdf)
[Paper 2](http://www.borgelt.net/papers/relim.pdf)

In [6]:
# %%time
# relim_input = itemmining.get_relim_input(to_tuple(transactions))
# relim_itemsets = itemmining.relim(relim_input, min_support=min_supp)
# print "number of frequent itemsets", len(relim_itemsets), "\n"

#### FP-Growth

In [7]:
## %%time
## freq_item_generator = get_freq_itemset(transactions, min_supp, include_support=True)
## 
## fp_itemsets = {frozenset(itemset[0]): itemset[1] for itemset in freq_item_generator}
## print "number of frequent itemsets", len(fp_itemsets), "\n"

In [8]:
## pruned_itemsets = only_itemset_with("NIBRSclass", relim_itemsets)
## len(pruned_itemsets)

## Rule generation

In [9]:
# min_conf = 0.60

In [10]:
# %%time
# rules = assocrules.mine_assoc_rules(relim_itemsets, min_support=min_supp, min_confidence=min_conf)
# print len(rules)

#### Rules exploration

In [11]:
# leaving only rules that have crimes
# rules_with_crimes = only_rules_with(["NIBRSclass"], rules)
# rules_with_crimes_in_cons = only_rules_with(["NIBRSclass"], rules, only_cons=True)
# print len(rules_with_crimes)
# print len(rules_with_crimes_in_cons)

In [12]:
# rules_with_crimes_in_cons

In [13]:
# print_that('./dataset/result_file.txt', rules_with_crimes)
# print_that('./dataset/result_file_only_cons.txt', rules_with_crimes_in_cons)

In [14]:
# rules_with_crimes[:1]

In [15]:
# rules_with_crimes_in_cons[:]

---

### Entire dataset

In [16]:
# Load the full table and drop, in three groups, the columns that are not
# used for mining: identifiers/census indices, the five POI columns and the
# remaining census attributes.
df_entire = (
    pandas.read_csv("./dataset/crimes_census_5poi.csv", sep="\t", index_col=0)
    .drop(["BLOCKID10", "Street_Nam", "ARHeteInx", "ARPerHOwn",
           "ARPerRMI5L", "ARPer3MU"], axis=1)
    .drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)
    .drop(["ARPopDen", "ARPerAA", "ARPerM1724", "ARPerSF"], axis=1)
)

# One transaction per remaining row; the row's cell values are the items.
transactions_entire = [row.tolist() for _row_label, row in df_entire.iterrows()]
df_entire.shape

(19106, 6)

In [17]:
# Per-column summary (count / unique / top / freq) of the pruned full frame.
df_entire.describe()

Unnamed: 0,ARPCIncome,ARPerHEdu,ARPerWork,NIBRSclass,Place2,Report_Dat
count,19106,19106,19106,19106,19106,19106
unique,7,7,7,52,67,36
top,inc-1/7,edu-6/7,empl-1/7,Theft From Motor Vehicle,Private Residence,07-2x
freq,2967,2828,3183,2523,3950,684


In [18]:
## df = df.drop(["BLOCKID10", "Street_Nam", "ARPerRMI5L", "ARPer3MU", "ARHeteInx", "ARPerHOwn"], axis=1)
# df = df.drop(["ARPopDen", "ARPerAA", "ARPerM1724", "ARPerSF"], axis=1)
## df = df.drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)
## df.describe()

In [19]:
# Minimum support handed to relim and mine_assoc_rules below.
# NOTE(review): rule supports are reported as raw counts (e.g. 220), so this
# is presumably an absolute number of transactions, not a fraction — confirm.
min_supp = 70

In [20]:
%time
relim_input_entire = itemmining.get_relim_input(to_tuple(transactions_entire))
relim_itemsets_entire = itemmining.relim(relim_input_entire, min_support=min_supp)
print "number of frequent itemsets", len(relim_itemsets_entire), "\n"

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
number of frequent itemsets 2008 



In [21]:
# Minimum confidence a rule must reach to be returned by mine_assoc_rules below.
min_conf = 0.60

In [22]:
%%time
rules_entire = assocrules.mine_assoc_rules(relim_itemsets_entire, min_support=min_supp, min_confidence=min_conf)
print len(rules_entire)

540
CPU times: user 20 ms, sys: 4.45 ms, total: 24.5 ms
Wall time: 21.5 ms


In [23]:
rules_with_crimes_entire = only_rules_with(["NIBRSclass"], rules_entire)
rules_with_crimes_in_cons_entire = only_rules_with(["NIBRSclass"], rules_entire, only_cons=True)
print len(rules_with_crimes_entire)
print len(rules_with_crimes_in_cons_entire)

170
12


In [24]:
# Display the rules whose consequent contains a crime class, as
# (antecedent, consequent, support, confidence) tuples.
rules_with_crimes_in_cons_entire

[(frozenset({'Department Store'}),
  frozenset({'Shoplifting'}),
  220,
  0.6626506024096386)]

---
### Evaluation

In [25]:
def lift(rule_supp, ant_supp, cons_supp):
    """Return ``rule_supp / (ant_supp * cons_supp)`` using float division.

    The ratio is the usual lift of a rule only when all three supports are
    relative frequencies (fractions of the transactions). With raw counts
    the result is scaled down by the number of transactions.
    """
    expected_joint = ant_supp * cons_supp
    return (rule_supp / 1.0) / expected_joint

def conviction(cons_supp, rule_conf):
    """Return the conviction ``(1 - cons_supp) / (1 - rule_conf)`` of a rule.

    Args:
        cons_supp: support of the consequent, expected as a fraction in [0, 1].
        rule_conf: confidence of the rule, expected as a fraction in [0, 1].

    Returns:
        The conviction as a float. A rule with confidence 1 yields
        ``float("inf")`` instead of raising ZeroDivisionError.
    """
    denominator = 1.0 - rule_conf
    if denominator == 0:
        # The rule never fails, so conviction is unbounded.
        return float("inf")
    return (1.0 - cons_supp) / denominator

In [26]:
for rule in rules:
    l = lift(relim_itemsets[rule[0] | rule[1]], relim_itemsets[rule[0]], relim_itemsets[rule[1]])
    if l > 1:
        print l
    # c = conviction(relim_itemsets[rule[1]], rule[3])

NameError: name 'rules' is not defined