## SEQUENCE MINING

In [1]:
import pysal
import numpy
import scipy
import pandas
from pymining import seqmining
from sklearn import preprocessing

%matplotlib inline
# Let pandas display up to 100 rows and 100 columns before truncating output.
pandas.set_option("display.max_rows", 100)
pandas.set_option("display.max_columns", 100)

In [2]:
# Read the business attribute DBF table into a DataFrame, using the DBF
# header as column names, and report its shape.
db_business = pysal.open('../dataset/Data/CLT_Business_Attri.dbf', 'r')
df_business = pandas.DataFrame(data=db_business[:], columns=db_business.header)
print("Business {}".format(df_business.shape))

# Read the crime incident DBF table the same way; later cells refer to it as `df`.
db_crimes = pysal.open('../dataset/Data/CrimeIncident_CMPD_2010.dbf', 'r')
df = pandas.DataFrame(data=db_crimes[:], columns=db_crimes.header)
print("Crimes {}".format(df.shape))

Business (24375, 80)
Crimes (67595, 53)


#### Adding blocks to crimes

In [3]:
# Street name -> block id lookup built from the business table.
block_by_street = df_business.set_index('ST_NAME')["Block_id"]
street2block = block_by_street.to_dict()


def _lookup_block(street):
    """Return the block id recorded for `street`, or "not found" if there is none."""
    return street2block.get(street, "not found")


# Tag every crime with the block id of its street.
df["BLOCKID10"] = df["Street_Nam"].map(_lookup_block)

# Keep only the crimes whose street could be matched to a block.
has_block = df["BLOCKID10"] != "not found"
df = df[has_block]
df.shape

(50455, 54)

In [4]:
# Fit a label encoder on the distinct values of the "NIBRSclass" column.
nibrs_classes = list(set(df["NIBRSclass"]))
encoder = preprocessing.LabelEncoder()
encoder.fit(nibrs_classes)

LabelEncoder()

In [5]:
# Group crimes by date.
# "Report_Dat" is split into a date part (everything except the last 5
# characters) and a trailing 4-character field used to order crimes within a
# day (presumably the time of day -- TODO confirm against the dataset).
date_dict = dict()
for report, typology, block in zip(df["Report_Dat"], df["NIBRSclass"], df["BLOCKID10"]):
    date = report[:-5]
    number = report[-4:]
    crime = (number, typology, block)
    # Append in place: rebuilding the list with `+` for every row is quadratic.
    date_dict.setdefault(date, []).append(crime)

# Sort each day's crimes on the trailing field (string comparison, stable sort).
date_dict = {day: sorted(crimes, key=lambda crime: crime[0]) for day, crimes in date_dict.items()}

# Keep only crimes in Block Group 3, i.e. whose block id has "3" as its 4th
# character from the end. Days are visited in sorted key order, which is
# lexicographic on the date string.
# NOTE(review): lexicographic order is only chronological if the date format
# sorts that way (e.g. a single year of MM/DD/YYYY) -- confirm.
sequences_3 = [
    [typology for number, typology, block in date_dict[day] if block[-4] == "3"]
    for day in sorted(date_dict)
]

In [6]:
# Write the daily sequences to disk: one line per day, crime types separated
# by commas.
with open("../dataset/seqmining.txt", "w+") as output:
    for day in sequences_3:
        if not day:
            # A day without any matching crime has nothing to write; indexing
            # its last element would raise IndexError.
            continue
        # join() writes every crime exactly once. Writing day[:-2] and then
        # day[-1] would leave out the second-to-last crime of each day.
        output.write(",".join(day) + "\n")

### Daily

In [7]:
%%time
# Enumerate the frequent sequences among the daily sequences; the second
# argument to freq_seq_enum is the minimum support.
daily_min_support = 100
freq_seqs = seqmining.freq_seq_enum(sequences_3, daily_min_support)

# Number of input sequences, then number of frequent sequences found.
print(len(sequences_3))
print(len(freq_seqs))
sorted(freq_seqs)

365
484
CPU times: user 408 ms, sys: 0 ns, total: 408 ms
Wall time: 395 ms


In [8]:
# Show five of the mined (sequence, support) entries.
freq_seqs_preview = list(freq_seqs)[:5]
freq_seqs_preview

[((u'Theft From Motor Vehicle', u'All Other Offenses', u'Simple Assault'),
  139),
 ((u'Burglary/B&E', u'Simple Assault'), 232),
 ((u'Motor Vehicle Theft', u'Theft From Motor Vehicle', u'Burglary/B&E'), 102),
 ((u'Damage/Vandalism Of Property',
   u'All Other Thefts',
   u'All Other Thefts',
   u'Simple Assault'),
  101),
 ((u'Theft From Motor Vehicle',
   u'All Other Thefts',
   u'Drug/Narcotic Violations'),
  145)]

### Monthly

In [9]:
def get_month(day):
    """Build a month key from a date string.

    The key is the first two characters of `day` followed by its last five
    characters (e.g. "01/15/2010" -> "01/2010").
    """
    leading = day[:2]
    trailing = day[-5:]
    return leading + trailing
# Month key of every day present in date_dict (one entry per day, so keys repeat).
months = list(map(get_month, date_dict))

In [10]:
# Merge the per-day crime lists into per-month lists. Days are visited in
# sorted key order, so each month's list is the concatenation of its days in
# that order.
month_dict = {}
for day in sorted(date_dict):
    month = get_month(day)
    month_dict.setdefault(month, []).extend(date_dict[day])

In [11]:
# One sequence of crime types per month (months in sorted key order), keeping
# only crimes whose block id has "3" as its 4th character from the end.
seqs_monthly_3 = []
for month in sorted(month_dict):
    monthly_types = [crime[1] for crime in month_dict[month] if crime[2][-4] == "3"]
    seqs_monthly_3.append(monthly_types)

In [12]:
# Write the monthly sequences to disk: one line per month, crime types
# separated by commas.
with open("../dataset/seq_monthly-mining.txt", "w+") as output:
    for month in seqs_monthly_3:
        if not month:
            # A month without any matching crime has nothing to write;
            # indexing its last element would raise IndexError.
            continue
        # join() writes every crime exactly once. Writing month[:-2] and then
        # month[-1] would leave out the second-to-last crime of each month.
        output.write(",".join(month) + "\n")

In [38]:
%%time
# Enumerate the frequent sequences among the monthly sequences; the second
# argument to freq_seq_enum is the minimum support.
monthly_min_support = 10
freq_monthly_seqs = seqmining.freq_seq_enum(seqs_monthly_3, monthly_min_support)

# Number of input sequences, then number of frequent sequences found.
print(len(seqs_monthly_3))
print(len(freq_monthly_seqs))
sorted(freq_monthly_seqs)

In [None]:
# Show five of the mined monthly (sequence, support) entries.
freq_monthly_seqs_preview = list(freq_monthly_seqs)[:5]
freq_monthly_seqs_preview

### CloFAST

In [16]:
# List the contents of the directory holding the external miner's jar files.
!ls ../resources/aprioriallcod

aprioriallcode-sources.jar  aprioriallcod.jar


In [19]:
!java -jar ../resources/aprioriallcod/aprioriallcod.jar\
    /home/chris/Workspace/app/crime-analysis/dataset/seqmining.txt\
    10 ../dataset/results-cloFAST-seqmining.txt FREQUENT

Start loading the dataset
End loading the dataset
Start sequence extraction
End sequence extraction
