# Import Modules and Crime dataset from API

In [1]:
import DAL
import numpy as np
DAL.cleancache()
crime = DAL.create("crime")
crime_list = crime.get_crime_list()
crime_counts = crime.get_crime_counts()
region_lists = crime.get_region_list()

# Definite the kernel function for different kernel estimator

In [2]:

def kernel_space(x,h,n):
    import numpy as np
    import math
    def I(x):
        import numpy as np
        result = np.zeros(len(x))
        result[np.apply_along_axis(np.linalg.norm, 1, x) <= h] = 1
        return result
    if n == 0:
        temp = 0.5*I(x)
        result = temp/np.sum(temp) 
        index = result != 0
        return (result, index)
    
    elif n == 1:
        temp = 1/np.sqrt(2*math.pi)*np.exp(-0.5*np.apply_along_axis(np.linalg.norm, 1, x/h)**2)
        result = temp/np.sum(temp)
        index = result >= 1e-5
        return (result, index)
    
    elif n == 2:
        temp = 0.75*(1-np.apply_along_axis(np.linalg.norm, 1, x/h)**2)*I(x)
        result = temp/np.sum(temp)
        index = result != 0        
        return (result, index)
    

# For this question, I choose Epanechnikov kernel as the kernel function I would use in the following steps and I would test for the bandwith of space in the range [0,001, 0,005, 0.01, 0.05, 0.1] and for the bandwith of time I would try [4,6,8]. In addtion I would choose the last 8 weeks as the held-out data to check my appropriaty of the bandwithes I choose. 

# Build up the time and space weight function 

In [4]:
def time_weight(n, h):
    import numpy as np
    time_list  = np.arange(698)
    result = np.zeros(2*n)
    sample_index = range(110,120)
    h = float(h)
    for i in sample_index:
        timei = time_list[i]
        time_temp = time_list[i-n:i+n] 
        x = time_temp - timei
        temp_coef = 0.75*(1-((x)/h)**2)
        result += temp_coef
    result[:n] = 0    
    weight_time = (result/10)/np.sum(result/10)
    return weight_time



In [5]:
def space_weight(n, h):
    import DAL
    import numpy as np
    crime = DAL.create("crime")
    region_lists = crime.get_region_list()
    center_list = np.array([np.array(region_lists[i]) for i in range(2985)])
    h = float(h)
    result = np.zeros(2*n)
    sample_index = range(1000,1010)
    for i in sample_index:
        centeri = center_list[i]
        center_temp = center_list[i-n:i+n] 
        x = center_temp - centeri
        temp_coef = 0.75*(1-np.apply_along_axis(np.linalg.norm, 1, x/h)**2)
        result += temp_coef
    weight_space = (result/10)/np.sum(result/10)
    weight_space[weight_space <= 0] = 0 
    weight_space = (weight_space)/np.sum(weight_space)
    return weight_space



# Build up the function to compute the log-likelihood function

In [6]:
def log_like(rambda,data):
    import math
    l = 0
    for k in range(10):
        for i in range(2985):
            test_data = np.sum(data[(i,k)][690:])
            try:
                l += test_data*math.log(rambda[k][i])-rambda[k][i]
            except ValueError:
                pass 
    return l         

# Experiment with diffent set of gamma and sigma and choose the combination with the maximum value of log-likelihood function

In [7]:

gamma = [0.001, 0.005, 0.01, 0.05, 0.1]
sigma = [4,6,8]

for hs in gamma:
    for ht in sigma:
        weight_time = time_weight(3,hs)
        weight_sapce = space_weight(5,ht)
        rambda = []
        for k in range(10):
            temp_y = np.zeros(2985)
            for i in range(2985):
                temp_data = crime_counts[(i,k)][:690]
                temp_re = np.convolve(temp_data, weight_time[::-1], "same")
                temp_re[:3] = np.mean(temp_re[3:(2985-1-3)])
                temp_re[(2985-3-1):] = np.mean(temp_re[3:(2985-1-3)])
                temp_y[i] = np.sum(temp_re)
            temp_rambda = np.convolve(temp_y, weight_sapce[::-1], "same") 
            temp_rambda[:5] = np.mean(temp_rambda[5:(2985-1-5)])
            temp_rambda[(2985-1-5):] = np.mean(temp_rambda[5:(2985-1-5)])    
            rambda.append(temp_rambda)
        l = log_like(rambda,crime_counts)    
        print "when the bandwith of time is " + str(ht) + " and the bandwith of space is " + str(hs) + ", then the log-likelihood value is " + str(l) 

when the bandwith of time is 4 and the bandwith of space is 0.001, then the log-likelihood value is -4790498.97609
when the bandwith of time is 6 and the bandwith of space is 0.001, then the log-likelihood value is -4790500.72291
when the bandwith of time is 8 and the bandwith of space is 0.001, then the log-likelihood value is -4790501.33388
when the bandwith of time is 4 and the bandwith of space is 0.005, then the log-likelihood value is -4790498.88774
when the bandwith of time is 6 and the bandwith of space is 0.005, then the log-likelihood value is -4790500.63456
when the bandwith of time is 8 and the bandwith of space is 0.005, then the log-likelihood value is -4790501.24554
when the bandwith of time is 4 and the bandwith of space is 0.01, then the log-likelihood value is -4790498.61165
when the bandwith of time is 6 and the bandwith of space is 0.01, then the log-likelihood value is -4790500.35847
when the bandwith of time is 8 and the bandwith of space is 0.01, then the log-lik

# From the outcomes above, it is easy to choose the combination of bandwith is time for 4 and space for 0.1. Then I would generate the file of the estimate rate for different type of crimes 

In [8]:
ht = 0.1
hs = 4.0 
weight_time = time_weight(3,hs)
weight_sapce = space_weight(5,ht)
for k in range(10):
    temp_y = np.zeros(2985)
    for i in range(2985):
        temp_data = crime_counts[(i,k)][:690]
        temp_re = np.convolve(temp_data, weight_time[::-1], "same")
        temp_re[:3] = np.mean(temp_re[3:(2985-1-3)])
        temp_re[(2985-3-1):] = np.mean(temp_re[3:(2985-1-3)])
        temp_y[i] = np.sum(temp_re)
        temp_rambda = np.convolve(temp_y, weight_sapce[::-1], "same") 
        temp_rambda[:5] = np.mean(temp_rambda[5:(2985-1-5)])
        temp_rambda[(2985-1-5):] = np.mean(temp_rambda[5:(2985-1-5)])    
        rambda.append(temp_rambda)

# Save our result into disk

In [13]:
my_crime_list = ['assult', 'battery', 'burglary', 'criminal-damage', 'deceptive-practice', 'motor-vehicle-theft', 'narcotics', 'other-offense', 'robbery', 'theft']

for k in range(10):
    crime_type = my_crime_list[k]
    result = rambda[k]
    with open("/home/edyue/%s.txt"%crime_type, "w") as fp:
        for i in range(2985):
            fp.write(str(i) + ", " + str(result[i]/10) + "\n")
            
            