# Feature Engineering

## Functions

In [3]:
def haversine(loc1, loc2):
    """Great-circle (haversine) distance in kilometres between two locations.

    Parameters
    ----------
    loc1, loc2 : array-like
        Locations in decimal degrees. Index 0 is read as the latitude and
        index 1 as the longitude.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance in km, computed with an Earth radius of 6371 km.
    """
    loc1_rad = np.deg2rad(np.asarray(loc1))
    loc2_rad = np.deg2rad(np.asarray(loc2))

    R = 6371  # Earth's radius in km

    delta_lat = loc2_rad[0] - loc1_rad[0]
    delta_long = loc2_rad[1] - loc1_rad[1]

    a = np.sin(delta_lat / 2) ** 2 + np.cos(loc1_rad[0]) \
        * np.cos(loc2_rad[0]) * np.sin(delta_long / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

In [4]:
def setup_grid_latlong_train(lat_up, lat_down, long_up, long_down, cell_size):
    """Tile the bounding box with a regular grid of rectangular cells.

    The box height and width are measured with `haversine` (km), multiplied
    by 1000 and divided by `cell_size` to get the number of cells along each
    axis (rounded up), so `cell_size` is presumably in metres -- TODO confirm.

    Returns
    -------
    list
        One `[lat_corners, long_corners]` pair per cell, each a length-4
        array of corner coordinates. Cells are ordered latitude-major,
        starting from (`lat_down`, `long_down`).
    """
    lat_span = lat_up - lat_down
    long_span = long_up - long_down

    # Physical extent of the box along each axis, in km.
    lat_extent_km = haversine([lat_up, long_down], [lat_down, long_down])
    long_extent_km = haversine([lat_up, long_up], [lat_up, long_down])

    n_lat = np.ceil(1000 * lat_extent_km / cell_size)
    n_long = np.ceil(1000 * long_extent_km / cell_size)

    lat_step = lat_span / n_lat
    long_step = long_span / n_long

    cell_polygons = []
    for row in range(int(n_lat)):
        lat_lo = lat_down + row * lat_span / n_lat
        lat_hi = lat_lo + lat_step

        for col in range(int(n_long)):
            long_lo = long_down + col * long_span / n_long
            long_hi = long_lo + long_step

            lat_corners = np.asarray([lat_hi, lat_lo, lat_lo, lat_hi])
            long_corners = np.asarray([long_hi, long_hi, long_lo, long_lo])
            cell_polygons.append([lat_corners, long_corners])

    return cell_polygons

In [5]:
def Rotate2D(points, centre, ang):
    """Rotate 2-D points about `centre` by `ang` degrees.

    Each point is treated as a row vector, shifted so that `centre` is the
    origin, right-multiplied by ``[[cos, sin], [-sin, cos]]`` and shifted
    back.
    """
    theta = np.deg2rad(ang)
    cos_t, sin_t = np.cos(theta), np.sin(theta)

    rotation_matrix = np.array([[cos_t, sin_t],
                                [-sin_t, cos_t]])

    offsets = points - centre
    return np.dot(offsets, rotation_matrix) + centre

In [6]:
def _centred_rectangle(lat_c, long_c, d_lat, d_long):
    """Corner coordinates of a `d_lat` x `d_long` rectangle centred on a point."""
    z2 = lat_c - d_lat / 2
    z1 = z2 + d_lat
    z4 = long_c - d_long / 2
    z3 = z4 + d_long
    return np.asarray([z1, z2, z2, z1]), np.asarray([z3, z3, z4, z4])


def setup_grid_latlong(df, lat_up, lat_down, long_up, long_down, cell_size):
    """Build candidate cells centred on every distinct (lat, long) in `df`.

    The base cell dimensions (DX, DY, in degrees) are derived from the
    bounding box exactly as in `setup_grid_latlong_train`. For each distinct
    location five polygons are appended, in this order:

    1. an axis-aligned DX x DY rectangle centred on the location;
    2-5. a (2 * DX) x (DY / 2) rectangle centred on the location and rotated
       about it by 45, 135, 0 and 90 degrees (rotation is applied directly
       to the degree coordinates).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'lat' and 'long' columns; duplicates are dropped.
    lat_up, lat_down, long_up, long_down : float
        Bounding box in decimal degrees.
    cell_size : float
        Target cell edge length; presumably metres -- TODO confirm.

    Returns
    -------
    list
        `[lat_corners, long_corners]` pairs, each a length-4 array. Empty
        when `df` has no rows.
    """
    coords = df[['lat', 'long']].drop_duplicates().reset_index(drop=True)

    dkx = haversine([lat_up, long_down], [lat_down, long_down])
    dky = haversine([lat_up, long_up], [lat_up, long_down])

    # haversine() returns km; the factor 1000 converts to metres so the ratio
    # with cell_size is a cell count (assuming cell_size is in metres).
    Mx = np.ceil(1000 * dkx / cell_size)
    My = np.ceil(1000 * dky / cell_size)

    DX = (lat_up - lat_down) / Mx
    DY = (long_up - long_down) / My

    # Elongated rectangle used for the rotated cells.
    DX2 = DX * 2
    DY2 = DY / 2

    angles = [45, 135, 0, 90]
    cell_polygons = []

    # Iterate over the actual rows. max(df.shape) would be 2 (the column
    # count) for frames with fewer than two rows and index past the data.
    for lat_c, long_c in zip(coords['lat'].to_numpy(),
                             coords['long'].to_numpy()):

        V1, V2 = _centred_rectangle(lat_c, long_c, DX, DY)
        cell_polygons.append([V1, V2])

        V1, V2 = _centred_rectangle(lat_c, long_c, DX2, DY2)
        points = np.column_stack((V1, V2))
        centre = [lat_c, long_c]

        for angle in angles:
            poly_rot = Rotate2D(points, centre, angle)
            cell_polygons.append([poly_rot[:, 0], poly_rot[:, 1]])

    return cell_polygons

In [7]:
from shapely.geometry import Point, Polygon
def cell_crime_counts(df, cell_locations, start_date, end_date):
    """Count events per cell within ``[start_date, end_date)``.

    NOTE: a later cell defines another function with this same name; once
    that cell has run, it replaces this definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date', 'lat' and 'long' columns.
    cell_locations : sequence
        One `[lat_corners, long_corners]` pair per cell.
    start_date, end_date
        Window bounds compared against `df['date']`; the start is inclusive
        and the end exclusive.

    Returns
    -------
    numpy.ndarray
        Column vector with one count per cell, tested with `Point.within`.
    """
    polygons = []
    for loc in cell_locations:
        polygons.append(Polygon(np.column_stack(loc)))

    in_window = (df['date'] >= start_date) & (df['date'] < end_date)
    events = df[in_window]
    coords = np.column_stack((events['lat'], events['long']))

    counts = []
    for cell in polygons:
        counts.append(sum([Point(xy).within(cell) for xy in coords]))

    return np.array(counts).reshape(-1, 1)

In [8]:
from shapely.geometry import Point, Polygon
def cell_crime_counts(df, cell_locations, start_date, end_date):
    """Count events per cell within ``[start_date, end_date)``.

    This definition replaces the earlier `cell_crime_counts` from the
    previous cell once it has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date', 'lat' and 'long' columns.
    cell_locations : sequence
        One `[lat_corners, long_corners]` pair per cell.
    start_date, end_date
        Window bounds compared against `df['date']`; the start is inclusive
        and the end exclusive.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(cell_locations), 1)`` holding, for each
        cell, the number of events for which `Point.within(cell)` is true.
    """
    N = len(cell_locations)
    cell_counts = np.zeros((N, 1))

    in_window = df[(df['date'] >= start_date) & (df['date'] < end_date)]

    # Build every Point once up front instead of once per cell: the set of
    # events does not depend on the polygon being tested.
    points = [Point(k) for k in zip(in_window['lat'], in_window['long'])]

    for i in range(N):
        corners = np.asarray(list(zip(cell_locations[i][0],
                                      cell_locations[i][1])))
        poly = Polygon(corners)
        cell_counts[i] = sum([pt.within(poly) for pt in points])

    return cell_counts

In [9]:
from datetime import timedelta
def daterange(start_date, end_date, interval):
    """Yield dates from `start_date` (inclusive) up to `end_date` (exclusive).

    Parameters
    ----------
    start_date, end_date
        Date-like objects that support ``<`` and adding a `timedelta`.
    interval : int, float or datetime.timedelta
        Step between consecutive dates. A number is interpreted as days.

    Raises
    ------
    ValueError
        If `interval` is zero or negative (the loop would never terminate).
        Because this is a generator, the error surfaces on first iteration.
    """
    if not isinstance(interval, timedelta):
        interval = timedelta(interval)
    if interval <= timedelta(0):
        raise ValueError('interval must be positive, got {}'.format(interval))

    current_date = start_date
    while current_date < end_date:
        yield current_date
        current_date += interval

## Feature engineering

In [10]:
# Bounding box spanned by every location recorded in `data`.
lat_min, lat_max = data['lat'].min(), data['lat'].max()
long_min, long_max = data['long'].min(), data['long'].max()

# Target cell edge length handed to the grid builders
# (presumably metres -- TODO confirm and record where 385.36 comes from).
cell_size = 385.36

# Start-date ranges iterated when building the train / test samples.
train_start_date = pd.Timestamp(year=2021, month=6, day=30)
train_end_date = pd.Timestamp(year=2022, month=6, day=30)
test_start_date = pd.Timestamp(year=2022, month=7, day=1)
test_end_date = pd.Timestamp(year=2022, month=12, day=31)

# Training cells: a regular grid over the whole bounding box.
cell_locations = setup_grid_latlong_train(
    lat_max, lat_min, long_max, long_min, cell_size)

# Test cells: cells centred on locations seen before the test period starts.
events_before_test = data.loc[data['date'] < test_start_date]
cell_locations_test = setup_grid_latlong(
    events_before_test, lat_max, lat_min, long_max, long_min, cell_size)

In [None]:
# One sample per start date, stepping 30 days through the training range:
#   Y_train[k] -- per-cell counts in the 90 days from the start date
#   X_train[k] -- per-cell counts in the 7/30/90/365 days before it
X_train = []
Y_train = []

# Look-back horizons are loop-invariant, so build them once. The name also
# avoids shadowing the standard-library `time` module.
lookback_windows = [timedelta(7), timedelta(30), timedelta(90),
                    timedelta(365)]
target_horizon = timedelta(90)

# NOTE(review): start dates run up to (but exclude) train_end_date, so the
# 90-day target window of the last few samples reaches past train_end_date.
# Confirm that this overlap with the test period is intended.
for start_date in daterange(train_start_date, train_end_date, 30):
    Y_train.append(cell_crime_counts(data, cell_locations,
                                     start_date, start_date + target_horizon))

    print('Y_train {}'.format(start_date))

    X_train.append([
        cell_crime_counts(data, cell_locations, start_date - window, start_date)
        for window in lookback_windows
    ])

    print('X_train {}'.format(start_date))

In [None]:
# One sample per start date, stepping 15 days through the test range:
#   Y_test[k] -- per-cell counts in the 90 days from the start date
#   X_test[k] -- per-cell counts in the 7/30/90/365 days before it
X_test = []
Y_test = []

# Look-back horizons are loop-invariant, so build them once. The name also
# avoids shadowing the standard-library `time` module.
lookback_windows = [timedelta(7), timedelta(30), timedelta(90),
                    timedelta(365)]
target_horizon = timedelta(90)

# NOTE(review): start dates run up to (but exclude) test_end_date, so the
# 90-day target window of the last samples reaches past test_end_date.
# If `data` has no events that late, those targets are undercounted -- confirm.
for start_date in daterange(test_start_date, test_end_date, 15):
    Y_test.append(cell_crime_counts(data, cell_locations_test,
                                    start_date, start_date + target_horizon))

    print('Y_test {}'.format(start_date))

    X_test.append([
        cell_crime_counts(data, cell_locations_test,
                          start_date - window, start_date)
        for window in lookback_windows
    ])

    print('X_test {}'.format(start_date))

In [83]:
# Persist the feature / target lists to the working directory, one pickle
# file per object. Only unpickle these files from a trusted location.
outputs = (
    ('Y_train', Y_train),
    ('X_train', X_train),
    ('Y_test', Y_test),
    ('X_test', X_test),
)

for filename, payload in outputs:
    with open(filename, 'wb') as fp:
        pickle.dump(payload, fp)