# Finding Anomolies

## Setup

### Import Data

In [1]:
import os

import fiona
import geopandas as gpd
import gpxpy
import numpy as np
import pandas as pd
from multiprocess.pool import Pool
from tqdm.notebook import tqdm, trange

fiona.drvsupport.supported_drivers['kml'] = 'rw'
fiona.drvsupport.supported_drivers['KML'] = 'rw'

storage = "/Volumes/easystore/Drones/"

flight_details = pd.concat(
    [
        chunk
        for chunk in tqdm(
            pd.read_csv(
                f"{storage}/gpx-with-census-data.csv", chunksize=100000, dtype=str
            ),
            desc="Loading data",
        )
    ]
)

Loading data: 0it [00:00, ?it/s]

In [2]:
geocodio_flights = pd.concat(
    [
        chunk
        for chunk in tqdm(
            pd.read_csv(
                f"{storage}/geocodio/all-flights-manifest_geocodio.csv",
                chunksize=100000,
                dtype=str,
            ),
            desc="Loading data",
        )
    ]
)

Loading data: 0it [00:00, ?it/s]

In [3]:
flight_details["geoid"] = flight_details["geoid"].astype(str)
flight_details["len"] = flight_details["geoid"].apply(lambda x: len(x))
flight_details.loc[flight_details["len"] == 14, "geoid"] = "0" + flight_details["geoid"]

### Launch Sites
- BayView Hospital = 60730131022000
- CVPD = 60730123021013
- SharpChula Medical Center = 60730133273000
- SWestern College = 60730134151000


## Functions

In [26]:
def compile_anomolies(anomolies):

    anomolies = anomolies[anomolies['id']!=False].copy()
    anomolies = geocodio_flights[geocodio_flights['id'].isin(anomolies["id"])].copy()
    anomolies['fn'] = anomolies['id'].apply(lambda x: f"/Volumes/easystore/Drones/flights/kml/{x}.kml")

    compiled_flights = []
    for _, row in anomolies.iterrows():
        d = gpd.read_file(row['fn'], driver='KML')


        d['id'] = row['id']
        d['incident_type'] = row['type']
        d['incident_id'] = row['incident_id']
        d['address_map'] = row['address_map']
        d['date'] = row['date']
        d['time'] = row['time_s']

        compiled_flights.append(d)
    return pd.concat(compiled_flights,ignore_index=True)

In [111]:
def get_unique_seconds_for_block(flight):

    times = pd.to_datetime(flight["sequence"])
    grouped_by_block_and_second = flight.groupby(
    ["id","geoid","block_group","tract", times.dt.hour, times.dt.minute, times.dt.second]).count()
    grouped_by_block_and_second.index.names = ["id","geoid","block_group","tract", "hour", "minute", "second"]
    grouped_by_block_and_second = grouped_by_block_and_second.reset_index()[
    ["id","geoid","block_group","tract", "hour", "minute", "second", "type"]
    ]
    grouped_by_block_and_second.columns = ["id","geoid","block_group","tract", "hour", "minute", "second", "count"]
    
    unique_seconds_in_block = grouped_by_block_and_second.groupby(["geoid","block_group","tract"]).count()
    unique_seconds_in_block = unique_seconds_in_block.reset_index()[["geoid","block_group","tract", "hour"]]
    unique_seconds_in_block.columns = ["GEOID20","block_group","tract","seconds"]
    return unique_seconds_in_block

## Searching

### Lingering over more than two NC blocks

In [59]:
def search_one(index):
#   search for flights that spent more than 5 minutes in two blocks that are not consecutive.
    LINGER = 300

    flight_id = flight_ids.iloc[index]
    flight = flight_details[flight_details['id']==flight_id]

    anomoly = {"id":False,"type":False}

    starting_block = flight.iloc[0]['geoid']
    
    unique_seconds_in_block = get_unique_seconds_for_block(flight)
    unique_seconds_in_block =  unique_seconds_in_block[unique_seconds_in_block['GEOID20']!=flight.iloc[0]['geoid']].copy()

    blocks_w_more_45 = unique_seconds_in_block[(unique_seconds_in_block['seconds']>LINGER)].copy()

    if blocks_w_more_45.shape[0] > 1:
        blocks_w_more_45['GEOID20'] = blocks_w_more_45['GEOID20'].astype(int)
        blocks_w_more_45 = blocks_w_more_45.sort_values("GEOID20")
        blocks_w_more_45['diff'] = blocks_w_more_45['GEOID20'].diff().abs().copy()
        result = blocks_w_more_45[(blocks_w_more_45['diff'] > 1)]
        if result.shape[0] > 0:
            anomoly = {"id":flight_id,"type":flight['type'].drop_duplicates().values[0]}
    return anomoly

#### Run Lingering NC Search

In [35]:
flight_ids = flight_details['id'].sample(50).drop_duplicates()
with Pool(2) as pool:

    lingering_nc = list(
        tqdm(pool.imap(search_one, range(0, flight_ids.shape[0])), total=flight_ids.shape[0])
    )
    lingering_nc = pd.DataFrame(lingering_nc)

  0%|          | 0/50 [00:00<?, ?it/s]

In [27]:
lingering_anomolies = compile_anomolies(lingering_nc)
lingering_anomolies = gpd.GeoDataFrame(lingering_anomolies)
lingering_anomolies['incident_type'] = lingering_anomolies['incident_type'].astype(str)

In [28]:
lingering_anomolies[['geometry','incident_type','id']].to_file("../data/lingering-nc.geojson", driver='GeoJSON')

In [29]:
lingering_anomolies.head()

Unnamed: 0,Name,Description,geometry,id,incident_type,incident_id,address_map,date,time
0,Home,,POINT Z (-117.08269 32.63997 22.13559),651d1096e03c7ba2e80d687d00e72ea0,DOMESTIC VIOLENCE,L062565,706 F,7-28-21,9:48am
1,Airdata.com,,"LINESTRING Z (-117.08269 32.63997 22.13559, -1...",651d1096e03c7ba2e80d687d00e72ea0,DOMESTIC VIOLENCE,L062565,706 F,7-28-21,9:48am
2,Home,,POINT Z (-117.07166 32.61747 34.84546),2d6bcf3670ab962c87609165668ddf7a,Fight,CVL40019,1242 Broadway,5-16-22,5:30pm
3,Airdata.com,,"LINESTRING Z (-117.07166 32.61747 34.84546, -1...",2d6bcf3670ab962c87609165668ddf7a,Fight,CVL40019,1242 Broadway,5-16-22,5:30pm


### Lingering over Block that is Not Destination
lingering for more than 5 minutes on a block thats in a diff tract than the stated destination AND that  tract is not the neighboring tract.

In [171]:
LINGER = 240
def search_two(index):
    flight_id = flight_ids.iloc[index]
    flight = flight_details[flight_details['id']==flight_id]

    manifest_details = geocodio_flights[geocodio_flights['id']==flight_id]
    destination_block = manifest_details.iloc[0][['Full FIPS (block)','Census Tract Code', 'Census Block Group']]
    starting_block = flight.iloc[0]['geoid']
    anomoly = {"id":False,"type":False}

    unique_seconds_in_block = get_unique_seconds_for_block(flight)
    unique_seconds_in_block =  unique_seconds_in_block[unique_seconds_in_block['GEOID20']!=starting_block].copy()
    unique_seconds_in_block =  unique_seconds_in_block[unique_seconds_in_block['GEOID20']!=destination_block['Full FIPS (block)']].copy()
    nd_blocks_g_5 = unique_seconds_in_block[unique_seconds_in_block['seconds']>LINGER]
    
    
    if nd_blocks_g_5.shape[0]>0:
        nd_blocks_g_5_diff_tract = nd_blocks_g_5[
            # (nd_blocks_g_5['block_group'].astype(int)!=destination_block['Census Block Group']) &
            (nd_blocks_g_5['tract'].astype(int)!=destination_block['Census Tract Code'])
        ]
        nd_blocks_g_5_diff_tract['diff'] = nd_blocks_g_5_diff_tract['tract'].astype(int).apply(lambda x: abs(x - int(destination_block['Census Tract Code'])))
        nd_blocks_g_5_diff_tract[nd_blocks_g_5_diff_tract['diff']>1]
        if nd_blocks_g_5_diff_tract.shape[0]>0:
            anomoly = {"id":flight_id,"type":flight['type'].drop_duplicates().values[0]}
    
    return anomoly




In [164]:
abs(100-767)

667

In [175]:
flight_ids = flight_details['id'].sample(1000).drop_duplicates()
with Pool(10) as pool:

    lingering_nd = list(
        tqdm(pool.imap(search_one, range(0, flight_ids.shape[0])), total=flight_ids.shape[0])
    )
    lingering_nd = pd.DataFrame(lingering_nd)

  0%|          | 0/950 [00:00<?, ?it/s]

In [178]:
lingering_anomolies = compile_anomolies(lingering_nd)
lingering_anomolies = gpd.GeoDataFrame(lingering_anomolies)
lingering_anomolies['incident_type'] = lingering_anomolies['incident_type'].astype(str)

In [179]:
lingering_anomolies[['geometry','incident_type','id']].to_file("../data/lingering-nd.geojson", driver='GeoJSON')

In [180]:
lingering_anomolies.shape[0]

66