In [None]:
import matplotlib.pyplot as plt # to visualize data
import pandas as pd # to read/write plain tables
import numpy as np

# to display a few webpages within the notebook
from IPython.display import IFrame
%matplotlib inline

import geopandas as gpd # to read/write spatial data
import descartes
from toolz.functoolz import pipe

In [None]:
from src.data import path, load 

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Look up where "VO.pkl" lives via the project's path helper and load it.
VO = load.local_data(path.get("VO.pkl"))

In [None]:
# Remember the raw row count so the cleaning step can report how many rows it dropped.
VO_raw_rows = len(VO)
print("Number of data points = ", VO_raw_rows)

VO.head()

In [None]:
# Locate the MnR extract through the project's path helper, then parse it as CSV.
MnR_path = path.get("MnR from Rebecca and Located 27-03-2020.csv")
MnR = pd.read_csv(filepath_or_buffer=MnR_path)

In [None]:
# Remember the raw row count so the cleaning step can report how many rows it dropped.
MnR_raw_rows = len(MnR)
print("Number of data points = ", MnR_raw_rows)

MnR.head()

In [None]:
# Load the Dublin electoral-district polygons used as the base map.
map_of_Dublin = load.local_data(
    path.get("map_of_Dublin_2016_Electoral_Districts_UG.pkl")
)
# NOTE(review): this labels the layer as epsg:4326 without reprojecting it —
# confirm the stored coordinates really are lon/lat.
map_of_Dublin.crs = 'epsg:4326'

map_of_Dublin.plot()

# Create Geo dfs

In [None]:
def create_geodf_from_GPS(df, x, y, crs):
    """Build a point GeoDataFrame from a table and its coordinate values.

    Despite the name, the coordinates need not be GPS lon/lat: any pair of
    x/y values is accepted as long as ``crs`` names the system they are in
    (this notebook calls it with both 'epsg:4326' and 'epsg:2157').

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows become the features of the result.
    x, y : array-like
        Horizontal and vertical coordinates, one value per row of ``df``.
    crs : str
        Coordinate reference system of ``x``/``y``, e.g. 'epsg:4326'.

    Returns
    -------
    geopandas.GeoDataFrame
        The columns of ``df`` plus a point geometry column tagged with ``crs``.
    """
    locations = gpd.points_from_xy(x, y)

    # Declare the CRS while constructing the frame rather than assigning
    # `.crs` afterwards, so the frame never exists without its CRS.
    return gpd.GeoDataFrame(df, geometry=locations, crs=crs)

In [None]:
# Longitude goes on the x axis and latitude on the y axis; tagged as epsg:4326.
MnR_geo = create_geodf_from_GPS(
    df=MnR,
    x=MnR["Longitude"],
    y=MnR["Latitude"],
    crs='epsg:4326',
)
MnR_geo.plot()

In [None]:
# VO coordinates come from the " X ITM" / " Y ITM" columns (note the leading
# space in the column names) and are tagged as epsg:2157.
VO_geo = create_geodf_from_GPS(
    df=VO,
    x=VO[" X ITM"],
    y=VO[" Y ITM"],
    crs='epsg:2157',
)

VO_geo.plot()

# Clean data

Filter out all data points that fall outside Dublin (these are treated as location errors).

__MnR:__

In [None]:
# Spatially join MnR points to the Dublin districts with sjoin's default
# settings; the result is what the rest of the notebook treats as "clean" MnR.
MnR_geo_clean = gpd.sjoin(left_df=MnR_geo, right_df=map_of_Dublin)

print("Number of data points = ", len(MnR_geo_clean))
print("Number of rows lost in cleaning = ", MnR_raw_rows - len(MnR_geo_clean))

MnR_geo_clean.plot()

__VO:__

VO has outliers at roughly (47, -16) in latitude/longitude, whereas every valid point should be at roughly (53, -6), so remove them.

In [None]:
# Peek at the first rows of VO_geo before filtering on its coordinate columns.
VO_geo.head()

In [None]:
# Keep only rows whose " X ITM" value exceeds 100000.
# NOTE(review): the threshold is presumably an ITM easting in metres chosen to
# cut off the far-away outliers — confirm the value.
mask = VO_geo[" X ITM"].gt(100000)
VO_geo_clean = VO_geo.loc[mask]

VO_geo_clean.plot()

In [None]:
# VO_geo_clean = gpd.sjoin(map_of_Dublin, VO_geo, how="inner", op="intersects")

# Report how many VO rows survived the coordinate filter.
print("Number of data points = ", len(VO_geo_clean))
print("Number of rows lost in cleaning = ", VO_raw_rows - len(VO_geo_clean))

VO_geo_clean.plot()

# Plot data

Plot the cleaned VO points (red) and MnR points (blue) over the Dublin electoral districts.

In [None]:
fig, ax = plt.subplots(figsize=(20, 16), sharex=True, sharey=True)

# District outlines go down first so both point layers are drawn on top.
map_of_Dublin.plot(ax=ax, edgecolor='black', color='white')
# VO is stored in epsg:2157, so reproject it to the map's epsg:4326 before plotting.
VO_geo_clean.to_crs("epsg:4326").plot(ax=ax, color='red', marker='o')
MnR_geo_clean.plot(ax=ax, color='blue', marker='o')

# fig.savefig("VO_&_MnR_on_Dublin.png")

# Create a buffer around each MnR building to try to capture the nearest VO building

In [None]:
# NOTE(review): nothing is drawn on this figure in this cell, and a later cell
# creates its own fig/ax — this looks like leftover scratch; confirm before removing.
fig, ax = plt.subplots(sharex=True, sharey=True, figsize = (20,16))


In [None]:
# NOTE(review): `??` is IPython source introspection left over from development;
# it contributes nothing to the analysis and can probably be removed.
gpd.GeoDataFrame??

In [None]:
fig, ax = plt.subplots(figsize=(20, 16), sharex=True, sharey=True)

map_of_Dublin.plot(ax=ax, edgecolor='black', color='white')

# The buffer distance is in the layer's own CRS units. It is larger than the
# 0.00015 used for the join, presumably so the discs are visible on the map.
searchbuffer = MnR_geo_clean.buffer(0.0005)

# Draw the buffers first, then overlay the MnR (blue) and VO (red) points.
searchbuffer.plot(ax=ax, color='purple')
MnR_geo_clean.plot(ax=ax, markersize=0.1, color='blue')
VO_geo_clean.to_crs("epsg:4326").plot(ax=ax, markersize=0.1, color='red')

fig.savefig('MnR_buffered.png')

In [None]:
# Buffer in a projected CRS so the search radius is a real ground distance.
# Buffering the epsg:4326 points directly measures the radius in degrees, and
# near Dublin a degree of longitude is much shorter than a degree of latitude:
# the old 0.00015 degree buffer reached about 17 m north-south but only about
# 10 m east-west. epsg:2157 (the CRS the VO points are stored in) is in metres.
SEARCH_RADIUS_M = 15  # between the two extents of the old degree buffer; tune as needed

searchbuffer = (
    MnR_geo_clean
    .to_crs("epsg:2157")
    .buffer(SEARCH_RADIUS_M)
    .to_crs("epsg:4326")  # back to lon/lat so the later join and plots are unchanged
)

# Swap each MnR point for its search disc, keeping every attribute column.
MnR_buffered = gpd.GeoDataFrame(
    MnR_geo_clean.drop(columns="geometry"),
    geometry=searchbuffer.to_list(),
    crs="epsg:4326",
)

# fig, ax = plt.subplots(sharex=True, sharey=True, figsize = (20,16))
# map_of_Dublin.plot(ax=ax, color='white', edgecolor='black')
# MnR_buffered.plot(ax=ax)
# VO_geo_clean.to_crs("epsg:4326").plot(ax=ax,color='red', markersize=0.1)

In [None]:
# CLEAN COLUMN NAMES
# Rebind instead of dropping in place, and ignore already-missing columns, so
# this cell can be re-run without raising KeyError.
# "index_right" is presumably left over from the earlier spatial join and has
# to go before MnR_buffered is joined again — confirm.
MnR_buffered = MnR_buffered.drop(
    columns=["Unnamed: 0", "Unnamed: 0.1", "index_right"],
    errors="ignore",
)

In [None]:
# Match every MnR search disc to the VO points that fall inside it.
# 'intersects' is sjoin's default test, so the `op=` keyword (deprecated and
# later removed in geopandas) is omitted; this runs on old and new versions.
MnR_and_VO = gpd.sjoin(
    left_df=MnR_buffered,
    right_df=VO_geo_clean.to_crs("epsg:4326"),  # VO is stored in epsg:2157
    how='inner',
)

print("MnR to match: ", str(MnR_buffered.shape[0]))
# One disc can contain several VO points, so the join's row count over-states
# how many MnR buildings found a match. The join keeps the left frame's index,
# so distinct index values give the number of matched MnR rows.
print("MnR matched: ", str(MnR_and_VO.index.nunique()))
print("MnR-VO pairs: ", str(MnR_and_VO.shape[0]))

MnR_and_VO.plot()

In [None]:
# List the joined frame's columns; columns present in both inputs carry
# _left/_right suffixes (e.g. the address columns used in the next cell).
MnR_and_VO.columns

In [None]:
# Put the two address columns side by side so the matches can be checked by eye.
compare_addresses = pd.DataFrame(
    MnR_and_VO.loc[:, ["Address_left", "Address_right"]]
)
path_comparison = path.set("MnR_and_VO_join_comparison.xlsx", "interim")

# Write each distinct address pair once.
unique_address_pairs = compare_addresses.drop_duplicates()
unique_address_pairs.to_excel(path_comparison)

In [None]:
MnR_and_VO_path = path.set("MnR_and_VO_joined.csv", "interim")
# The target is a .csv, so write a real CSV. `to_file` is an OGR vector-format
# writer and does not produce a plain CSV table with the geometry column;
# `to_csv` writes every column, with geometry serialised as WKT text.
# NOTE(review): if a downstream step reads this path with geopandas.read_file,
# update that reader to match.
MnR_and_VO.to_csv(MnR_and_VO_path, index=False)