The goal of this notebook is to create three datasets from the 'geostops_csv' table, each containing the stops (id, name, and coordinates) located within a 15 km, 20 km, and 30 km radius of the Zurich main train station (Zurich HB), respectively.

In [None]:
%%local
import geopandas
import ipywidgets as widgets
import os
from geopy.distance import distance
from io import StringIO
from pyhive import hive
import pandas as pd
import plotly.express as px
from ipywidgets import interactive, widgets, interact

## Connect to Hive

In [None]:
%%local

# The Hive user name is the account of the current JupyterHub session.
username = os.environ['JUPYTERHUB_USER']

# Open the Hive connection; the server host is read from the environment.
conn = hive.connect(
    host=os.environ['HIVE_SERVER_2'],
    port=10000,
    username=username,
)

# Cursor used to execute statements on that connection.
cur = conn.cursor()

## Load stops dataframe

In [3]:
%%local
# Drop any previous definition first so that re-running this cell does not
# fail because the table already exists.
query = """
    drop table if exists {0}.geostops_csv
""".format(username)
cur.execute(query)

# External table over the ';'-delimited geostops files. Every column is
# declared as string; the coordinates are cast to float in pandas below.
query = """
    create external table {0}.geostops_csv(
        stop_id string,
        stop_name string,
        stop_lat string,
        stop_lon string,
        location_type string,
        parent_station string
    )
    row format delimited fields terminated by ';'
    stored as textfile
    location '/data/sbb/csv/geostops/'
    tblproperties ("skip.header.line.count"="1")
""".format(username)
cur.execute(query)

# The table is small, so the whole of it is loaded into pandas.
query = """
    select * from {0}.geostops_csv
""".format(username)

geostops_df = pd.read_sql(query, conn)

# Result columns may come back prefixed with the table name
# ("geostops_csv.stop_id"). Keep only the part after the last dot so that
# unprefixed names are accepted as well instead of raising IndexError.
geostops_df.columns = [col.rsplit('.', 1)[-1] for col in geostops_df.columns]

# Coordinates are stored as strings in Hive; convert them to floats.
geostops_df[['stop_lat', 'stop_lon']] = geostops_df[['stop_lat', 'stop_lon']].astype('float')

# Normalise stop ids: keep the part before the first ':' and remove every 'P'.
# The .str accessor leaves a missing id as NaN instead of raising.
geostops_df['stop_id'] = (
    geostops_df['stop_id']
    .str.split(':').str[0]
    .str.replace('P', '')
)

## Compute DataFrames of stops within a given distance of Zurich HB

In [39]:
%%local

# (latitude, longitude) of Zurich HB, the reference point for every radius
zurich_hb = (47.378177, 8.540192)


def distance_to_zurich(x):
    """Return the distance in kilometres between a stop and ``zurich_hb``.

    ``x`` is a row exposing ``stop_lat`` and ``stop_lon``; the result is the
    ``.km`` value of geopy's ``distance`` between that point and ``zurich_hb``.
    """
    return distance((x['stop_lat'], x['stop_lon']), zurich_hb).km


# Deduplicate on stop_id *before* computing distances: the first row per id
# is kept either way, so the result is unchanged, but no distance is computed
# for rows that are discarded. The copy gives an independent frame to which
# the new column can be added.
geostops_df = geostops_df.drop_duplicates(subset=['stop_id']).copy()
geostops_df['distance_to_main'] = geostops_df.apply(distance_to_zurich, axis=1)

# Stops strictly closer than each radius (in km); copies are taken so the
# subsets are independent of geostops_df.
df_15km = geostops_df[geostops_df['distance_to_main'] < 15].copy()
df_20km = geostops_df[geostops_df['distance_to_main'] < 20].copy()
df_30km = geostops_df[geostops_df['distance_to_main'] < 30].copy()

## Serialize as string and pass to cluster

In [42]:
%%local

def _csv_buffer(frame):
    """Write ``frame`` as CSV into a new in-memory text buffer and return it."""
    buf = StringIO()
    frame.to_csv(buf, encoding='utf-8')
    return buf


# One buffer per radius, then the CSV text held by each buffer.
s_15km, s_20km, s_30km = (
    _csv_buffer(frame) for frame in (df_15km, df_20km, df_30km)
)
df_15km_val, df_20km_val, df_30km_val = (
    buf.getvalue() for buf in (s_15km, s_20km, s_30km)
)

In [43]:
%%send_to_spark -i df_15km_val -t str

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'df_15km_val' as 'df_15km_val' to Spark kernel

In [44]:
%%send_to_spark -i df_20km_val -t str

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'df_20km_val' as 'df_20km_val' to Spark kernel

In [45]:
%%send_to_spark -i df_30km_val -t str

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'df_30km_val' as 'df_30km_val' to Spark kernel

## Save to HDFS

In [48]:
import pandas as pd
from pandas.compat import StringIO

# Columns kept in the exported datasets.
stop_columns = ['stop_id', 'stop_name', 'stop_lat', 'stop_lon']
# Directory that receives the three datasets.
output_dir = "/user/boesinge/finalproject"

# Rebuild the pandas frames from the CSV strings and keep only the exported
# columns (this also drops the index column written by to_csv).
# NOTE(review): read_csv infers dtypes, so purely numeric stop_id values come
# back as integers rather than strings - confirm this is what consumers of
# these files expect.
df_15km = pd.read_csv(StringIO(df_15km_val), encoding='utf-8')[stop_columns]
df_20km = pd.read_csv(StringIO(df_20km_val), encoding='utf-8')[stop_columns]
df_30km = pd.read_csv(StringIO(df_30km_val), encoding='utf-8')[stop_columns]

sparkdf_15km = sqlContext.createDataFrame(df_15km)
sparkdf_20km = sqlContext.createDataFrame(df_20km)
sparkdf_30km = sqlContext.createDataFrame(df_30km)

# Write explicitly as Parquet (the files are named *.parquet) instead of
# relying on the session's default data source, and overwrite any previous
# output so that the notebook can be re-run without failing on existing paths.
for radius_km, sparkdf in ((15, sparkdf_15km), (20, sparkdf_20km), (30, sparkdf_30km)):
    sparkdf.write.mode("overwrite").parquet(
        "{0}/stops_{1}km.parquet".format(output_dir, radius_km)
    )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…