In [2]:
import numpy as np
import pandas as pd
import geopandas as gp
import seaborn as sns

import shapely
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [3]:
df = gp.read_file("../data/raw/taxi_zones/taxi_zones.shp")
df.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.770 256767.698, 1026495.593 ..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.310 144283.336, 936046.565 14..."


In [4]:
df.count()

OBJECTID      263
Shape_Leng    263
Shape_Area    263
zone          263
LocationID    263
borough       263
geometry      263
dtype: int64

In [5]:
# Converting it to a more readable output
df['geometry'] = df['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

In [6]:
# Sample lat, long - 40.68848905639242,-73.99116039276123
point = Point(-73.99116039276123, 40.68848905639242) # long, lat

for poly in df["geometry"]:
    if poly.contains(point):
        print(True)
        break

True


## Update

Use only data within manhattan

In [7]:
df = df[df["borough"] == "Manhattan"]

### Split the CitiBike dataset

The idea is to remove some weight and unnnecessary information by striping the citibike datatset off features such as start station name, end station name, start and end lattitude and longtitudes by storing all of this repeating information in a new table. All that would be left is the start and end station ID with the start and end areas from the shapefile.

This start and end would also be used to filter data from the taxi dataset by selecting only those rides within the areas of citibikes.

Note: We will only use station data from the first month of analysis i.e. July 2020 since the program is rapidly expanding and some new stations may have limited data. It also helps the taxi data to stay consisent my choosing these prespecfied areas

In [8]:
initial_citi = pd.read_csv("../data/raw/citi/202007-citibike-tripdata.csv")
initial_citi.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,341,2020-07-01 00:00:01.3020,2020-07-01 00:05:42.5650,3463,E 16 St & Irving Pl,40.735367,-73.987974,2003,1 Ave & E 18 St,40.733812,-73.980544,33861,Subscriber,1986,1
1,321,2020-07-01 00:00:01.3620,2020-07-01 00:05:22.9490,3463,E 16 St & Irving Pl,40.735367,-73.987974,2003,1 Ave & E 18 St,40.733812,-73.980544,31233,Subscriber,1990,2
2,2710,2020-07-01 00:00:06.6290,2020-07-01 00:45:17.3410,426,West St & Chambers St,40.717548,-74.013221,212,W 16 St & The High Line,40.743349,-74.006818,40329,Customer,1969,0
3,2685,2020-07-01 00:00:13.0220,2020-07-01 00:44:58.8640,426,West St & Chambers St,40.717548,-74.013221,212,W 16 St & The High Line,40.743349,-74.006818,17567,Subscriber,1998,1
4,191,2020-07-01 00:00:13.4660,2020-07-01 00:03:24.6230,3615,44 Dr & 21 St,40.748,-73.946093,3127,9 St & 44 Rd,40.74966,-73.9521,43421,Subscriber,1992,1


In [9]:
station_data = initial_citi[initial_citi.columns[3:11]]
station_data.head()

Unnamed: 0,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude
0,3463,E 16 St & Irving Pl,40.735367,-73.987974,2003,1 Ave & E 18 St,40.733812,-73.980544
1,3463,E 16 St & Irving Pl,40.735367,-73.987974,2003,1 Ave & E 18 St,40.733812,-73.980544
2,426,West St & Chambers St,40.717548,-74.013221,212,W 16 St & The High Line,40.743349,-74.006818
3,426,West St & Chambers St,40.717548,-74.013221,212,W 16 St & The High Line,40.743349,-74.006818
4,3615,44 Dr & 21 St,40.748,-73.946093,3127,9 St & 44 Rd,40.74966,-73.9521


In [10]:
stations = {
    'id' : [],
    'name' : [],
    'latitude' : [],
    'longtitude' : []
}

In [11]:
def station_check(row, first = True):
    col_num = 0
    if not first:
        col_num = 4

    if not (row[col_num] in stations['id']):
        stations['id'].append(row[col_num])
        stations['name'].append(row[col_num + 1])
        stations['latitude'].append(row[col_num + 2])
        stations['longtitude'].append(row[col_num + 3])

for index, row in station_data.iterrows():
    station_check(row)
    station_check(row, False)


In [12]:
stations_df = pd.DataFrame(stations).sort_values(by=['id'])
stations_df.head()

Unnamed: 0,id,name,latitude,longtitude
293,72,W 52 St & 11 Ave,40.767272,-73.993929
701,79,Franklin St & W Broadway,40.719116,-74.006667
559,82,St James Pl & Pearl St,40.711174,-74.000165
113,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323
145,116,W 17 St & 8 Ave,40.741776,-74.001497


In [13]:
stations_df.reset_index(inplace=True)
stations_df.drop(["index"], axis=1, inplace=True)
stations_df.head()

Unnamed: 0,id,name,latitude,longtitude
0,72,W 52 St & 11 Ave,40.767272,-73.993929
1,79,Franklin St & W Broadway,40.719116,-74.006667
2,82,St James Pl & Pearl St,40.711174,-74.000165
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323
4,116,W 17 St & 8 Ave,40.741776,-74.001497


Now to associate each latitude and longtitude with a zone in the shapefile

In [14]:
areas = []
for index, row in stations_df.iterrows():
    point = Point(row["longtitude"], row["latitude"])
    
    flag = False
    # Check in shapefile
    for index2, row2 in df.iterrows():
        if row2["geometry"].contains(point):
            areas.append(row2["LocationID"])
            flag = True
            break

    if not flag:
        print("Reached np nan")
        areas.append(np.nan)

Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np nan
Reached np

In [15]:
len(areas)

1007

In [16]:
stations_df["Taxi area code"] = areas
stations_df.head()

Unnamed: 0,id,name,latitude,longtitude,Taxi area code
0,72,W 52 St & 11 Ave,40.767272,-73.993929,50.0
1,79,Franklin St & W Broadway,40.719116,-74.006667,231.0
2,82,St James Pl & Pearl St,40.711174,-74.000165,45.0
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,
4,116,W 17 St & 8 Ave,40.741776,-74.001497,68.0


In [17]:
stations_df[stations_df.isnull().any(axis=1)]

Unnamed: 0,id,name,latitude,longtitude,Taxi area code
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,
5,119,Park Ave & St Edwards St,40.696089,-73.978034,
6,120,Lexington Ave & Classon Ave,40.686768,-73.959282,
9,143,Clinton St & Joralemon St,40.692395,-73.993379,
10,144,Nassau St & Navy St,40.698399,-73.980689,
...,...,...,...,...,...
989,4019,St. Ann's Av & Bruckner Blvd,40.803399,-73.919763,
991,4021,E 138 St & Grand Concourse,40.813224,-73.930605,
995,4029,Courtlandt Ave & E 149 St,40.816402,-73.919549,
996,4032,E 155 St & Courtlandt Ave,40.820570,-73.917579,


As seen above, these are drop stations found in New Jersey which we wil not include in our analysis

In [18]:
stations_df.dropna(inplace=True)

In [19]:
stations_df["Taxi area code"] = stations_df["Taxi area code"].astype(int)
stations_df.head()

Unnamed: 0,id,name,latitude,longtitude,Taxi area code
0,72,W 52 St & 11 Ave,40.767272,-73.993929,50
1,79,Franklin St & W Broadway,40.719116,-74.006667,231
2,82,St James Pl & Pearl St,40.711174,-74.000165,45
4,116,W 17 St & 8 Ave,40.741776,-74.001497,68
7,127,Barrow St & Hudson St,40.731724,-74.006744,158


In [19]:
stations_df.to_csv("../data/raw/citi/bike stations.csv", index=False)

## UPDATE

---

From Feb 2021, all stations in the CitiBike network have been renamed. And so we will need to match old IDs to new IDs using the address

In [20]:
# old file - stations_df
citi_21 = pd.read_csv("../data/raw/citi/202102-citibike-tripdata.csv")
citi_21.head()

  citi_21 = pd.read_csv("../data/raw/citi/202102-citibike-tripdata.csv")


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BBA33D73DECE976F,docked_bike,2021-02-26 16:38:54,2021-02-26 16:44:37,E 84 St & Park Ave,7243.04,E 78 St & 2 Ave,7057.07,40.778626,-73.95772,40.772797,-73.955778,casual
1,B63D7AFF9AC5B6D4,docked_bike,2021-02-17 11:09:11,2021-02-17 11:26:47,Macon St & Nostrand Ave,4214.03,Bond St & Fulton St,4479.06,40.680983,-73.950047,40.689622,-73.983043,member
2,52B829195C469D99,docked_bike,2021-02-26 18:33:29,2021-02-26 19:05:41,Macon St & Nostrand Ave,4214.03,Lefferts Pl & Franklin Ave,4222.02,40.680983,-73.950047,40.680342,-73.955769,casual
3,19C84ECA2B468476,docked_bike,2021-02-26 12:48:35,2021-02-26 13:07:24,Macon St & Nostrand Ave,4214.03,Bond St & Fulton St,4479.06,40.680983,-73.950047,40.689622,-73.983043,member
4,C0DDB771E70D9DF5,docked_bike,2021-02-25 17:23:22,2021-02-25 17:28:20,Madison Ave & E 26 St,6131.12,W 37 St & 5 Ave,6398.06,40.742685,-73.986713,40.75038,-73.98339,member


In [21]:
citi_21["rideable_type"].unique()
# All bikes seem to start from being docked

array(['docked_bike'], dtype=object)

In [22]:
# Check number of stations
len(citi_21["start_station_id"].unique())

2440

In [23]:
len(stations_df)

482

As seen above, many stations have been added in citibike's phase 3 expansion plan. However, to keep all results consistent we will only consider stations that were present from the start of our analysis

In [148]:
citi_21.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BBA33D73DECE976F,docked_bike,2021-02-26 16:38:54,2021-02-26 16:44:37,E 84 St & Park Ave,7243.04,E 78 St & 2 Ave,7057.07,40.778626,-73.95772,40.772797,-73.955778,casual
1,B63D7AFF9AC5B6D4,docked_bike,2021-02-17 11:09:11,2021-02-17 11:26:47,Macon St & Nostrand Ave,4214.03,Bond St & Fulton St,4479.06,40.680983,-73.950047,40.689622,-73.983043,member
2,52B829195C469D99,docked_bike,2021-02-26 18:33:29,2021-02-26 19:05:41,Macon St & Nostrand Ave,4214.03,Lefferts Pl & Franklin Ave,4222.02,40.680983,-73.950047,40.680342,-73.955769,casual
3,19C84ECA2B468476,docked_bike,2021-02-26 12:48:35,2021-02-26 13:07:24,Macon St & Nostrand Ave,4214.03,Bond St & Fulton St,4479.06,40.680983,-73.950047,40.689622,-73.983043,member
4,C0DDB771E70D9DF5,docked_bike,2021-02-25 17:23:22,2021-02-25 17:28:20,Madison Ave & E 26 St,6131.12,W 37 St & 5 Ave,6398.06,40.742685,-73.986713,40.75038,-73.98339,member


In [24]:
start = citi_21[['start_station_name', 'start_station_id', 'start_lat', 'start_lng']]
start.columns = ["name", "new_id", "latitude", "longtitude"]
end = citi_21[['end_station_name', 'end_station_id', 'end_lat', 'end_lng']]
end.columns = ["name", "new_id", "latitude", "longtitude"]

citi_21_stations = pd.concat([start, end])
citi_21_stations = citi_21_stations.drop_duplicates(subset=["new_id"])
citi_21_stations.columns = ["name", "new_id", "new_latitude", "new_longtitude"]
citi_21_stations.head()

Unnamed: 0,name,new_id,new_latitude,new_longtitude
0,E 84 St & Park Ave,7243.04,40.778626,-73.95772
1,Macon St & Nostrand Ave,4214.03,40.680983,-73.950047
4,Madison Ave & E 26 St,6131.12,40.742685,-73.986713
7,W 29 St & 9 Ave,6416.06,40.750072,-73.998392
8,Frederick Douglass Blvd & W 112 St,7631.23,40.801694,-73.957145


In [25]:
# Old stations
stations_df.head()

Unnamed: 0,id,name,latitude,longtitude,Taxi area code
0,72,W 52 St & 11 Ave,40.767272,-73.993929,50
1,79,Franklin St & W Broadway,40.719116,-74.006667,231
2,82,St James Pl & Pearl St,40.711174,-74.000165,45
4,116,W 17 St & 8 Ave,40.741776,-74.001497,68
7,127,Barrow St & Hudson St,40.731724,-74.006744,158


In [26]:
new_stations = stations_df.merge(citi_21_stations, how="left", on="name")
new_stations = new_stations.drop_duplicates(subset=["id"])
new_stations.head()

Unnamed: 0,id,name,latitude,longtitude,Taxi area code,new_id,new_latitude,new_longtitude
0,72,W 52 St & 11 Ave,40.767272,-73.993929,50,6926.01,40.767272,-73.993928
2,79,Franklin St & W Broadway,40.719116,-74.006667,231,5430.08,40.719116,-74.006667
4,82,St James Pl & Pearl St,40.711174,-74.000165,45,5167.06,40.711174,-74.000165
6,116,W 17 St & 8 Ave,40.741776,-74.001497,68,6148.02,40.741776,-74.001497
8,127,Barrow St & Hudson St,40.731724,-74.006744,158,5805.05,40.731724,-74.006744


As seen above, the latitude and longtitude are slightly off but not significantly different. And so merging on latitude and longtitude would be hard to match.

For the remaining data that have null values, we will try to match the latitude and longtitude with an deviation of 0.000001 to see if station names have changed

In [27]:
new_stations[new_stations.isna().any(axis=1)]

Unnamed: 0,id,name,latitude,longtitude,Taxi area code,new_id,new_latitude,new_longtitude
22,167,E 39 St & 3 Ave,40.748901,-73.976049,170,,,
29,195,Liberty St & Broadway,40.709056,-74.010434,87,,,
76,280,E 10 St & 5 Ave,40.73332,-73.995101,113,,,
141,339,Avenue D & E 12 St,40.725806,-73.974225,4,,,
266,459,W 20 St & 11 Ave,40.746745,-74.007756,246,,,
317,493,W 45 St & 6 Ave,40.7568,-73.982912,230,,,
356,518,E 39 St & 2 Ave,40.747804,-73.973442,170,,,
361,523,W 38 St & 8 Ave,40.754666,-73.991382,100,,,
414,3137,5 Ave & E 73 St,40.772828,-73.966853,43,,,
465,3168,Central Park West & W 85 St (old),40.784727,-73.969617,43,,,


In [28]:
len(new_stations[new_stations.isna().any(axis=1)])

20

In [29]:
citi_21_stations.head()

Unnamed: 0,name,new_id,new_latitude,new_longtitude
0,E 84 St & Park Ave,7243.04,40.778626,-73.95772
1,Macon St & Nostrand Ave,4214.03,40.680983,-73.950047
4,Madison Ave & E 26 St,6131.12,40.742685,-73.986713
7,W 29 St & 9 Ave,6416.06,40.750072,-73.998392
8,Frederick Douglass Blvd & W 112 St,7631.23,40.801694,-73.957145


In [30]:
for index, row in new_stations[new_stations.isna().any(axis=1)].iterrows():
    # Check in citi_22 stations
    if not citi_21_stations[
        (abs(row["latitude"] - citi_21_stations["new_latitude"]) < 0.000005)
        &
        (abs(row["longtitude"] - citi_21_stations["new_longtitude"]) < 0.000005)
    ].empty:
        print(row)
        temp = citi_21_stations[
            (abs(row["latitude"] - citi_21_stations["new_latitude"]) < 0.000005)
            &
            (abs(row["longtitude"] - citi_21_stations["new_longtitude"]) < 0.000005)
        ]
        print(temp)
        #print(temp.iloc[0]["new_id"])
        new_stations.at[index, "new_id"] = temp.iloc[0]["new_id"]
        new_stations.at[index, "new_latitude"] = temp.iloc[0]["new_latitude"]
        new_stations.at[index, "new_longtitude"] = temp.iloc[0]["new_longtitude"]
        print("\n\nDone at")
        print(new_stations.iloc[index])

id                           3137
name              5 Ave & E 73 St
latitude                40.772828
longtitude             -73.966853
Taxi area code                 43
new_id                        NaN
new_latitude                  NaN
new_longtitude                NaN
Name: 414, dtype: object
                  name   new_id  new_latitude  new_longtitude
451    5 Ave & E 72 St  7100.07     40.772828      -73.966852
65785  5 Ave & E 72 St  7100.07     40.772828      -73.966853


Done at
id                           3798
name              W 40 St & 5 Ave
latitude                40.752269
longtitude             -73.982079
Taxi area code                164
new_id                    6474.02
new_latitude            40.752269
new_longtitude         -73.982079
Name: 809, dtype: object


In [31]:
len(new_stations[new_stations.isna().any(axis=1)])

19

In [32]:
new_stations[new_stations.isna().any(axis=1)]

Unnamed: 0,id,name,latitude,longtitude,Taxi area code,new_id,new_latitude,new_longtitude
22,167,E 39 St & 3 Ave,40.748901,-73.976049,170,,,
29,195,Liberty St & Broadway,40.709056,-74.010434,87,,,
76,280,E 10 St & 5 Ave,40.73332,-73.995101,113,,,
141,339,Avenue D & E 12 St,40.725806,-73.974225,4,,,
266,459,W 20 St & 11 Ave,40.746745,-74.007756,246,,,
317,493,W 45 St & 6 Ave,40.7568,-73.982912,230,,,
356,518,E 39 St & 2 Ave,40.747804,-73.973442,170,,,
361,523,W 38 St & 8 Ave,40.754666,-73.991382,100,,,
465,3168,Central Park West & W 85 St (old),40.784727,-73.969617,43,,,
482,3182,Yankee Ferry Terminal,40.686931,-74.016966,103,,,


In [164]:
new_stations.tail()

Unnamed: 0,id,name,latitude,longtitude,Taxi area code,new_id,new_latitude,new_longtitude
985,4041,Adam Clayton Powell Blvd & W 151 St,40.825289,-73.936232,42,8009.05,40.825289,-73.936232
986,4044,8 Ave & W 38 St,40.75461,-73.99177,68,6526.05,40.75461,-73.99177
987,4045,West End Ave & W 60 St,40.77237,-73.99005,143,7059.08,40.77237,-73.99005
988,4058,Madison Av & E 51 St,40.75863,-73.97513,161,6659.09,40.75863,-73.97513
989,4064,Southern Blvd & E 142 St,40.807347,-73.908098,168,7719.13,40.807347,-73.908097


In [33]:
# Save new_stations
new_stations.reset_index(drop=True, inplace=True)

In [34]:
# Check for null values
new_stations.isnull().any()

id                False
name              False
latitude          False
longtitude        False
Taxi area code    False
new_id             True
new_latitude       True
new_longtitude     True
dtype: bool

Upon further inspection, some stations have shutdown and others have moved location and so it is better removing such stations

In [36]:
final_stations = new_stations.dropna()
print(final_stations.count())
final_stations.head()

id                463
name              463
latitude          463
longtitude        463
Taxi area code    463
new_id            463
new_latitude      463
new_longtitude    463
dtype: int64


Unnamed: 0,id,name,latitude,longtitude,Taxi area code,new_id,new_latitude,new_longtitude
0,72,W 52 St & 11 Ave,40.767272,-73.993929,50,6926.01,40.767272,-73.993928
1,79,Franklin St & W Broadway,40.719116,-74.006667,231,5430.08,40.719116,-74.006667
2,82,St James Pl & Pearl St,40.711174,-74.000165,45,5167.06,40.711174,-74.000165
3,116,W 17 St & 8 Ave,40.741776,-74.001497,68,6148.02,40.741776,-74.001497
4,127,Barrow St & Hudson St,40.731724,-74.006744,158,5805.05,40.731724,-74.006744


In [37]:
final_stations.to_csv("../data/raw/citi/bike stations.csv", index=False)

In [38]:
import pandas as pd
check = pd.read_csv("../data/raw/citi/bike stations.csv")
check.head()

Unnamed: 0,id,name,latitude,longtitude,Taxi area code,new_id,new_latitude,new_longtitude
0,72,W 52 St & 11 Ave,40.767272,-73.993929,50,6926.01,40.767272,-73.993928
1,79,Franklin St & W Broadway,40.719116,-74.006667,231,5430.08,40.719116,-74.006667
2,82,St James Pl & Pearl St,40.711174,-74.000165,45,5167.06,40.711174,-74.000165
3,116,W 17 St & 8 Ave,40.741776,-74.001497,68,6148.02,40.741776,-74.001497
4,127,Barrow St & Hudson St,40.731724,-74.006744,158,5805.05,40.731724,-74.006744


In [39]:
check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              463 non-null    int64  
 1   name            463 non-null    object 
 2   latitude        463 non-null    float64
 3   longtitude      463 non-null    float64
 4   Taxi area code  463 non-null    int64  
 5   new_id          463 non-null    object 
 6   new_latitude    463 non-null    float64
 7   new_longtitude  463 non-null    float64
dtypes: float64(4), int64(2), object(2)
memory usage: 29.1+ KB


In [40]:
# check[check["new_id"][:3] == "SYS"]

check['SYS start'] = list(
    map(lambda x: x.startswith('SYS'), check['new_id']))

check[check["SYS start"] == True]

Unnamed: 0,id,name,latitude,longtitude,Taxi area code,new_id,new_latitude,new_longtitude,SYS start
244,3245,NYCBS DEPOT - DELANCEY,40.716444,-73.982331,232,SYS014,40.716444,-73.982331,True


In [13]:
# Check spark left join
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col

sp = SparkSession.builder.appName("Bike test").getOrCreate()
sp

22/08/19 18:28:41 WARN Utils: Your hostname, J-L resolves to a loopback address: 127.0.1.1; using 172.18.124.253 instead (on interface eth0)
22/08/19 18:28:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/19 18:28:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/19 18:28:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [14]:
bikes_sp = sp.createDataFrame(check)
bikes_sp.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+---+--------------------+-----------+------------+--------------+-------+------------+--------------+---------+
| id|                name|   latitude|  longtitude|Taxi area code| new_id|new_latitude|new_longtitude|SYS start|
+---+--------------------+-----------+------------+--------------+-------+------------+--------------+---------+
| 72|    W 52 St & 11 Ave|40.76727216|-73.99392888|            50|6926.01|   40.767272|    -73.993928|    false|
| 79|Franklin St & W B...|40.71911552|-74.00666661|           231|5430.08|   40.719116|    -74.006667|    false|
| 82|St James Pl & Pea...|40.71117416|-74.00016545|            45|5167.06|   40.711174|    -74.000165|    false|
| 83|Atlantic Ave & Fo...|40.68382604|-73.97632328|            97|4354.07|   40.683826|    -73.976323|    false|
|116|     W 17 St & 8 Ave|40.74177603|-74.00149746|            68|6148.02|   40.741776|    -74.001497|    false|
+---+--------------------+-----------+------------+--------------+-------+------------+---------

                                                                                

In [15]:
citijan22 = sp.read.parquet("../data/curated/citi/2022-1/")
citijan22.show(5)

+-------------------+-------------------+----------------+--------------+
|         started_at|           ended_at|start_station_id|end_station_id|
+-------------------+-------------------+----------------+--------------+
|2022-01-28 17:12:46|2022-01-28 17:27:03|         4519.04|       3928.08|
|2022-01-04 22:12:48|2022-01-04 22:18:27|         6140.05|       5971.08|
|2022-01-08 11:34:22|2022-01-08 11:43:14|         5763.03|       5422.04|
|2022-01-26 19:04:18|2022-01-26 19:09:05|         6140.05|       6331.01|
|2022-01-10 15:57:12|2022-01-10 16:10:18|         6140.05|       6115.09|
+-------------------+-------------------+----------------+--------------+
only showing top 5 rows



In [17]:
small = bikes_sp[["new_id", "Taxi area code"]]
small.show()

+-------+--------------+
| new_id|Taxi area code|
+-------+--------------+
|6926.01|            50|
|5430.08|           231|
|5167.06|            45|
|4354.07|            97|
|6148.02|            68|
|4700.06|            97|
|4452.03|            17|
|5805.05|           158|
|5687.04|           125|
|4605.04|            33|
|4812.02|            66|
| 5359.1|           231|
|5476.03|             4|
|5492.05|           144|
|4531.05|            33|
|5721.14|           114|
| 6498.1|           233|
|6064.08|           234|
|6708.02|           230|
|6004.07|           137|
+-------+--------------+
only showing top 20 rows



In [19]:
left = citijan22.join(small, small.new_id == citijan22.start_station_id, how="left")
left.show(5)

+-------------------+-------------------+----------------+--------------+-------+--------------+
|         started_at|           ended_at|start_station_id|end_station_id| new_id|Taxi area code|
+-------------------+-------------------+----------------+--------------+-------+--------------+
|2022-01-14 12:01:15|2022-01-14 12:10:02|         4066.15|       4634.02|4066.15|            61|
|2022-01-28 08:06:20|2022-01-28 08:26:38|         4066.15|       5348.02|4066.15|            61|
|2022-01-15 20:32:39|2022-01-15 20:49:01|         4190.06|       4634.02|4190.06|           189|
|2022-01-06 18:39:06|2022-01-06 18:44:55|         4395.07|       4700.06|4395.07|            97|
|2022-01-28 17:12:46|2022-01-28 17:27:03|         4519.04|       3928.08|4519.04|            17|
+-------------------+-------------------+----------------+--------------+-------+--------------+
only showing top 5 rows



22/08/20 00:04:24 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 5628037 ms exceeds timeout 120000 ms
22/08/20 00:04:24 WARN SparkContext: Killing executors is not supported by current scheduler.
