## Import data and preprocess

In [1]:
# !pip install s2sphere
# !pip install mapsplotlib

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
import s2sphere

In [3]:
conf = SparkConf().setAppName("msds697_project")
sc = SparkContext(conf=conf)
sc.setLogLevel("OFF")

In [4]:
ss = SparkSession.builder.getOrCreate()

In [5]:
# schema = StructType([StructField("End_Lat",DoubleType(),True),
#  StructField("End_Lon",DoubleType(),True),
#  StructField("Fare_Amt",DoubleType(),True),
#  StructField("Passenger_Count",IntegerType(),True),
#  StructField("Payment_Type",StringType(),True),
#  StructField("Rate_Code",StringType(),True),
#  StructField("Start_Lat",DoubleType(),True),
#  StructField("Start_Lon",DoubleType(),True),
#  StructField("Tip_Amt",DoubleType(),True),
#  StructField("Tolls_Amt",DoubleType(),True),
#  StructField("Total_Amt",DoubleType(),True),
#  StructField("Trip_Distance",DoubleType(),True),
#  StructField("Trip_Dropoff_DateTime",StringType(),True),
#  StructField("Trip_Pickup_DateTime",StringType(),True),
#  StructField("_id",StringType(),True),
#  StructField("mta_tax",StringType(),True),
#  StructField("store_and_forward",StringType(),True),
#  StructField("surcharge",DoubleType(),True),
#  StructField("vendor_name",StringType(),True)])

In [5]:
def toFloatSafe(v):
    try:
        return float(v)
    except ValueError:
        return v

In [6]:
def lowercase_cols(df):
    for col in df.columns:
        new_col = col.lower()
        if new_col != col:
            df = df.withColumnRenamed(col, new_col)
    return df

def toDatetime(df, col_name):
    df = df.withColumn(col_name + "_2", to_timestamp(df[col_name], 'yyyy-MM-dd HH:mm:ss'))
    df = df.drop(col_name).withColumnRenamed(col_name + "_2", col_name)
    return df


In [7]:
nyc_header = "vendor_name,Trip_Pickup_DateTime,Trip_Dropoff_DateTime,Passenger_Count,Trip_Distance,Start_Lon,Start_Lat,Rate_Code,store_and_forward,End_Lon,End_Lat,Payment_Type,Fare_Amt,surcharge,mta_tax,Tip_Amt,Tolls_Amt,Total_Amt".split(",")

nyc_df = sc.textFile('../data/ny/yellow_tripdata_2009-05.csv')\
            .map(lambda x:  x.split(","))\
            .filter(lambda x: (len(x) == len(nyc_header)) & (x != nyc_header))\
            .map(lambda x: [toFloatSafe(v) for v in x])\
            .toDF([x.lower() for x in nyc_header])

keep_cols = ['trip_pickup_datetime',
 'trip_dropoff_datetime',
 'trip_distance',
 'start_lon',
 'start_lat',
 'end_lon',
 'end_lat']

nyc_df = nyc_df.select(keep_cols)
nyc_df = lowercase_cols(nyc_df)

In [8]:
nyc_df.printSchema()

root
 |-- trip_pickup_datetime: string (nullable = true)
 |-- trip_dropoff_datetime: string (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- start_lon: double (nullable = true)
 |-- start_lat: double (nullable = true)
 |-- end_lon: double (nullable = true)
 |-- end_lat: double (nullable = true)



In [9]:
nyc_df.show(5)

+--------------------+---------------------+-------------+----------+---------+----------+---------+
|trip_pickup_datetime|trip_dropoff_datetime|trip_distance| start_lon|start_lat|   end_lon|  end_lat|
+--------------------+---------------------+-------------+----------+---------+----------+---------+
| 2009-05-27 07:41:05|  2009-05-27 07:42:28|          0.3|-73.974105|40.742892|-73.973769|40.746405|
| 2009-05-27 07:51:06|  2009-05-27 07:58:43|          1.9|-74.008148|40.738854|-74.015638|40.714536|
| 2009-05-15 15:22:02|  2009-05-15 15:30:15|          1.4|-73.973343|40.764047|-73.952663|40.769895|
| 2009-05-26 22:06:37|  2009-05-26 22:15:33|          2.2|-74.005256| 40.71973|-74.005523|40.745585|
| 2009-05-27 12:51:39|  2009-05-27 13:00:46|          0.9|-73.997714|40.741363|-73.994257| 40.75105|
+--------------------+---------------------+-------------+----------+---------+----------+---------+
only showing top 5 rows



### Import SF DATA

In [10]:
sf_header = "lat_y,long_y,lat_x,long_x,dt_ts_y,dt_ts_x".split(",")

sf_new_header = ['end_lat',
             'end_lon',
             'start_lat',
             'start_lon',
             'trip_dropoff_datetime',
             'trip_pickup_datetime']

sf_df = sc.textFile("../data/sf_output/sf_data.csv")\
            .map(lambda x:  x.split(","))\
            .filter(lambda x: (len(x) == len(sf_header)) & (x != sf_header))\
            .map(lambda x: [toFloatSafe(v) for v in x])\
            .toDF(sf_new_header)
sf_df.printSchema()

root
 |-- end_lat: double (nullable = true)
 |-- end_lon: double (nullable = true)
 |-- start_lat: double (nullable = true)
 |-- start_lon: double (nullable = true)
 |-- trip_dropoff_datetime: string (nullable = true)
 |-- trip_pickup_datetime: string (nullable = true)



## proprcess sf and nyc

In [11]:
cell_level = 13

def coord_to_id_fun(x, cell_level=cell_level):
    lat, lon = x
    from s2sphere import CellId, LatLng
    cell_id = CellId.from_lat_lng(LatLng.from_degrees(lat, lon))\
    .parent(cell_level).to_token()
    return cell_id

def get_corners(s2CellId_str, cell_level=cell_level+1, cluster=1):
    from s2sphere import CellId, LatLng, Cell
    c1 = Cell(CellId(int(s2CellId_str,16)<<(60 - 2*cell_level)))
    v0 = LatLng.from_point(c1.get_vertex(0)) # lat/lon of upper/left corner
    v1 = LatLng.from_point(c1.get_vertex(1)) # lat/lon of lower/left corner
    v2 = LatLng.from_point(c1.get_vertex(2)) # lat/lon of lower/right corner
    v3 = LatLng.from_point(c1.get_vertex(3)) # lat/lon of upper/right corner
    return ((v0.lat().degrees, v0.lng().degrees, cluster),
            (v1.lat().degrees, v1.lng().degrees, cluster),
            (v2.lat().degrees, v2.lng().degrees, cluster),
            (v3.lat().degrees, v3.lng().degrees, cluster))

def distance_fun(x):
    import math
    lat1, lon1, lat2, lon2 = x
    radius = 6371 # km

    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c
    return d


coord_to_id = udf(lambda x: coord_to_id_fun(x), StringType())
distance = udf(lambda x: distance_fun(x), FloatType())

def process_features(df):
    df = toDatetime(df, "trip_pickup_datetime")
    df = toDatetime(df, "trip_dropoff_datetime")

    df = df.withColumn('dow_dropoff', date_format('trip_dropoff_datetime', 'u').cast(IntegerType()))\
    .withColumn('strdow_dropoff', date_format('trip_dropoff_datetime', 'E'))\
    .withColumn('dow_pickup', date_format('trip_pickup_datetime', 'u').cast(IntegerType()))\
    .withColumn('strdow_pickup', date_format('trip_pickup_datetime', 'E'))\
    .withColumn('hour_pickup', date_format('trip_pickup_datetime', 'H').cast(IntegerType()))\
    .withColumn('hour_dropoff', date_format('trip_dropoff_datetime', 'H').cast(IntegerType()))
    
    df = df.withColumn("is_weekend", when((df.dow_pickup >= 6) | (df.dow_dropoff >= 6), 1).otherwise(0))

    df = df.withColumn("start_cell_id", coord_to_id(array("start_lat", "start_lon")))
    df = df.withColumn("end_cell_id", coord_to_id(array("end_lat", "end_lon")))
    df = df.withColumn("distance_km", distance(array("start_lat", "start_lon","end_lat", "end_lon")))
    return df

In [12]:
sf_df = process_features(sf_df)
sf_df.cache()
sf_df.printSchema()

root
 |-- end_lat: double (nullable = true)
 |-- end_lon: double (nullable = true)
 |-- start_lat: double (nullable = true)
 |-- start_lon: double (nullable = true)
 |-- trip_pickup_datetime: timestamp (nullable = true)
 |-- trip_dropoff_datetime: timestamp (nullable = true)
 |-- dow_dropoff: integer (nullable = true)
 |-- strdow_dropoff: string (nullable = true)
 |-- dow_pickup: integer (nullable = true)
 |-- strdow_pickup: string (nullable = true)
 |-- hour_pickup: integer (nullable = true)
 |-- hour_dropoff: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- start_cell_id: string (nullable = true)
 |-- end_cell_id: string (nullable = true)
 |-- distance_km: float (nullable = true)



In [13]:
nyc_df = process_features(nyc_df)
nyc_df.cache()
nyc_df.printSchema()

root
 |-- trip_distance: double (nullable = true)
 |-- start_lon: double (nullable = true)
 |-- start_lat: double (nullable = true)
 |-- end_lon: double (nullable = true)
 |-- end_lat: double (nullable = true)
 |-- trip_pickup_datetime: timestamp (nullable = true)
 |-- trip_dropoff_datetime: timestamp (nullable = true)
 |-- dow_dropoff: integer (nullable = true)
 |-- strdow_dropoff: string (nullable = true)
 |-- dow_pickup: integer (nullable = true)
 |-- strdow_pickup: string (nullable = true)
 |-- hour_pickup: integer (nullable = true)
 |-- hour_dropoff: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- start_cell_id: string (nullable = true)
 |-- end_cell_id: string (nullable = true)
 |-- distance_km: float (nullable = true)



### select same columns with the same order

In [14]:
cols_in_order = 'start_lon|start_lat|end_lon|end_lat|trip_pickup_datetime|trip_dropoff_datetime|dow_dropoff|strdow_dropoff|dow_pickup|strdow_pickup|hour_pickup|hour_dropoff|is_weekend|start_cell_id|end_cell_id|distance_km'.split('|')
sf_df = sf_df.select(cols_in_order)
nyc_df = nyc_df.select(cols_in_order)

In [15]:
sf_df.show(5)

+-------------------+------------------+-------------------+------------------+--------------------+---------------------+-----------+--------------+----------+-------------+-----------+------------+----------+-------------+-----------+-----------+
|          start_lon|         start_lat|            end_lon|           end_lat|trip_pickup_datetime|trip_dropoff_datetime|dow_dropoff|strdow_dropoff|dow_pickup|strdow_pickup|hour_pickup|hour_dropoff|is_weekend|start_cell_id|end_cell_id|distance_km|
+-------------------+------------------+-------------------+------------------+--------------------+---------------------+-----------+--------------+----------+-------------+-----------+------------+----------+-------------+-----------+-----------+
|         -122.39724|          37.74977|-122.41438000000001|37.755520000000004| 2008-05-17 07:51:10|  2008-05-17 07:55:58|          6|           Sat|         6|          Sat|          7|           7|         1|     808f7fac|   808f7e3c|  1.6369368|
|   

In [16]:
sf_df.printSchema()

root
 |-- start_lon: double (nullable = true)
 |-- start_lat: double (nullable = true)
 |-- end_lon: double (nullable = true)
 |-- end_lat: double (nullable = true)
 |-- trip_pickup_datetime: timestamp (nullable = true)
 |-- trip_dropoff_datetime: timestamp (nullable = true)
 |-- dow_dropoff: integer (nullable = true)
 |-- strdow_dropoff: string (nullable = true)
 |-- dow_pickup: integer (nullable = true)
 |-- strdow_pickup: string (nullable = true)
 |-- hour_pickup: integer (nullable = true)
 |-- hour_dropoff: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- start_cell_id: string (nullable = true)
 |-- end_cell_id: string (nullable = true)
 |-- distance_km: float (nullable = true)



In [17]:
nyc_df.printSchema()

root
 |-- start_lon: double (nullable = true)
 |-- start_lat: double (nullable = true)
 |-- end_lon: double (nullable = true)
 |-- end_lat: double (nullable = true)
 |-- trip_pickup_datetime: timestamp (nullable = true)
 |-- trip_dropoff_datetime: timestamp (nullable = true)
 |-- dow_dropoff: integer (nullable = true)
 |-- strdow_dropoff: string (nullable = true)
 |-- dow_pickup: integer (nullable = true)
 |-- strdow_pickup: string (nullable = true)
 |-- hour_pickup: integer (nullable = true)
 |-- hour_dropoff: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- start_cell_id: string (nullable = true)
 |-- end_cell_id: string (nullable = true)
 |-- distance_km: float (nullable = true)



In [18]:
nyc_df.show(5)

+----------+---------+----------+---------+--------------------+---------------------+-----------+--------------+----------+-------------+-----------+------------+----------+-------------+-----------+-----------+
| start_lon|start_lat|   end_lon|  end_lat|trip_pickup_datetime|trip_dropoff_datetime|dow_dropoff|strdow_dropoff|dow_pickup|strdow_pickup|hour_pickup|hour_dropoff|is_weekend|start_cell_id|end_cell_id|distance_km|
+----------+---------+----------+---------+--------------------+---------------------+-----------+--------------+----------+-------------+-----------+------------+----------+-------------+-----------+-----------+
|-73.974105|40.742892|-73.973769|40.746405| 2009-05-27 07:41:05|  2009-05-27 07:42:28|          3|           Wed|         3|          Wed|          7|           7|         0|     89c2590c|   89c25904| 0.39165202|
|-74.008148|40.738854|-74.015638|40.714536| 2009-05-27 07:51:06|  2009-05-27 07:58:43|          3|           Wed|         3|          Wed|          

### Join SF and NYC data

In [49]:
nyc_sf_df.unpersist()

DataFrame[start_lon: double, start_lat: double, end_lon: double, end_lat: double, trip_pickup_datetime: timestamp, trip_dropoff_datetime: timestamp, dow_dropoff: int, strdow_dropoff: string, dow_pickup: int, strdow_pickup: string, hour_pickup: int, hour_dropoff: int, is_weekend: int, start_cell_id: string, end_cell_id: string, distance_km: float, city: string]

In [45]:
# nyc_sf_df = nyc_df.union(sf_df)
# nyc_sf_df.cache()
# %time nyc_sf_df.count()

CPU times: user 8.41 ms, sys: 3.41 ms, total: 11.8 ms
Wall time: 38.1 s


15249659

In [46]:
len(nyc_sf_df.columns)

17

In [47]:
nyc_sf_df.show(5)

+----------+---------+----------+---------+--------------------+---------------------+-----------+--------------+----------+-------------+-----------+------------+----------+-------------+-----------+-----------+----+
| start_lon|start_lat|   end_lon|  end_lat|trip_pickup_datetime|trip_dropoff_datetime|dow_dropoff|strdow_dropoff|dow_pickup|strdow_pickup|hour_pickup|hour_dropoff|is_weekend|start_cell_id|end_cell_id|distance_km|city|
+----------+---------+----------+---------+--------------------+---------------------+-----------+--------------+----------+-------------+-----------+------------+----------+-------------+-----------+-----------+----+
|-73.974105|40.742892|-73.973769|40.746405| 2009-05-27 07:41:05|  2009-05-27 07:42:28|          3|           Wed|         3|          Wed|          7|           7|         0|     89c2590c|   89c25904| 0.39165202|  NY|
|-74.008148|40.738854|-74.015638|40.714536| 2009-05-27 07:51:06|  2009-05-27 07:58:43|          3|           Wed|         3|    

In [54]:
# nyc_df.write.csv("../data/nyc_sf_df")

### Gather features by cell

In [48]:
def pivot_column(df, group_col, pv_col, prefix):
    orig_cols = set(df.columns)
    new_df = df.groupBy(group_col).pivot(pv_col).count()
    new_cols = new_df.columns
    new_cols = [c if c in orig_cols else prefix + c for c in new_cols]
    new_df = new_df.toDF(*new_cols)
    new_df = new_df.na.fill(0, list(set(new_cols) - orig_cols))
    return new_df

In [39]:
def create_features(df):
    dow_pickup_df = pivot_column(df, group_col = "start_cell_id", 
                                     pv_col = "dow_pickup", prefix = "dow_pickup_")\
                                     .withColumnRenamed("start_cell_id", "cell_id")

    dow_dropoff_df = pivot_column(df, group_col = "end_cell_id", 
                                      pv_col = "dow_dropoff", prefix = "dow_dropoff_")\
                                      .withColumnRenamed("end_cell_id", "cell_id")

    hr_pickup_wkday_df = pivot_column(df.filter(df.dow_pickup<6), 
                                                               group_col = "start_cell_id", 
                                                               pv_col = "hour_pickup", 
                                                               prefix = "hr_pickup_wkday")\
                                                               .withColumnRenamed("start_cell_id", "cell_id")

    hr_dropoff_wkday_df = pivot_column(df.filter(df.dow_dropoff<6), 
                                                               group_col = "end_cell_id", 
                                                               pv_col = "hour_dropoff", 
                                                               prefix = "hr_dropoff_wkday")\
                                                               .withColumnRenamed("end_cell_id", "cell_id")

    hr_pickup_wkend_df = pivot_column(df.filter(df.dow_pickup>=6), 
                                                               group_col = "start_cell_id", 
                                                               pv_col = "hour_pickup", 
                                                               prefix = "hr_pickup_wkend")\
                                                               .withColumnRenamed("start_cell_id", "cell_id")

    hr_dropoff_wkend_df = pivot_column(df.filter(df.dow_dropoff>=6), 
                                                               group_col = "end_cell_id", 
                                                               pv_col = "hour_dropoff", 
                                                               prefix = "hr_dropoff_wkend")\
                                                               .withColumnRenamed("end_cell_id", "cell_id")

    pickup_km_df = df.groupBy("start_cell_id").agg(avg("distance_km"),stddev("distance_km"))\
                        .withColumnRenamed("avg(distance_km)", "avg_pickup_km")\
                        .withColumnRenamed("stddev_samp(distance_km)", "std_pickup_km")\
                        .withColumnRenamed("start_cell_id", "cell_id")

    dropoff_km_df = df.groupBy("end_cell_id").agg(avg("distance_km"),stddev("distance_km"))\
                        .withColumnRenamed("avg(distance_km)", "avg_dropoff_km")\
                        .withColumnRenamed("stddev_samp(distance_km)", "std_dropoff_km")\
                        .withColumnRenamed("end_cell_id", "cell_id")

    cell_df = merge_dfs([dow_pickup_df, dow_dropoff_df, 
              hr_pickup_wkday_df, hr_dropoff_wkday_df,
              hr_pickup_wkend_df, hr_dropoff_wkend_df,
              pickup_km_df, dropoff_km_df
             ])

    cell_df = cell_df.na.fill(0)

    cell_df.printSchema()
    
    feature_columns = [c for c in cell_df.columns if c != "cell_id"]

    for c in feature_columns:
        cell_df = cell_df.withColumn(c+"_f", cell_df[c].cast(FloatType()))
        cell_df = cell_df.drop(c)
        cell_df = cell_df.withColumnRenamed(c+"_f",c)

    cell_df.cache()

    print(cell_df.count(), len(cell_df.columns))

    return cell_df

In [40]:
def merge_dfs(dfs):
    first_df = None
    for df in dfs:
        if first_df is None:
            first_df = df
        else:
            first_df = first_df.join(df, "cell_id", "left_outer")
    return first_df

In [50]:
nyc_df_cell = create_features(nyc_df)
nyc_df_cell.cache()

root
 |-- cell_id: string (nullable = true)
 |-- dow_pickup_1: long (nullable = true)
 |-- dow_pickup_2: long (nullable = true)
 |-- dow_pickup_3: long (nullable = true)
 |-- dow_pickup_4: long (nullable = true)
 |-- dow_pickup_5: long (nullable = true)
 |-- dow_pickup_6: long (nullable = true)
 |-- dow_pickup_7: long (nullable = true)
 |-- dow_dropoff_1: long (nullable = true)
 |-- dow_dropoff_2: long (nullable = true)
 |-- dow_dropoff_3: long (nullable = true)
 |-- dow_dropoff_4: long (nullable = true)
 |-- dow_dropoff_5: long (nullable = true)
 |-- dow_dropoff_6: long (nullable = true)
 |-- dow_dropoff_7: long (nullable = true)
 |-- hr_pickup_wkday0: long (nullable = true)
 |-- hr_pickup_wkday1: long (nullable = true)
 |-- hr_pickup_wkday2: long (nullable = true)
 |-- hr_pickup_wkday3: long (nullable = true)
 |-- hr_pickup_wkday4: long (nullable = true)
 |-- hr_pickup_wkday5: long (nullable = true)
 |-- hr_pickup_wkday6: long (nullable = true)
 |-- hr_pickup_wkday7: long (nullable =

DataFrame[cell_id: string, dow_pickup_1: float, dow_pickup_2: float, dow_pickup_3: float, dow_pickup_4: float, dow_pickup_5: float, dow_pickup_6: float, dow_pickup_7: float, dow_dropoff_1: float, dow_dropoff_2: float, dow_dropoff_3: float, dow_dropoff_4: float, dow_dropoff_5: float, dow_dropoff_6: float, dow_dropoff_7: float, hr_pickup_wkday0: float, hr_pickup_wkday1: float, hr_pickup_wkday2: float, hr_pickup_wkday3: float, hr_pickup_wkday4: float, hr_pickup_wkday5: float, hr_pickup_wkday6: float, hr_pickup_wkday7: float, hr_pickup_wkday8: float, hr_pickup_wkday9: float, hr_pickup_wkday10: float, hr_pickup_wkday11: float, hr_pickup_wkday12: float, hr_pickup_wkday13: float, hr_pickup_wkday14: float, hr_pickup_wkday15: float, hr_pickup_wkday16: float, hr_pickup_wkday17: float, hr_pickup_wkday18: float, hr_pickup_wkday19: float, hr_pickup_wkday20: float, hr_pickup_wkday21: float, hr_pickup_wkday22: float, hr_pickup_wkday23: float, hr_dropoff_wkday0: float, hr_dropoff_wkday1: float, hr_dro

In [51]:
sf_df_cell = create_features(sf_df)
sf_df_cell.cache()

root
 |-- cell_id: string (nullable = true)
 |-- dow_pickup_1: long (nullable = true)
 |-- dow_pickup_2: long (nullable = true)
 |-- dow_pickup_3: long (nullable = true)
 |-- dow_pickup_4: long (nullable = true)
 |-- dow_pickup_5: long (nullable = true)
 |-- dow_pickup_6: long (nullable = true)
 |-- dow_pickup_7: long (nullable = true)
 |-- dow_dropoff_1: long (nullable = true)
 |-- dow_dropoff_2: long (nullable = true)
 |-- dow_dropoff_3: long (nullable = true)
 |-- dow_dropoff_4: long (nullable = true)
 |-- dow_dropoff_5: long (nullable = true)
 |-- dow_dropoff_6: long (nullable = true)
 |-- dow_dropoff_7: long (nullable = true)
 |-- hr_pickup_wkday0: long (nullable = true)
 |-- hr_pickup_wkday1: long (nullable = true)
 |-- hr_pickup_wkday2: long (nullable = true)
 |-- hr_pickup_wkday3: long (nullable = true)
 |-- hr_pickup_wkday4: long (nullable = true)
 |-- hr_pickup_wkday5: long (nullable = true)
 |-- hr_pickup_wkday6: long (nullable = true)
 |-- hr_pickup_wkday7: long (nullable =

DataFrame[cell_id: string, dow_pickup_1: float, dow_pickup_2: float, dow_pickup_3: float, dow_pickup_4: float, dow_pickup_5: float, dow_pickup_6: float, dow_pickup_7: float, dow_dropoff_1: float, dow_dropoff_2: float, dow_dropoff_3: float, dow_dropoff_4: float, dow_dropoff_5: float, dow_dropoff_6: float, dow_dropoff_7: float, hr_pickup_wkday0: float, hr_pickup_wkday1: float, hr_pickup_wkday2: float, hr_pickup_wkday3: float, hr_pickup_wkday4: float, hr_pickup_wkday5: float, hr_pickup_wkday6: float, hr_pickup_wkday7: float, hr_pickup_wkday8: float, hr_pickup_wkday9: float, hr_pickup_wkday10: float, hr_pickup_wkday11: float, hr_pickup_wkday12: float, hr_pickup_wkday13: float, hr_pickup_wkday14: float, hr_pickup_wkday15: float, hr_pickup_wkday16: float, hr_pickup_wkday17: float, hr_pickup_wkday18: float, hr_pickup_wkday19: float, hr_pickup_wkday20: float, hr_pickup_wkday21: float, hr_pickup_wkday22: float, hr_pickup_wkday23: float, hr_dropoff_wkday0: float, hr_dropoff_wkday1: float, hr_dro

In [52]:
nyc_df_cell = nyc_df_cell.withColumn("city", lit("NY"))
sf_df_cell = sf_df_cell.withColumn("city", lit("SF"))

In [54]:
nyc_sf_df_cell = nyc_df_cell.union(sf_df_cell)
nyc_sf_df_cell.cache()
%time nyc_sf_df.count()

CPU times: user 6.63 ms, sys: 2.64 ms, total: 9.27 ms
Wall time: 2.76 s


15249659

In [55]:
(nyc_sf_df_cell.count(), len(nyc_sf_df_cell.columns))

(6687, 116)

In [56]:
nyc_df_cell.unpersist()
sf_df_cell.unpersist()

DataFrame[cell_id: string, dow_pickup_1: float, dow_pickup_2: float, dow_pickup_3: float, dow_pickup_4: float, dow_pickup_5: float, dow_pickup_6: float, dow_pickup_7: float, dow_dropoff_1: float, dow_dropoff_2: float, dow_dropoff_3: float, dow_dropoff_4: float, dow_dropoff_5: float, dow_dropoff_6: float, dow_dropoff_7: float, hr_pickup_wkday0: float, hr_pickup_wkday1: float, hr_pickup_wkday2: float, hr_pickup_wkday3: float, hr_pickup_wkday4: float, hr_pickup_wkday5: float, hr_pickup_wkday6: float, hr_pickup_wkday7: float, hr_pickup_wkday8: float, hr_pickup_wkday9: float, hr_pickup_wkday10: float, hr_pickup_wkday11: float, hr_pickup_wkday12: float, hr_pickup_wkday13: float, hr_pickup_wkday14: float, hr_pickup_wkday15: float, hr_pickup_wkday16: float, hr_pickup_wkday17: float, hr_pickup_wkday18: float, hr_pickup_wkday19: float, hr_pickup_wkday20: float, hr_pickup_wkday21: float, hr_pickup_wkday22: float, hr_pickup_wkday23: float, hr_dropoff_wkday0: float, hr_dropoff_wkday1: float, hr_dro

`nyc_sf_df_cell` is the final dataframe