# Hands-on Lab: Student Assignments

In [None]:
# load modules
import pandas as pd
import numpy as np
import pygdf 

In [None]:
from numba import jit

In [None]:
%load_ext autotime

In [None]:
# Yellow Cab Taxi data.
# The file names end in 2009-01 ... 2009-07 (presumably one file per month).
# The shared directory and name pattern live in a single template so that
# relocating the data set only requires editing one line.
TAXI_FILE_TEMPLATE = '../data/nytaxi/yellow_tripdata_2009-{:02d}.csv'

taxi_file_1 = TAXI_FILE_TEMPLATE.format(1)
taxi_file_2 = TAXI_FILE_TEMPLATE.format(2)
taxi_file_3 = TAXI_FILE_TEMPLATE.format(3)
taxi_file_4 = TAXI_FILE_TEMPLATE.format(4)
taxi_file_5 = TAXI_FILE_TEMPLATE.format(5)
taxi_file_6 = TAXI_FILE_TEMPLATE.format(6)
taxi_file_7 = TAXI_FILE_TEMPLATE.format(7)

In [None]:
# Define the data

# Column names, in the order they are passed to pd.read_csv via `names=`.
cols = [
    "vendor_name",
    "Trip_Pickup_DateTime",
    "Trip_Dropoff_DateTime",
    "Passenger_Count",
    "Trip_Distance",
    "Start_Lon",
    "Start_Lat",
    "Rate_Code",
    "store_and_forward",
    "End_Lon",
    "End_Lat",
    "Payment_Type",
    "Fare_Amt",
    "surcharge",
    "mta_tax",
    "Tip_Amt",
    "Tolls_Amt",
    "Total_Amt"
]

# dtype for each column, passed to pd.read_csv via `dtype=`.
#
# Distances and monetary amounts use float32 rather than float16: half
# precision keeps only about three significant decimal digits (and tops out
# at 65504), so amounts above roughly $16 can no longer be resolved to the
# cent, which would distort the tip/fare statistics asked for in the assignment.
dtypes = {
    "vendor_name"           : str,
    "Trip_Pickup_DateTime"  : str,
    "Trip_Dropoff_DateTime" : str,
    "Passenger_Count"       : "int8",
    "Trip_Distance"         : "float32",
    "Start_Lon"             : float,
    "Start_Lat"             : float,
    "Rate_Code"             : float,
    "store_and_forward"     : float,
    "End_Lon"               : float,
    "End_Lat"               : float,
    "Payment_Type"          : str,
    "Fare_Amt"              : "float32",
    "surcharge"             : "float32",
    "mta_tax"               : float,
    "Tip_Amt"               : "float32",
    "Tolls_Amt"             : "float32",
    "Total_Amt"             : "float32"
}

In [None]:
# The data files contain a header line; skip it (skiprows=1) and apply our own
# column names and dtypes instead.
read_opts = dict(names=cols, dtype=dtypes, skiprows=1)

# Files to load. Append taxi_file_2 ... taxi_file_7 to analyse more data.
taxi_files = [taxi_file_1]
frames = [pd.read_csv(path, **read_opts) for path in taxi_files]

# df1 is kept because the original cell defined it and other cells may use it.
df1 = frames[0]

# ignore_index=True gives the combined frame a fresh 0..n-1 index, so row
# labels stay unique once more than one file is concatenated.
df = pd.concat(frames, ignore_index=True)


In [None]:
# Show the dtype of every column in df.
df.dtypes

# Clean up the data (on the CPU, since we are working with strings)

In [None]:
# The following fields are typically blank, so drop them
df = df.drop(columns=['Rate_Code', 'store_and_forward','mta_tax'])

In [None]:
def Convert_to_Category(df, columns):
    """Upper-case the given string columns and store them as pandas categoricals.

    Parameters
    ----------
    df : DataFrame whose listed columns are overwritten in place.
    columns : iterable of column labels; each column must support ``.str``.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    for name in columns:
        # upper-case first so labels differing only in case share one category
        df[name] = df[name].str.upper().astype('category')

    return df

In [None]:
# Upper-case the two label columns and convert them to categoricals.
df = Convert_to_Category(df, ['vendor_name', 'Payment_Type'])

### Convert categories to integers. Note: remember the category mappings, because the labels are replaced by their codes

In [None]:
# List the vendor_name categories; each label's position in this index is the
# integer code that .cat.codes assigns to it.
df['vendor_name'].cat.categories

In [None]:
# Replace the vendor labels with their integer category codes.
# NOTE: afterwards the column is no longer categorical, so re-running this
# cell (or any `.cat` access on the column) fails until the data is reloaded.
df['vendor_name'] = df['vendor_name'].cat.codes

In [None]:
# List the Payment_Type categories; each label's position in this index is the
# integer code that .cat.codes assigns to it.
df['Payment_Type'].cat.categories

In [None]:
# Replace the payment-type labels with their integer category codes.
# NOTE: afterwards the column is no longer categorical, so re-running this
# cell (or any `.cat` access on the column) fails until the data is reloaded.
df['Payment_Type'] = df['Payment_Type'].cat.codes

In [None]:
# Show the column dtypes again, now that the label columns hold integer codes.
df.dtypes

In [None]:
def ExtractHour(time_col):
    """Return the hour of day for each timestamp string in ``time_col``.

    The values are parsed with the fixed format ``%Y-%m-%d %H:%M:%S``.
    """
    parsed = pd.to_datetime(time_col, format='%Y-%m-%d %H:%M:%S')
    return parsed.dt.hour

In [None]:
def ExtractDayOfWeek(time_col):
    """Return the day of week (Monday=0 ... Sunday=6) for each timestamp string.

    The values are parsed with the fixed format ``%Y-%m-%d %H:%M:%S``.
    """
    parsed = pd.to_datetime(time_col, format='%Y-%m-%d %H:%M:%S')
    return parsed.dt.dayofweek

In [None]:
# Derive the hour of day from the pickup and drop-off timestamps.
df['Pickup_Hour'] = ExtractHour(df['Trip_Pickup_DateTime'])
df['Dropoff_Hour'] = ExtractHour(df['Trip_Dropoff_DateTime'])

In [None]:
# Derive the day of week from the pickup and drop-off timestamps.
# NOTE(review): the earlier comment said each file is one day, but the file
# names (yellow_tripdata_2009-01 ...) look like one file per month — confirm.
df['Pickup_DayOfWeek'] = ExtractDayOfWeek(df['Trip_Pickup_DateTime'])
df['Dropoff_DayOfWeek'] = ExtractDayOfWeek(df['Trip_Dropoff_DateTime'])

In [None]:
# Drop the raw date-time string columns; the following questions only need the
# derived hour (and day-of-week) columns.
df = df.drop(columns=['Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime'])

## Create the GDF and start processing on the GPU

In [None]:
# Build a pygdf DataFrame from the cleaned pandas DataFrame.
gdf = pygdf.DataFrame.from_pandas(df)

### Scale the Lat and Lon to make grouping easier


In [None]:
from math import floor

def round_latlon(x):
    """Round ``x`` down to the nearest multiple of 1/5 (i.e. 0.2)."""
    steps_per_unit = 5
    return floor(x * steps_per_unit) / steps_per_unit

def round_columns(df, columns):
    """Apply ``round_latlon`` element-wise to each listed column of ``df``.

    Every listed column is replaced in ``df`` by the result of its
    ``applymap(round_latlon)`` call; the same ``df`` object is returned.
    """
    for name in columns:
        df[name] = df[name].applymap(round_latlon)

    return df

In [None]:
# use applymap (via round_columns) to round the four coordinate columns
gdf = round_columns(gdf, ['Start_Lon', 'Start_Lat', 'End_Lon', 'End_Lat'])

---

# Assignment


## What is the average tip ratio?  

## What hour of the day yields the highest average tip?

## Did the longest trip get a good tip?

## Do riders who pay cash tip better than riders who pay by credit card?