In [None]:
from datetime import datetime
from datetime import timedelta  
import numpy as np
import pandas as pd
import random
import time

# Filtering for a month

In [None]:
def validate_pick_up(row):
    """Tell whether a trip's pickup point lies inside the pickup bounding box.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'pickup_latitude'`` and ``'pickup_longitude'``
        (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    bool
        ``True`` when 40.63596 <= latitude <= 40.66345 and
        -73.8132848 <= longitude <= -73.7664489 (bounds inclusive),
        ``False`` otherwise.
    """
    latitude = row['pickup_latitude']
    if not (40.63596 <= latitude <= 40.66345):
        # Latitude already rules the point out; longitude is not consulted.
        return False
    longitude = row['pickup_longitude']
    return bool(-73.8132848 <= longitude <= -73.7664489)

In [None]:
def validate_drop_off(row):
    """Tell whether a trip's drop-off point lies inside the drop-off region.

    The region is the intersection of four open half-planes: two bounded by
    parallel lines ``lat = 1.509835492 * lon + c`` and two bounded by
    parallel lines ``lat = c - 0.377522883 * lon`` (written below in the
    equivalent "solve for lon" form).

    Parameters
    ----------
    row : mapping-like
        Must provide ``'dropoff_latitude'`` and ``'dropoff_longitude'``.

    Returns
    -------
    bool
        ``True`` only when the point is strictly inside all four boundaries.
    """
    lat = row['dropoff_latitude']
    lon = row['dropoff_longitude']

    steep_slope = 1.509835492
    shallow_slope = 0.377522883

    # Above the lower steep boundary.
    if not lat > (steep_slope*lon + 152.4025459):
        return False
    # Left of the upper shallow boundary.
    if not lon < ((12.9717472 - lat)/shallow_slope):
        return False
    # Right of the lower shallow boundary.
    if not lon > ((12.75605988 - lat)/shallow_slope):
        return False
    # Below the upper steep boundary.
    return bool(lat < (steep_slope*lon + 152.5011075))

In [None]:
def drop_irrelevant_columns(filtered_data):
    """Return ``filtered_data`` without its payment, fare and vendor columns.

    The input frame is left untouched; a new DataFrame is returned.
    Raises ``KeyError`` (from ``DataFrame.drop``) if any of the listed
    columns is absent.
    """
    unwanted_columns = [
        "payment_type",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "total_amount",
        "RatecodeID",
        "store_and_fwd_flag",
        "VendorID",
    ]
    return filtered_data.drop(columns=unwanted_columns)

In [None]:
def _row_mask(frame, predicate):
    """Apply ``predicate`` to each row of ``frame`` and return a boolean mask."""
    if frame.empty:
        # DataFrame.apply(axis=1) never calls the predicate per row on an
        # empty frame and is not guaranteed to hand back a boolean Series,
        # so build the (empty) mask explicitly.
        return pd.Series(False, index=frame.index, dtype=bool)
    return frame.apply(predicate, axis=1)


def filter_data(df=None, path_to_file=None):
    """Select the trips that pass the pickup/drop-off, fare, duration and
    distance filters.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Trip records whose ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime``
        columns already hold datetimes. Not modified.
    path_to_file : str, optional
        CSV to load when ``df`` is None. Its two datetime columns are parsed
        with the format ``'%Y-%m-%d %H:%M:%S'``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by pickup time that satisfy all of:

        * ``validate_pick_up`` and ``validate_drop_off`` are true,
        * ``0 < total_amount < 500``,
        * ``0 <= time_diff <= 180`` where ``time_diff`` is the full trip
          duration in minutes (added as a new column),
        * ``trip_distance < 100`` before conversion and ``> 18000`` after
          being multiplied by 1609.34 (presumably miles -> metres; confirm
          against the data dictionary).

    Raises
    ------
    ValueError
        If neither ``df`` nor ``path_to_file`` is given, or a datetime string
        in the CSV does not match the expected format.
    """
    if df is None:
        if path_to_file is None:
            raise ValueError("filter_data needs either a DataFrame or a path_to_file")
        df = pd.read_csv(path_to_file)
        for column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
            df[column] = pd.to_datetime(df[column], format='%Y-%m-%d %H:%M:%S')

    df = df.sort_values(by='tpep_pickup_datetime')

    filtered_pickup = df[_row_mask(df, validate_pick_up)]
    filtered_data = filtered_pickup[_row_mask(filtered_pickup, validate_drop_off)]

    amount = filtered_data['total_amount']
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of ``df``.
    filtered_data = filtered_data.loc[(amount > 0) & (amount < 500)].copy()

    duration = pd.to_timedelta(
        filtered_data['tpep_dropoff_datetime'] - filtered_data['tpep_pickup_datetime']
    )
    # total_seconds() keeps the days component and the sign; the .seconds
    # attribute is always in [0, 86400) and would make a 1-day-10-minute trip
    # (or a drop-off recorded before the pickup) look like a short ride.
    filtered_data['time_diff'] = duration.dt.total_seconds() / 60

    filtered_data = filtered_data.loc[filtered_data['time_diff'].between(0, 180.00)]
    filtered_data = filtered_data.loc[filtered_data['trip_distance'] < 100.00].copy()
    filtered_data['trip_distance'] = filtered_data['trip_distance'] * 1609.34
    filtered_data = filtered_data.loc[filtered_data['trip_distance'] > 18000.00]
    return filtered_data

In [None]:
def generateRandomBoolean(x):
    """Return one random bit as the int ``0`` or ``1``.

    ``x`` is ignored; the parameter exists so the function can be handed to
    ``DataFrame.apply(..., axis=1)``, which passes each row as an argument.
    """
    coin_flip = random.getrandbits(1)
    return coin_flip

In [None]:
def generateRandomNoOfPassengers(x):
    """Return a passenger count picked uniformly at random from 1 and 2.

    ``x`` is ignored; the parameter exists so the function can be handed to
    ``DataFrame.apply(..., axis=1)``, which passes each row as an argument.
    """
    possible_counts = [1, 2]
    return random.choice(possible_counts)

In [None]:
# Seed the RNG so the randomly assigned columns below come out the same on
# every Restart & Run All; without it the saved dataset changes on each run.
random.seed(42)

# initial filtering: load the raw monthly CSV and keep only the qualifying trips
filtered_data = filter_data(path_to_file='RideSharingData/yellow_tripdata_2016-03.csv')
# drop irrelevant columns
df = drop_irrelevant_columns(filtered_data)
# assigning random willingness to walk (0 or 1 per trip)
df['isWillingToWalk'] = df.apply(generateRandomBoolean, axis = 1)
# assigning random passenger count (1 or 2 per trip)
df['numberOfPassengers'] = df.apply(generateRandomNoOfPassengers, axis = 1)

In [None]:
# Persist the filtered month as a comma-separated file (the DataFrame index
# is written too, since to_csv keeps it by default).
df.to_csv('RideSharingData/filtered_data_Mar2016.csv', sep=",")


In [None]:
# Sanity check: display the (rows, columns) shape of the filtered frame.
df.shape

# Filtering for a specific date and time


In [None]:
# filter on basis of date_time
def filter_on_start_datetime(start_date_time, stop_date_time, df):
    """Keep the trips whose pickup time lies in ``[start_date_time, stop_date_time]``.

    Parameters
    ----------
    start_date_time, stop_date_time : datetime-like or None
        Inclusive bounds on ``tpep_pickup_datetime``. A bound that is ``None``
        (or otherwise falsy) is not applied, so either side may be left open.
    df : pandas.DataFrame
        Must contain ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``,
        either as ``'%Y-%m-%d %H:%M:%S'`` strings or already parsed.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with both datetime columns parsed and only the rows
        inside the requested window.

    Raises
    ------
    ValueError
        If a datetime string does not match the expected format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # assign() returns a new frame, so the caller's data stays as it was and
    # the function can safely be re-run on the same input; to_datetime passes
    # already-parsed columns through unchanged.
    parsed = df.assign(
        tpep_pickup_datetime=pd.to_datetime(df['tpep_pickup_datetime'], format=timestamp_format),
        tpep_dropoff_datetime=pd.to_datetime(df['tpep_dropoff_datetime'], format=timestamp_format),
    )

    pickup_time = parsed['tpep_pickup_datetime']
    in_window = pd.Series(True, index=parsed.index)
    if start_date_time:
        in_window = in_window & (pickup_time >= start_date_time)
    if stop_date_time:
        in_window = in_window & (pickup_time <= stop_date_time)
    return parsed.loc[in_window]

In [None]:
# Pickup-time window to extract (both ends inclusive).
start_date_time_str = '2016-06-10 08:00:00'
stop_date_time_str = '2016-06-10 22:00:00'
start_date_time, stop_date_time = (
    datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    for stamp in (start_date_time_str, stop_date_time_str)
)

# Load the monthly data and cut it down to the window.
# NOTE(review): the cells above write filtered_data_Mar2016.csv, not this
# June file, so it has to exist already for this cell to run.
monthly_df = pd.read_csv("RideSharingData/filtered_data_Jun2016.csv")
daily_df = filter_on_start_datetime(start_date_time, stop_date_time, df=monthly_df)

In [None]:
# Persist the selected time window as a comma-separated file (the DataFrame
# index is written too, since to_csv keeps it by default).
daily_df.to_csv('RideSharingData/dataset_C.csv', sep=",")