In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import random
import math
import xgboost as xgb

from copy import copy
import pickle

In [4]:
#load test data
taxi_test = pd.read_csv('data/test.csv')

In [5]:
#load xgb model
# NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
# The context manager closes the file handle as soon as the model has been read.
with open('xgb_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)



## Testing Data / Data Preprocessing

In [39]:
test_set = taxi_test.head(5)

In [40]:
test_set

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [11]:
def clean_df(input_df):
    """Derive pickup-time features and keep only the modelling columns.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column that ``pd.to_datetime``
        can parse. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding whichever of the columns in ``col_list`` are
        present after the pickup-time features have been added.
    """
    col_list = ['pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'trip_duration',
        'pickup_minute',
        'pickup_hour',
        'pickup_month',
        'pickup_day',
        'pickup_weekday']

    # Parse once, then use the vectorised .dt accessors instead of a
    # row-by-row apply for every feature.
    pickup = pd.to_datetime(input_df['pickup_datetime'])

    # assign() returns a new frame, so the caller's data is left untouched.
    df = input_df.assign(
        pickup_minute=pickup.dt.minute,
        pickup_hour=pickup.dt.hour,
        pickup_month=pickup.dt.month,
        pickup_day=pickup.dt.day,
        pickup_weekday=pickup.dt.weekday,  # Monday == 0, same as datetime.weekday()
    )

    return df[df.columns.intersection(col_list)]

In [41]:
#sample dataset
test_df = clean_df(test_set)[['pickup_longitude', 'pickup_latitude']]

#Sample date
test_date = datetime(2020, 5, 17, 17)

In [42]:
test_df

Unnamed: 0,pickup_longitude,pickup_latitude
0,-73.988129,40.732029
1,-73.964203,40.679993
2,-73.997437,40.737583
3,-73.95607,40.7719
4,-73.970215,40.761475


## Auxiliary Functions

In [26]:
#converting seconds into hour, minutes, seconds, microseconds
def convert(start_time, add):
    """Return ``start_time`` advanced by ``add`` seconds.

    Parameters
    ----------
    start_time : datetime.datetime
        Moment the trip starts.
    add : int or float (Python or NumPy scalar)
        Duration in seconds; may be fractional.

    Returns
    -------
    datetime.datetime
        ``start_time`` plus the full duration.

    Notes
    -----
    The duration used to be reduced modulo 24 hours before being added, which
    dropped whole days and turned a slightly negative duration into almost a
    full extra day. The complete duration is now added as-is.
    """
    # float() lets NumPy scalars (e.g. float32 model outputs) through;
    # timedelta only accepts built-in int/float values.
    return start_time + timedelta(seconds=float(add))

In [27]:
#setup function: pair each point with the next one as pickup/dropoff columns
def setup(x):
    hold = pd.concat([x, x.shift(-1)], axis = 1).dropna()
    hold.columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'] 
    return(hold)

In [28]:
#prepare dataframe to be inputted into model
def set_model_input(x, ind, time):
    """Build the single-row feature frame for one trip leg.

    The row at position ``ind`` of ``x`` is turned into a one-row DataFrame,
    ``time`` is attached as its ``pickup_datetime``, and the result is passed
    through ``clean_df``.
    """
    leg_row = x.iloc[ind]
    single_leg = leg_row.to_frame().transpose()
    single_leg['pickup_datetime'] = time
    return clean_df(single_leg)

In [29]:
#return total time of a set of trips
def get_total_time(x, start_time):
    """Sum the predicted durations of the consecutive trips through ``x``.

    ``x`` is an ordered frame of points; ``setup`` turns it into legs. Each leg
    is scored with the module-level ``model`` at the current clock time, and
    the clock is then advanced by that leg's predicted duration so the next
    leg starts when the previous one ends.

    The raw prediction is mapped through ``exp(p) - 1`` (presumably undoing a
    log1p-transformed training target; confirm against the training code).

    Returns the sum of the per-leg durations (``0`` when there are no legs).
    """
    legs = setup(x)
    clock = start_time
    durations = []

    for leg_idx in range(len(legs)):
        features = set_model_input(legs, leg_idx, clock)
        raw_pred = model.predict(xgb.DMatrix(features))
        seconds = (np.exp(raw_pred) - 1)[0]
        durations.append(seconds)
        # the next leg departs once this one has finished
        clock = convert(clock, seconds)

    return sum(durations)

In [43]:
get_total_time(test_df, test_date)

5373.680145263672

# GA functions

In [32]:
#get initial population for GA
def get_init_pop(x, num):
    """Create ``num`` random orderings of the integers ``0 .. x-1``.

    Each individual is a list produced by ``random.sample`` over the full
    range, i.e. a random permutation. Returns a list of ``num`` such lists.
    """
    genes = list(range(x))
    return [random.sample(genes, x) for _ in range(num)]

In [44]:
init_pop =  get_init_pop(len(test_df),20)

In [45]:
init_pop

[[2, 1, 4, 3, 0],
 [4, 3, 1, 2, 0],
 [4, 2, 1, 3, 0],
 [4, 0, 3, 2, 1],
 [1, 4, 3, 2, 0],
 [1, 2, 0, 4, 3],
 [4, 3, 0, 2, 1],
 [1, 2, 3, 4, 0],
 [1, 0, 4, 3, 2],
 [4, 3, 1, 2, 0],
 [4, 1, 3, 0, 2],
 [0, 2, 4, 3, 1],
 [1, 4, 0, 3, 2],
 [4, 3, 1, 2, 0],
 [4, 0, 3, 2, 1],
 [0, 4, 2, 3, 1],
 [1, 2, 0, 4, 3],
 [3, 0, 1, 2, 4],
 [2, 1, 0, 4, 3],
 [0, 4, 3, 1, 2]]

In [47]:
#this is a supplementary function which will help with our threshold
def condense(input_df):
    """Collapse duplicate individuals, summing their ``rank`` and ``chance``.

    Rows are grouped on every column except ``rank`` and ``chance``; within
    each group the remaining columns are summed and the group keys are
    restored as regular columns.

    The grouping keys are taken in the frame's own column order. They used to
    be built from a ``set``, whose iteration order is arbitrary, so the output
    column order and row sort order could change from run to run.
    """
    key_cols = [col for col in input_df.columns if col not in ('rank', 'chance')]
    return input_df.groupby(key_cols).sum().reset_index()

In [50]:
#TEST function for possible future optimization
def rank_test(input_pop, orig_pts, datetime):
    """Rank a population, evaluating each distinct ordering only once.

    Parameters
    ----------
    input_pop : sequence of orderings
        One row per individual; each row lists index labels of ``orig_pts``.
    orig_pts : pandas.DataFrame
        Points to visit; rows are looked up by label via ``reindex``.
    datetime : datetime.datetime
        Start time passed to ``get_total_time``. (The name shadows the
        imported ``datetime`` class inside this function only.)

    Returns
    -------
    pandas.DataFrame
        One row per individual in ``input_pop`` with ``total_time``, ``rank``
        (largest for the shortest total time) and ``chance`` (rank scaled so
        the column sums to 1).
    """
    hold_df = pd.DataFrame(input_pop)
    chrom_cols = list(hold_df.columns)

    # .copy() so the new column is not written into a slice of hold_df
    condensed = hold_df.drop_duplicates().copy()

    # Score each unique ordering from its own row. Indexing input_pop by the
    # position within `condensed` would pick the wrong individuals once any
    # duplicate has been dropped.
    condensed['total_time'] = [
        get_total_time(orig_pts.reindex(order), datetime)
        for order in condensed[chrom_cols].to_numpy().tolist()
    ]

    # copy the fitness back onto every individual, duplicates included
    joined_df = pd.merge(hold_df, condensed, how='left', on=chrom_cols)

    #add rank column which gives the highest rank to the combination with the lowest total trip duration
    joined_df['rank'] = len(joined_df) - joined_df['total_time'].rank() + 1

    #add chance column which uses rank to calculate probability of being chosen as a parent
    ##higher fitness means higher chance of breeding
    joined_df['chance'] = joined_df['rank']*2/(len(joined_df)*(len(joined_df)+1))

    return joined_df

In [51]:
#prepare/rank initial population
def rank(input_pop, orig_pts, datetime):
    """Score and rank every individual of a population.

    For each ordering in ``input_pop`` the rows of ``orig_pts`` are arranged
    with ``reindex`` and the total predicted trip time from ``datetime`` is
    computed with ``get_total_time``.

    Returns a DataFrame with one row per individual: the ordering itself plus
    ``total_time``, ``rank`` (largest for the shortest total time) and
    ``chance`` (rank scaled so that the column sums to 1).
    """
    ranked = pd.DataFrame(input_pop)

    # fitness: total predicted duration of each candidate ordering
    ranked['total_time'] = [
        get_total_time(orig_pts.reindex(input_pop[k]), datetime)
        for k in range(len(input_pop))
    ]

    pop_size = len(ranked)

    # the shortest total duration receives the highest rank
    ranked['rank'] = pop_size - ranked['total_time'].rank() + 1

    # rank-proportional probability of being picked as a parent
    ranked['chance'] = ranked['rank'] * 2 / (pop_size * (pop_size + 1))

    return ranked

In [52]:
rank(init_pop, test_df, test_date)

Unnamed: 0,0,1,2,3,4,total_time,rank,chance
0,2,1,4,3,0,5051.935791,5.0,0.02381
1,4,3,1,2,0,4981.280121,9.0,0.042857
2,4,2,1,3,0,5336.970886,3.0,0.014286
3,4,0,3,2,1,5050.988281,6.5,0.030952
4,1,4,3,2,0,3750.175018,18.0,0.085714
5,1,2,0,4,3,3480.735504,19.5,0.092857
6,4,3,0,2,1,3796.187225,16.0,0.07619
7,1,2,3,4,0,4565.534729,14.0,0.066667
8,1,0,4,3,2,4292.394775,15.0,0.071429
9,4,3,1,2,0,4981.280121,9.0,0.042857


Create a function to get a set of parents from the new input data frame.

In [53]:
#choose a set of parents from our population based on fitness
##the number of parents drawn equals the population size
def get_parent(input_df):
    """Draw one parent per population slot, weighted by ``chance``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Ranked population: chromosome columns plus ``total_time``, ``rank``
        and ``chance``. ``chance`` must sum to 1.

    Returns
    -------
    numpy.ndarray
        Array of shape (population size, number of chromosome columns); each
        row is the chromosome of one selected parent.
    """
    pop_size = len(input_df)

    # np.random.choice returns row *positions*, weighted by the positional
    # `chance` values.
    picks = np.random.choice(pop_size, pop_size, p=input_df['chance'].to_numpy())

    # Select chromosome columns by name rather than assuming they come first.
    chrom_cols = [
        col for col in input_df.columns
        if col not in ('total_time', 'rank', 'chance')
    ]

    # Use positional iloc to match the positional draw. A label-based reindex
    # yields NaN or mismatched rows when the index is not 0..n-1.
    return input_df[chrom_cols].iloc[picks].to_numpy()

We want half of the chromosomes from each parent. If that produces duplicates, we replace them by randomly choosing from the values that are still unused.

In [54]:
#cross parent chromosomes
def cross(parent1, parent2):
    """Cross two sets of parents row by row into one set of children.

    ``parent1`` and ``parent2`` are 2-D, one row per couple and one column per
    chromosome position (the code uses ``len(parent1)`` and
    ``len(parent1[0])``). For every couple a random 0/1 mask keeps about half
    of the positions from ``parent1`` and the complementary positions from
    ``parent2``. Values that would then appear twice are replaced by randomly
    chosen values that are missing from the child.

    Returns an array of the same shape whose values are shifted by +1 relative
    to the inputs; the shift is not undone here.
    """
    
    #how many chromosomes we want to keep from each parent
    num_parent = len(parent1)
    num_chrom = len(parent1[0])
    num_one = math.ceil(num_chrom/2)
    num_zero = num_chrom - math.ceil(num_chrom/2) #safer option than using floor
    
    # template mask: num_one ones followed by num_zero zeros (shuffled per couple below)
    chrom_filter_source = ([1] * num_one) + ([0]*num_zero)
    
    #create a "filter" for our chromosomes: one random shuffle of the template per couple
    chrom_filter = []
    for i in range(num_parent):
        chrom_filter.append((random.sample(chrom_filter_source, num_chrom)))
        
    #create opposite filter for our second parent (1 where the first filter is 0 and vice versa)
    chrom_filter2 = abs(np.subtract(chrom_filter, 1))
    
    #shift every gene by +1 before masking so that 0 can mean "masked out";
    #otherwise gene 0 would be indistinguishable from a masked slot
    new_parent1 = np.multiply(np.add(parent1,1), chrom_filter)  
    new_parent2 = np.multiply(np.add(parent2,1), chrom_filter2)
    
    #loop through all parents and cross chromosomes
    for i in range(num_parent):
        #only look at couples with matching chromosomes
        #(note: masked 0 slots also count as a match here, so this is typically true)
        if(any(x in new_parent1[i] for x in new_parent2[i])):
            #available chromosomes to choose from to fill gaps:
            #the + below is an element-wise sum of the two masked rows (i.e. the raw child),
            #so not_set holds the values 1..num_chrom that the raw child does not contain
            not_set = list(set(list(range(num_chrom+1))[1:]) - set(new_parent1[i] + new_parent2[i]))
            #fill gaps: every non-zero value of parent 2 that parent 1 already has is a duplicate
            for idx, j in enumerate(new_parent2[i]):
                if (j in new_parent1[i] and j != 0):
                    insert = random.sample(not_set, 1)[0]
                    # put the replacement in parent 1's row and blank parent 2's duplicate
                    new_parent1[i,idx] = insert
                    new_parent2[i,idx] = 0
                    not_set.remove(insert) #once a chromosome is used remove it from possible choices
    
    #add them together as a cross (each position is non-zero in only one of the two)
    return(np.add(new_parent1, new_parent2))

In [55]:
#getting the next generation
def get_next_gen(input_df):
    """Breed the next generation from a ranked population.

    Two independent sets of parents are drawn with ``get_parent`` and
    combined with ``cross``. ``cross`` returns values shifted by +1, so 1 is
    subtracted to get back to the original numbering.
    """
    first_parents = get_parent(input_df)
    second_parents = get_parent(input_df)
    shifted_children = cross(first_parents, second_parents)
    return np.subtract(shifted_children, 1)
    

In [57]:
get_next_gen(rank(init_pop, test_df, test_date))

array([[1, 4, 3, 2, 0],
       [1, 0, 4, 3, 2],
       [0, 2, 4, 3, 1],
       [2, 4, 0, 3, 1],
       [1, 4, 0, 2, 3],
       [0, 2, 4, 3, 1],
       [4, 2, 1, 3, 0],
       [1, 2, 0, 3, 4],
       [1, 0, 4, 2, 3],
       [4, 1, 0, 2, 3],
       [1, 2, 0, 4, 3],
       [1, 2, 3, 4, 0],
       [1, 4, 2, 3, 0],
       [1, 2, 0, 4, 3],
       [4, 0, 3, 2, 1],
       [4, 0, 3, 2, 1],
       [0, 2, 4, 3, 1],
       [4, 1, 0, 2, 3],
       [1, 2, 0, 4, 3],
       [4, 3, 1, 2, 0]], dtype=int64)

In [58]:
test_df

Unnamed: 0,pickup_longitude,pickup_latitude
0,-73.988129,40.732029
1,-73.964203,40.679993
2,-73.997437,40.737583
3,-73.95607,40.7719
4,-73.970215,40.761475


In [80]:
test_start = pd.DataFrame(taxi_test[['pickup_longitude', 'pickup_latitude']].loc[100]).transpose()
test_start

Unnamed: 0,pickup_longitude,pickup_latitude
100,-73.966743,40.764004


In [121]:
def get_optimal_order(start_loc, input_df, date, init_pop_size, gens):
    """Run the genetic algorithm and return the ranked final generation.

    Parameters
    ----------
    start_loc : pandas.DataFrame
        Starting point, placed in front of every candidate ordering.
    input_df : pandas.DataFrame
        Points to visit.
    date : datetime.datetime
        Departure time of the first trip.
    init_pop_size : int
        Number of individuals in the population.
    gens : int
        Number of generations. The population is bred ``gens - 1`` times;
        with ``gens <= 1`` the initial population is ranked as-is.

    Returns
    -------
    pandas.DataFrame
        The final generation, ranked and condensed (duplicates merged).

    Notes
    -----
    ``rank`` is called with four arguments (start point first), so the
    four-argument definition of ``rank`` must be the one in scope.
    The elapsed wall-clock time is printed before returning.
    """
    start_time = datetime.now() #timer

    #getting initial population
    population = get_init_pop(len(input_df), init_pop_size)

    for _ in range(gens - 1):
        #ranking the population
        ranked = rank(start_loc, population, input_df, date)

        #creating next generation
        population = get_next_gen(ranked)

    # Rank whichever population we ended with. This used to reference a
    # variable that is only assigned inside the loop, which raised NameError
    # when gens <= 1.
    final_gen = condense(rank(start_loc, population, input_df, date))

    end_time = datetime.now()

    print(end_time - start_time)
    return final_gen
    
#     return(final_gen.iloc[:3])

In [126]:
get_optimal_order(test_start, test_df, test_date, 20, 2)

0:00:03.578031


Unnamed: 0,0,1,2,3,4,total_time,rank,chance
0,0,2,1,4,3,5217.695831,9.0,0.042857
1,0,2,4,1,3,5497.351715,5.0,0.02381
2,0,3,1,4,2,6371.368958,1.0,0.004762
3,0,3,4,2,1,5261.753357,21.0,0.1
4,0,4,2,3,1,6105.118774,2.0,0.009524
5,3,0,2,1,4,5217.559174,10.0,0.047619
6,3,0,2,4,1,4787.468414,16.0,0.07619
7,4,0,2,3,1,4780.965195,35.0,0.166667
8,4,2,0,1,3,4620.952927,20.0,0.095238
9,4,2,0,3,1,4677.90593,19.0,0.090476


In [109]:
#prepare/rank initial population
def rank(start_pt, input_pop, orig_pts, datetime):
    """Score and rank a population of routes that all begin at ``start_pt``.

    For each ordering in ``input_pop`` the route is ``start_pt`` followed by
    the rows of ``orig_pts`` arranged with ``reindex``; its fitness is the
    total predicted time from ``get_total_time`` starting at ``datetime``.

    Returns a DataFrame with one row per individual: the ordering plus
    ``total_time``, ``rank`` (largest for the shortest total time) and
    ``chance`` (rank scaled so that the column sums to 1).

    NOTE: this redefines ``rank``. The earlier three-argument version in this
    notebook is replaced once this cell runs, so three-argument calls will
    fail afterwards.
    """
    scored = pd.DataFrame(input_pop)

    fitness = []
    for k in range(len(input_pop)):
        route = pd.concat([start_pt, orig_pts.reindex(input_pop[k])])
        fitness.append(get_total_time(route, datetime))

    # total time column / fitness
    scored['total_time'] = fitness

    n_individuals = len(scored)

    # lowest total trip duration -> highest rank
    scored['rank'] = n_individuals - scored['total_time'].rank() + 1

    # probability of being chosen as a parent, proportional to rank
    scored['chance'] = scored['rank'] * 2 / (n_individuals * (n_individuals + 1))

    return scored

In [114]:
from itertools import permutations 

In [116]:
perm = list(permutations(list(test_set.index))) 
# Brute-force baseline: rank every possible ordering.
# The `rank` in scope here takes the start point as its first argument, and
# the points frame must hold only the two coordinate columns (test_df),
# not the raw 9-column test_set.
test_full = rank(test_start, perm, test_df, test_date)

In [123]:
init_pop

[[2, 1, 4, 3, 0],
 [4, 3, 1, 2, 0],
 [4, 2, 1, 3, 0],
 [4, 0, 3, 2, 1],
 [1, 4, 3, 2, 0],
 [1, 2, 0, 4, 3],
 [4, 3, 0, 2, 1],
 [1, 2, 3, 4, 0],
 [1, 0, 4, 3, 2],
 [4, 3, 1, 2, 0],
 [4, 1, 3, 0, 2],
 [0, 2, 4, 3, 1],
 [1, 4, 0, 3, 2],
 [4, 3, 1, 2, 0],
 [4, 0, 3, 2, 1],
 [0, 4, 2, 3, 1],
 [1, 2, 0, 4, 3],
 [3, 0, 1, 2, 4],
 [2, 1, 0, 4, 3],
 [0, 4, 3, 1, 2]]

In [120]:
test_df.reindex(init_pop[1])

Unnamed: 0,pickup_longitude,pickup_latitude
4,-73.970215,40.761475
3,-73.95607,40.7719
1,-73.964203,40.679993
2,-73.997437,40.737583
0,-73.988129,40.732029


In [118]:
get_total_time(pd.concat([test_start, test_df.reindex(init_pop[1])]), test_date)

5188.636764526367

In [133]:
bok_data=pd.DataFrame(data=dict(x=[], y=[]))

In [136]:
bok_data.columns = ('pickup_longitude', 'pickup_latitude')

In [137]:
bok_data

Unnamed: 0,pickup_longitude,pickup_latitude


In [138]:
testz=dict(x=[], y=[])

In [155]:
coordList=[]

In [140]:
Coords=(-10,10)

In [154]:
bok_data.loc[len(bok_data)] = [1,2]
bok_data

Unnamed: 0,pickup_longitude,pickup_latitude
0,1.0,2.0
1,1.0,2.0
2,1.0,2.0


In [143]:
dict(x=[i[0] for i in coordList], y=[i[1] for i in coordList])

{'x': [-10], 'y': [10]}

{'x': [], 'y': []}