# 1 - Preparation

In [56]:
## Enable matplotlib inline
%matplotlib inline
import matplotlib.pyplot as plt

## Imports
import pandas as pd
pd.set_option('mode.chained_assignment',None)
pd.set_option('display.mpl_style', 'default') 
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

import numpy as np
from sklearn.cluster import DBSCAN

## 2.1 - Define functions

In [57]:
import math

## --------------------------------------------------
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    
    sklearn implementation
    2 arcsin(sqrt(sin^2(0.5*dx)cos(x1)cos(x2)sin^2(0.5*dy)))
    
    
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.asin(math.sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [58]:
## --------------------------------------------------
def trip_work(trip):
    """
    Calculates the score for a trip
    """
    
    work = 0.0
    total_weight = 10.0 + trip['Weight'].sum()
    lon = 0
    lat = 90
    
    for i, row in trip.iterrows():
        current_lon = row['Longitude']
        current_lat = row['Latitude']
        current_w   = row['Weight']
        
        distance = haversine(lon, lat, current_lon, current_lat)
        work += distance * total_weight
        
        total_weight -= current_w
        lon = current_lon
        lat = current_lat
        
    work += haversine(lon, lat, 0, 90) * 10.0
    
    return work
        

In [59]:
## --------------------------------------------------
def make_trips(df, maxd=1.0):
    """
    Make the trips by longitudinal binning and latitude ordering
    """
    
    df = df.sort_values(by='Longitude')
    
    ## Bin the longitude axis such that every bin sums up to 990 pounds
    trip_numbers = []
    trip_number = 1
    weight = 10.0
    longitude = gifts_df['Longitude'].values[0]
    max_delta_longitude = maxd

    for i, row in df.iterrows():
        current_weight = row['Weight']
        current_longitude = row['Longitude']
        if (weight + current_weight > 1000.0) or (current_longitude - longitude > max_delta_longitude):
            trip_number += 1
            weight = 10.0 + current_weight
            longitude = current_longitude
        else:
            weight += current_weight
        trip_numbers.append((row['GiftId'], trip_number))
        
    trip_numbers = np.array(trip_numbers, dtype=int)
    trip_numbers = pd.DataFrame(trip_numbers, columns=['GiftId', 'TripId'])
    
    df = pd.merge(df, trip_numbers)
    
    df = df.sort_values(by=['TripId','Latitude'], ascending=[True,False])
    
    return df

In [60]:
## --------------------------------------------------
def score(df):
    """
    Calculates the total score on the entire dataset
    """
    
    n = df['TripId'].max()
    x = 0
    for i in range(1, n+1):
        trip = df[df['TripId'] == i]
        x += trip_work(trip)
    return x, n

# 2 - Load

In [61]:
## Load data
gifts_df = pd.read_csv('gifts.csv')

## Convert longitude and latitude to radians in new columns
gifts_df['lon_rad'] = np.deg2rad(gifts_df['Longitude'].values)
gifts_df['lat_rad'] = np.deg2rad(gifts_df['Latitude'].values)

# 2 - Clustering

In [62]:
## Frame the DBSCAN parameters in terms of the problem
earth_radius = 6371.0   # km
minimum_distance = 200.0 # km
eps = minimum_distance/earth_radius

## Do the clustering
clustering = DBSCAN(eps=eps, min_samples=50, metric='haversine')
gifts_df['cluster'] = clustering.fit_predict(gifts_df[['lat_rad', 'lon_rad']].values)

In [63]:
# from cv2 import imread, cvtColor, COLOR_BGR2RGB

# ## ----------------------------------------------
# def read_image(path):
#     """
#     Read an image and convert to RGB using openCV
#     """
#     img = imread(path)
#     return cvtColor(img, COLOR_BGR2RGB)

# ## ----------------------------------------------
# def transform(x,y):
#     """
#     transform the latitude/longitude coordinates to the coordinates of the image
#     """
    
#     new_x = (x + 180)*(earth.shape[1]/360.0)
#     new_y = -(y - 90)*(earth.shape[0]/180.0)
    
#     return new_x, new_y

# ## Load earth background
# earth = read_image('earth.jpg')

# ## Get gift data from dataframe
# x = gifts_df['Longitude'].values
# y = gifts_df['Latitude'].values
# w = gifts_df['Weight'].values
# c = gifts_df['cluster'].values % 50

# x_, y_ = transform(x,y)

# plt.figure(figsize=(50,25))
# plt.imshow(earth)
# plt.scatter(x_, y_, s=w, c=c)

# 3 - Longitudinal binning

In [64]:
n_clusters = gifts_df['cluster'].max()

clusters = []
n_gifts_covered = 0

## Partition the dataframe in clusters
for i in range(n_clusters):
    cluster = gifts_df[gifts_df['cluster'] == i]
    cluster = make_trips(cluster, maxd=360.0)
    cluster['TripId'] = cluster['TripId'] + n_gifts_covered
    n_gifts_covered = cluster['TripId'].max()
    clusters.append(cluster)
    
gifts_df = pd.concat(clusters)

In [65]:
## Evaluate final score
x,n = score(gifts_df)
print 'Final score:', x, 'in', n, 'trips'

Final score: 12415954406.0 in 1431 trips


In [66]:
# import random
# from matplotlib import colors

# c = colors.cnames.keys()

# fig = plt.figure(frameon=False)
# fig.set_size_inches(80,40)

# ax = plt.Axes(fig, [0., 0., 1., 1.])
# ax.set_axis_off()
# fig.add_axes(ax)

# ax.imshow(earth, aspect='auto')

# n_trips = gifts_df['TripId'].max()
# for i in range(1, n_trips+1):
#     trip = gifts_df[gifts_df['TripId'] == i]
    
#     x = trip['Longitude'].values
#     y = trip['Latitude'].values
    
#     x_, y_ = transform(x,y)
#     ax.plot(x_,y_,c=random.choice(c))
        
# fig.savefig('trips.png')

In [None]:
trips_df = gifts_df[['GiftId', 'TripId']]
trips_df.to_csv('trips.csv', index=False)