In [2]:
import pandas as pd
import numpy as np
import Geohash.geohash as geohash
import os
import warnings
warnings.filterwarnings('ignore')

# Prepare train & test sets

In [3]:
train_df = pd.read_csv('train.csv', low_memory=False)
test_df = pd.read_csv('test.csv', low_memory=False)

In [3]:
def getDistance(latA, lonA, latB, lonB):  
    ra = 6378140  # radius of equator: meter  
    rb = 6356755  # radius of polar: meter  
    flatten = (ra - rb) / ra  	# Partial rate of the earth  
    # change angle to radians  
    radLatA = np.pi*(latA)/180
    radLonA = np.pi*(lonA)/180  
    radLatB = np.pi*(latB)/180  
    radLonB = np.pi*(lonB)/180  
    try: 
        pA = np.arctan(rb / ra * np.tan(radLatA))  
        pB = np.arctan(rb / ra * np.tan(radLatB))  
        x = np.arccos(np.sin(pA) * np.sin(pB) + np.cos(pA) * np.cos(pB) * np.cos(radLonA - radLonB))  
        c1 = (np.sin(x) - x) * (np.sin(pA) + np.sin(pB))**2 / np.cos(x / 2)**2  
        c2 = (np.sin(x) + x) * (np.sin(pA) - np.sin(pB))**2 / np.sin(x / 2)**2  
        dr = flatten / 8 * (c1 - c2)  
        distance = ra * (x + dr)  
        return distance # meter  
    except:
        return 0.0000001

In [7]:
distance = getDistance(train_df['start_lat'], train_df['start_lon'] , train_df['end_lat'], train_df['end_lon'])
train_df['distance'] = distance

In [4]:
precisions = [12,11,10,9,8,7,5,4,3]

In [7]:
def geoEncoding(data,precision=5):
    # encode the lat and lon data using geohash
    tmp = data[['start_lat','start_lon']]
    geohashList = []
    for i in tmp.values:
        geohashList.append(geohash.encode(i[0],i[1],precision))
    data['geohash{}'.format(precision)] = geohashList
    return data

def dateConvert(data,isTrain):
    # print 'convert string to datetime'
    data['start_time'] = pd.to_datetime(data['start_time'])
    # encoding start lat lon to geohash
    for pres in precisions:
        data = geoEncoding(data,pres)
    if isTrain:
        data['end_time'] = pd.to_datetime(data['end_time'])
    data['weekday'] = data['start_time'].dt.weekday + 1
    data['if_weekend'] = ( data['weekday'] >= 6 ).astype(int)
    data['hour'] = data['start_time'].dt.hour
    #data['morning_night'] = (data['hour']/4).astype(int)
    return data

In [8]:
train_df = dateConvert(train_df, True)
test_df = dateConvert(test_df, False)
train_df.to_csv('new_train.csv',index=False)
test_df.to_csv('new_test.csv',index=False)

# Predict

In [12]:
train_df = pd.read_csv('new_train.csv', low_memory=False)
test_df = pd.read_csv('new_test.csv', low_memory=False)

In [9]:
train_df.head()

Unnamed: 0,r_key,out_id,start_time,end_time,start_lat,start_lon,end_lat,end_lon,geohash12,geohash11,geohash10,geohash9,geohash8,geohash7,geohash5,geohash4,geohash3,weekday,if_weekend,hour
0,SDK-XJ_609994b4d50a8a07a64d41d1f70bbb05,2016061820000b,2018-01-20 10:13:43,2018-01-20 10:19:04,33.783415,111.60366,33.779811,111.605885,wqp25w569v02,wqp25w569v0,wqp25w569v,wqp25w569,wqp25w56,wqp25w5,wqp25,wqp2,wqp,6,1,10
1,SDK-XJ_4c2f29d94c9478623711756e4ae34cc5,2016061820000b,2018-02-12 17:40:51,2018-02-12 17:58:13,34.810763,115.549264,34.814875,115.549374,ww4nj3h7mh81,ww4nj3h7mh8,ww4nj3h7mh,ww4nj3h7m,ww4nj3h7,ww4nj3h,ww4nj,ww4n,ww4,1,0,17
2,SDK-XJ_3570183177536a575b9da67a86efcd62,2016061820000b,2018-02-13 14:52:24,2018-02-13 15:24:33,34.640284,115.539024,34.813136,115.559243,ww4jj4hfq2uw,ww4jj4hfq2u,ww4jj4hfq2,ww4jj4hfq,ww4jj4hf,ww4jj4h,ww4jj,ww4j,ww4,2,0,14
3,SDK-XJ_78d749a376e190685716a51a6704010b,2016061820000b,2018-02-13 17:23:08,2018-02-13 17:39:02,34.81828,115.542039,34.813141,115.559217,ww4nj4rphtud,ww4nj4rphtu,ww4nj4rpht,ww4nj4rph,ww4nj4rp,ww4nj4r,ww4nj,ww4n,ww4,2,0,17
4,SDK-XJ_3b249941c27834f5e43d43a9114e4909,2016061820000b,2018-02-13 18:06:02,2018-02-13 19:02:51,34.813278,115.55926,34.786126,115.874361,ww4nj9edjcms,ww4nj9edjcm,ww4nj9edjc,ww4nj9edj,ww4nj9ed,ww4nj9e,ww4nj,ww4n,ww4,2,0,18


In [50]:
# ruler geohash
def ruler(train,test):
    base = test
    for c in precisions:
        # case1: find the cases of the same car, same time, and similar geohash on weekday/weekend
        tmp = train.groupby(['out_id', 'hour', 'if_weekend', 'geohash{}'.format(c)],as_index=False)[['end_lat', 'end_lon']].median().rename(
            columns={'end_lat': 'end_lat_median{}'.format(c), 'end_lon': 'end_lon_median{}'.format(c)})
        base = pd.merge(base, tmp,on=['out_id', 'hour','if_weekend', 'geohash{}'.format(c)], how='left', copy=False)
        
        # case2: find the cases of the same car, similar geohash, on weekday/weekend
        tmp = train.groupby(['out_id', 'if_weekend' , 'geohash{}'.format(c)], as_index=False)[['end_lat', 'end_lon']].median().rename(
            columns={'end_lat': 'end_lat_weekday{}'.format(c), 'end_lon': 'end_lon_weekday{}'.format(c)})
        base = pd.merge(base, tmp, on=['out_id', 'if_weekend' , 'geohash{}'.format(c)], how='left', copy=False)

        # case2': find the cases of the same car, similar geohash, on weekday/weekend
        tmp = train.groupby(['out_id', 'geohash{}'.format(c)], as_index=False)[['end_lat', 'end_lon']].median().rename(
            columns={'end_lat': 'end_lat_idex{}'.format(c), 'end_lon': 'end_lon_idex{}'.format(c)})
        base = pd.merge(base, tmp, on=['out_id', 'geohash{}'.format(c)], how='left', copy=False)
        
        # case3: find the cases of similar time and similar geohash
        tmp = train.groupby(['hour','geohash{}'.format(c)], as_index=False)[['end_lat', 'end_lon']].median().rename(
            columns={'end_lat': 'end_lat_only{}'.format(c), 'end_lon': 'end_lon_only{}'.format(c)})
        base = pd.merge(base, tmp, on=['hour','geohash{}'.format(c)], how='left', copy=False)
        
        # case4: find the cases of the same time and similar geohash
        tmp = train.groupby(['morning_night','geohash{}'.format(c)], as_index=False)[['end_lat', 'end_lon']].median().rename(
            columns={'end_lat': 'end_lat_large{}'.format(c), 'end_lon': 'end_lon_large{}'.format(c)})
        base = pd.merge(base, tmp, on=['morning_night','geohash{}'.format(c)], how='left', copy=False)
        

    # if case1-12 does not exist, fill it with other values in case1
    for c in precisions[0:6]:
        base['end_lat_median_final'] = base['end_lat_median12'].fillna(base['end_lat_median{}'.format(c)])
        base['end_lon_median_final'] = base['end_lon_median12'].fillna(base['end_lon_median{}'.format(c)])

    # if case1 dose not exist, fill it with case3
    for c in precisions[0:6]:
        base['end_lat_median_final'] = base['end_lat_median_final'].fillna(base['end_lat_weekday{}'.format(c)])
        base['end_lon_median_final'] = base['end_lon_median_final'].fillna(base['end_lon_weekday{}'.format(c)])
    
    for c in precisions:
        base['end_lat_median_final'] = base['end_lat_median_final'].fillna(base['end_lat_idex{}'.format(c)])
        base['end_lon_median_final'] = base['end_lon_median_final'].fillna(base['end_lon_idex{}'.format(c)])

    # if case1 and case2 do not exist, fill it with case1
    for c in precisions:
        base['end_lat_median_final'] = base['end_lat_median_final'].fillna(base['end_lat_only{}'.format(c)])
        base['end_lon_median_final'] = base['end_lon_median_final'].fillna(base['end_lon_only{}'.format(c)])

    for c in precisions:
        base['end_lat_median_final'] = base['end_lat_median_final'].fillna(base['end_lat_large{}'.format(c)])
        base['end_lon_median_final'] = base['end_lon_median_final'].fillna(base['end_lon_large{}'.format(c)])
    
    print('Finally, there are ', base['end_lat_median_final'].isna().sum(), ' values have no value.')
    # if nothing exist fill it with the start position
    base['end_lat_median_final'] = base['end_lat_median_final'].fillna(base['start_lat'])
    base['end_lon_median_final'] = base['end_lon_median_final'].fillna(base['start_lon'])
    return base

In [10]:
def ruler(train,test):
    base = test
    for c in precisions:
        # case1: find the cases of the same car, same time, and similar geohash
        tmp = train.groupby(['out_id', 'hour','if_weekend','geohash{}'.format(c)],as_index=False)[['end_lat', 'end_lon']].median().rename(
            columns={'end_lat': 'end_lat_median{}'.format(c), 'end_lon': 'end_lon_median{}'.format(c)})
        base = pd.merge(base, tmp,on=['out_id', 'hour','if_weekend','geohash{}'.format(c)], how='left', copy=False)
        
        # case2: find the cases of the same car, similar geohash
        tmp = train.groupby(['out_id', 'geohash{}'.format(c)], as_index=False)[['end_lat', 'end_lon']].median().rename(
            columns={'end_lat': 'end_lat_{}'.format(c), 'end_lon': 'end_lon_{}'.format(c)})
        base = pd.merge(base, tmp, on=['out_id', 'geohash{}'.format(c)], how='left', copy=False)

        # case3: find the cases of the same time and similar geohash
        tmp = train.groupby(['hour','geohash{}'.format(c)], as_index=False)[['end_lat', 'end_lon']].median().rename(
            columns={'end_lat': 'end_lat_only{}'.format(c), 'end_lon': 'end_lon_only{}'.format(c)})
        base = pd.merge(base, tmp, on=['hour','geohash{}'.format(c)], how='left', copy=False)

    # if case1-12 does not exist, fill it with other values in case1
    for c in precisions[0:6]:
        base['end_lat_median_final'] = base['end_lat_median12'].fillna(base['end_lat_median{}'.format(c)])
        base['end_lon_median_final'] = base['end_lon_median12'].fillna(base['end_lon_median{}'.format(c)])

    # if case1 dose not exist, fill it with case3
    for c in precisions:
        base['end_lat_median_final'] = base['end_lat_median_final'].fillna(base['end_lat_{}'.format(c)])
        base['end_lon_median_final'] = base['end_lon_median_final'].fillna(base['end_lon_{}'.format(c)])

    # if case1 and case2 do not exist, fill it with case1
    for c in precisions[0:8]:
        base['end_lat_median_final'] = base['end_lat_median_final'].fillna(base['end_lat_only{}'.format(c)])
        base['end_lon_median_final'] = base['end_lon_median_final'].fillna(base['end_lon_only{}'.format(c)])

    print('Finally, there are ', base['end_lat_median_final'].isna().sum(), ' values have no value.')
    # if nothing exist fill it with the start position
    base['end_lat_median_final'] = base['end_lat_median_final'].fillna(base['start_lat'])
    base['end_lon_median_final'] = base['end_lon_median_final'].fillna(base['start_lon'])
    return base

In [11]:
base = ruler(train_df,test_df)

submit = base[['r_key','end_lat_median_final','end_lon_median_final']]
submit.columns = ['r_key','end_lat','end_lon']
submit.to_csv('./new_result.csv',index=False)

Finally, there are  182  values have no value.


### Test 1
- 'out_id', 'hour','geohash{}'.format(c)
- 'out_id', 'geohash{}'.format(c)
- 'hour','geohash{}'.format(c)
-  score: 0.508572

### Test 2
- 'out_id', 'hour', 'if_weekend', 'geohash{}'
- 'out_id', 'if_weekend' , 'geohash{}'
- 'out_id', 'geohash{}'
- 'hour','geohash{}'
- 'morning_night','geohash{}'
- 4686 non values
- score : 0.51

### Test 3
- precisions = [12,11,10,9,8,7,5,4,3]
- 'out_id', 'hour','geohash{}'.format(c)
- 'out_id', 'geohash{}'.format(c)
- 'hour','geohash{}'.format(c)
-  score: 0.508572