# Geohash

In [3]:
import pandas as pd
import numpy as np
import Geohash.geohash as geohash
import os
import warnings
warnings.filterwarnings('ignore')
from math import radians, atan, tan, sin, acos, cos

In [4]:
def getDistance(latA, lonA, latB, lonB):
    """Return the flattening-corrected distance in meters between two points.

    The central angle between the points is computed from their reduced
    latitudes (atan(rb / ra * tan(lat))) and then adjusted by a correction
    term proportional to the earth's flattening.

    Args:
        latA, lonA: latitude and longitude of the first point, in degrees.
        latB, lonB: latitude and longitude of the second point, in degrees.

    Returns:
        The distance in meters, or the sentinel 0.0000001 when the formula is
        numerically undefined (for example a zero central angle, where
        sin(x / 2) is 0, or rounding that pushes the acos argument outside
        [-1, 1]).
    """
    # Float literals keep rb / ra and (ra - rb) / ra true divisions even on
    # Python 2, where int / int would truncate both to 0.
    ra = 6378140.0  # equatorial radius, meters
    rb = 6356755.0  # polar radius, meters
    flatten = (ra - rb) / ra  # flattening of the earth
    # convert degrees to radians
    radLatA = radians(latA)
    radLonA = radians(lonA)
    radLatB = radians(latB)
    radLonB = radians(lonB)
    try:
        pA = atan(rb / ra * tan(radLatA))
        pB = atan(rb / ra * tan(radLatB))
        x = acos(sin(pA) * sin(pB) + cos(pA) * cos(pB) * cos(radLonA - radLonB))
        c1 = (sin(x) - x) * (sin(pA) + sin(pB)) ** 2 / cos(x / 2) ** 2
        c2 = (sin(x) + x) * (sin(pA) - sin(pB)) ** 2 / sin(x / 2) ** 2
        dr = flatten / 8 * (c1 - c2)
        distance = ra * (x + dr)
        return distance  # meter
    except (ValueError, ZeroDivisionError):
        # Only the numeric failures of the formula are mapped to the
        # sentinel; anything else (e.g. KeyboardInterrupt) propagates.
        return 0.0000001

In [8]:
def getDistanceFromDF(data):
    """Print the mean score of the prediction error over all rows of data.

    For every row the distance between (end_lat, end_lon) and
    (end_lat_median12, end_lon_median12) is computed with getDistance, passed
    through the scoring function f, and the scores are averaged over the
    number of rows. Nothing is returned; the result is only printed.
    """
    coords = data[['end_lat','end_lon','end_lat_median12','end_lon_median12']].astype(float)
    # one distance per row, in the frame's row order
    errors = np.array([getDistance(row[0], row[1], row[2], row[3]) for row in coords.values])
    print(np.sum(f(errors)) / coords.shape[0])

def f(d):
    """Map a distance d to a score in (0, 1) with a logistic curve.

    The curve is centred at d = 1000 (score 0.5) and has a scale of 250;
    d may be a scalar or a numpy array.
    """
    exponent = -(d - 1000) / 250
    return 1 / (1 + np.exp(exponent))

def geoEncoding(data,precision=5):
    """Add a 'geohash<precision>' column encoding each row's start position.

    The column is written onto data itself, which is also returned.
    """
    start_points = data[['start_lat','start_lon']]
    column = 'geohash{}'.format(precision)
    data[column] = [geohash.encode(lat, lon, precision) for lat, lon in start_points.values]
    return data


def dateConvert(data,isTrain):
    """Parse timestamps and add geohash, weekday and hour feature columns.

    start_time is converted to datetime, the start position is geohashed at
    precisions 12 down to 6, and end_time is converted only when isTrain is
    true. weekday is 1-based (pandas weekday + 1). The frame is modified in
    place and returned.
    """
    data['start_time'] = pd.to_datetime(data['start_time'])
    # encode the start lat/lon as geohashes, finest precision first
    for precision in (12, 11, 10, 9, 8, 7, 6):
        data = geoEncoding(data, precision)
    if isTrain:
        data['end_time'] = pd.to_datetime(data['end_time'])
    start = data['start_time']
    data['weekday'] = start.dt.weekday + 1
    data['hour'] = start.dt.hour
    return data

In [5]:
# Load the raw train/test CSVs from the working directory. low_memory=False
# makes pandas read each file in one pass instead of in chunks.
# NOTE(review): in the visible code these frames are only used for the
# head() preview below; the pipelines re-read the CSVs themselves.
train_df = pd.read_csv('train.csv', low_memory=False)
test_df = pd.read_csv('test.csv', low_memory=False)

In [6]:
train_df.head()

Unnamed: 0,r_key,out_id,start_time,end_time,start_lat,start_lon,end_lat,end_lon
0,SDK-XJ_609994b4d50a8a07a64d41d1f70bbb05,2016061820000b,2018-01-20 10:13:43,2018-01-20 10:19:04,33.783415,111.60366,33.779811,111.605885
1,SDK-XJ_4c2f29d94c9478623711756e4ae34cc5,2016061820000b,2018-02-12 17:40:51,2018-02-12 17:58:13,34.810763,115.549264,34.814875,115.549374
2,SDK-XJ_3570183177536a575b9da67a86efcd62,2016061820000b,2018-02-13 14:52:24,2018-02-13 15:24:33,34.640284,115.539024,34.813136,115.559243
3,SDK-XJ_78d749a376e190685716a51a6704010b,2016061820000b,2018-02-13 17:23:08,2018-02-13 17:39:02,34.81828,115.542039,34.813141,115.559217
4,SDK-XJ_3b249941c27834f5e43d43a9114e4909,2016061820000b,2018-02-13 18:06:02,2018-02-13 19:02:51,34.813278,115.55926,34.786126,115.874361


In [None]:
# ruler geohash
def ruler(train,test):
    """Predict end positions for test from median end positions in train.

    For each geohash precision 12..7, three families of median
    (end_lat, end_lon) are computed on train and left-joined onto test:

      case 1: same car (out_id), same hour, same geohash  -> end_*_median<c>
      case 2: same car (out_id), same geohash             -> end_*_<c>
      case 3: same hour, same geohash                      -> end_*_only<c>

    The final prediction lives in end_lat_median12 / end_lon_median12 and is
    filled in priority order: case 1 (fine to coarse precision), then case 2,
    then case 3, and finally the trip's own start position.

    Returns the merged frame; test keeps its row order (left joins).
    """
    precisions = [12, 11, 10, 9, 8, 7]
    # (extra grouping keys, latitude column template, longitude column template)
    cases = [
        (['out_id', 'hour'], 'end_lat_median{}', 'end_lon_median{}'),
        (['out_id'], 'end_lat_{}', 'end_lon_{}'),
        (['hour'], 'end_lat_only{}', 'end_lon_only{}'),
    ]

    base = test
    for c in precisions:
        geo_col = 'geohash{}'.format(c)
        for extra_keys, lat_tpl, lon_tpl in cases:
            keys = extra_keys + [geo_col]
            medians = train.groupby(keys, as_index=False)[['end_lat', 'end_lon']].median()
            medians = medians.rename(
                columns={'end_lat': lat_tpl.format(c), 'end_lon': lon_tpl.format(c)})
            base = pd.merge(base, medians, on=keys, how='left', copy=False)

    # Fill the final prediction: every precision of case 1 first, then every
    # precision of case 2, then every precision of case 3.
    for _keys, lat_tpl, lon_tpl in cases:
        for c in precisions:
            base['end_lat_median12'] = base['end_lat_median12'].fillna(base[lat_tpl.format(c)])
            base['end_lon_median12'] = base['end_lon_median12'].fillna(base[lon_tpl.format(c)])

    # if nothing matched at all, fall back to the start position
    base['end_lat_median12'] = base['end_lat_median12'].fillna(base['start_lat'])
    base['end_lon_median12'] = base['end_lon_median12'].fillna(base['start_lon'])
    return base

# Reuse the pre-processed frames when every cache file exists; otherwise
# rebuild them from the raw CSVs and write the cache for the next run.
# The condition is parenthesised so it can legally span two lines.
if (os.path.exists('./cache/trainT.csv') and os.path.exists('./cache/trainS.csv')
        and os.path.exists('./cache/valT.csv') and os.path.exists('./cache/test.csv')):
    print('read from cache')
    trainT = pd.read_csv('./cache/trainT.csv')
    trainS = pd.read_csv('./cache/trainS.csv')
    valT = pd.read_csv('./cache/valT.csv')
    test = pd.read_csv('./cache/test.csv')
else:
    print('begin')
    train = pd.read_csv('../data/train.csv')
    # Split on the raw start_time strings: up to end of June for fitting,
    # later trips for validation. .copy() gives dateConvert independent
    # frames to add columns to, rather than slices of `train`.
    trainT = train[train['start_time']<='2018-06-30 23:59:59'].copy()
    valT = train[train['start_time']>'2018-06-30 23:59:59'].copy()
    test = pd.read_csv('../data/test.csv')

    trainT = dateConvert(trainT,True)
    trainS = dateConvert(train,True)
    valT = dateConvert(valT,True)
    test = dateConvert(test,False)

    # to_csv does not create missing directories
    if not os.path.exists('./cache'):
        os.makedirs('./cache')
    trainT.to_csv('./cache/trainT.csv',index=False)
    trainS.to_csv('./cache/trainS.csv',index=False)
    valT.to_csv('./cache/valT.csv',index=False)
    test.to_csv('./cache/test.csv',index=False)

# Validation: predict the held-out trips from the fitting period and print
# the score.
base = ruler(trainT, valT)
getDistanceFromDF(base)

print('submit')
# Apply the same rules to the test set and write the submission file.
base = ruler(trainT, test)

submit = base[['r_key', 'end_lat_median12', 'end_lon_median12']].rename(
    columns={'end_lat_median12': 'end_lat', 'end_lon_median12': 'end_lon'})
submit.to_csv('./new_result.csv', index=False)

# Official solution

In [None]:
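# A simple approach: first count, per user and weekday, the places the user goes to most often; then label each of those places 1 if the user really did go there later, otherwise 0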
def dateConvert(data,isTrain):
    """Parse the timestamp columns and add a 1-based weekday column.

    start_time is always converted to datetime; end_time only when isTrain
    is true. weekday is pandas' weekday + 1, so Monday is 1 and Sunday is 7.
    The frame is modified in place and returned.

    NOTE: once this definition runs it replaces any dateConvert defined
    earlier in the same session.
    """
    # print(...) with a single argument prints the same text on Python 2 and 3
    print('convert string to datetime')
    data['start_time'] = pd.to_datetime(data['start_time'])
    if isTrain:
        data['end_time'] = pd.to_datetime(data['end_time'])
    data['weekday'] = data['start_time'].dt.weekday + 1
    return data

def latitude_longitude_to_go(data,isTrain):
    """Add string location keys built from coordinates rounded to 5 decimals.

    startGo is '<start_lat>_<start_lon>'; when isTrain is true, endGo is
    built the same way from end_lat / end_lon. The frame is modified in
    place and returned.
    """
    def _location_keys(coords):
        # one '<lat>_<lon>' string per row, each value rounded to 5 decimals
        return [str(round(lat, 5)) + '_' + str(round(lon, 5)) for lat, lon in coords.values]

    data['startGo'] = _location_keys(data[['start_lat','start_lon']])
    if isTrain:
        data['endGo'] = _location_keys(data[['end_lat','end_lon']])
    return data

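# The places each user has visited most often (originally described as the top three; the code keeps 7 rows per out_id/weekday group)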
def getMostTimesCandidate(candidate, top_n=7):
    """Return each user's most frequent destinations per weekday.

    Only trips with start_time up to '2018-06-30 23:59:59' are counted.

    Args:
        candidate: frame with out_id, endGo, weekday and start_time columns.
        top_n: how many destinations to keep per (out_id, weekday) group.

    Returns:
        A frame with columns out_id, endGo, weekday, mostCandidateCount,
        sorted by count (then out_id) in descending order and cut to the
        top_n most frequent destinations of every (out_id, weekday) group.
    """
    history = candidate[candidate['start_time']<='2018-06-30 23:59:59']
    history = history[['out_id','endGo','weekday']]
    # size() counts the rows of every (out_id, endGo, weekday) group; it
    # replaces the dict-renaming form of SeriesGroupBy.agg, which newer
    # pandas versions reject.
    counts = history.groupby(['out_id','endGo','weekday']).size()
    counts = counts.reset_index(name='mostCandidateCount')
    counts = counts.sort_values(['mostCandidateCount','out_id'], ascending=False)
    # The frame is sorted most-frequent-first, so head() keeps the most
    # visited places; tail() would keep the least visited ones.
    return counts.groupby(['out_id','weekday']).head(top_n)

# Conversion between latitude/longitude values and their string keys
def geoHashToLatLoc(data):
    """Split each '<lat>_<lon>' endGo key into predict_end_lat / predict_end_lon.

    The two new columns hold the split text as-is (strings, not floats).
    The frame is modified in place and returned.
    """
    pieces = [str(key).split('_') for key in data['endGo'].values]
    # two-name unpacking raises ValueError for a key without exactly one '_'
    data['predict_end_lat'] = [lat_text for lat_text, _lon_text in pieces]
    data['predict_end_lon'] = [lon_text for _lat_text, lon_text in pieces]
    return data

def calcGeoHasBetween(go1,go2):
    """Return getDistance between two '<lat>_<lon>' location keys."""
    latA, lonA = str(go1).split('_')
    latB, lonB = str(go2).split('_')
    # convert the four text fragments to floats, in argument order
    return getDistance(*[float(part) for part in (latA, lonA, latB, lonB)])

# start to end distance
def calcGeoHasBetweenMain(data):
    """Add a 'distance' column: calcGeoHasBetween(endGo, startGo) / 1000 per row.

    The frame is modified in place and returned.
    """
    key_pairs = data[['endGo','startGo']].values
    data['distance'] = [calcGeoHasBetween(end_key, start_key) / 1000
                        for end_key, start_key in key_pairs]
    return data

In [None]:
# prepare the training and the test data set
# Every print below is a single-argument call, which prints the same text on
# Python 2 and Python 3; multi-value prints are formatted into one string.
print('begin')
# Use the January-June trips to extract the most frequently visited places
# Use the July trips for training
train = pd.read_csv('train.csv')
print('{} {}'.format(train['start_time'].min(), train['start_time'].max()))
print(train[train['start_time']>'2018-06-30 23:59:59'].shape)
print(train[train['start_time']<='2018-06-30 23:59:59'].shape)

test = pd.read_csv('test.csv')
print('{} {}'.format(test['start_time'].min(), test['start_time'].max()))
print(test.shape)

# row counts of the two raw frames
trainIndex = train.shape[0]
testIndex = test.shape[0]

print('{} {}'.format(trainIndex, testIndex))

train = dateConvert(train,True)
test = dateConvert(test,False)

train = latitude_longitude_to_go(train,True)
test = latitude_longitude_to_go(test,False)

# persist the converted frames
train.to_csv('train1.csv',index=False)
test.to_csv('test1.csv',index=False)

print('##############################################')

In [None]:
# Candidate destinations per user and weekday, counted on the earlier trips.
userMostTimes3loc = getMostTimesCandidate(train)

# Validation set: trips after June. The true destination key is kept as
# trueEndGo so the candidates' endGo can be merged in next to it.
val = train.loc[
    train['start_time'] > '2018-06-30 23:59:59',
    ['r_key', 'out_id', 'end_lat', 'end_lon', 'weekday', 'startGo', 'endGo', 'start_lat', 'start_lon'],
].rename(columns={'endGo': 'trueEndGo'})
val = pd.merge(val, userMostTimes3loc, on=['out_id', 'weekday'], how='left', copy=False)
# a trip without any candidate falls back to its own start location
val['endGo'] = val['endGo'].fillna(val['startGo'])
# label: 1 when the candidate is the true destination, otherwise 0
val['flag1'] = (val['trueEndGo'] == val['endGo']).astype(int)
val = calcGeoHasBetweenMain(val)

# Test set: the same candidate expansion, without labels.
test = pd.merge(
    test[['r_key', 'out_id', 'weekday', 'startGo', 'start_lat', 'start_lon']],
    userMostTimes3loc, on=['out_id', 'weekday'], how='left', copy=False)
test['endGo'] = test['endGo'].fillna(test['startGo'])
test = calcGeoHasBetweenMain(test)

In [None]:
# Use logistic regression to score every (trip, candidate destination) row
# with the probability that the candidate is the true destination.
feature = ['start_lat','start_lon','weekday','distance','mostCandidateCount']
from sklearn.linear_model import LogisticRegression

# Single-argument print(...) calls print the same text on Python 2 and 3.
print('training')
lr = LogisticRegression()
lr.fit(val[feature].fillna(-1).values, val['flag1'].values)
print('predicting')
pre = lr.predict_proba(val[feature].fillna(-1).values)[:,1]

# .copy() so the new 'predict' column is not written onto a slice of `val`
val_result = val[['r_key','endGo','end_lat','end_lon',]].copy()
val_result['predict'] = pre
# keep only the highest-scoring candidate of every trip
val_result = val_result.sort_values(['predict'],ascending=False)
val_result = val_result.drop_duplicates(['r_key'])
val_result = geoHashToLatLoc(val_result)

val = geoHashToLatLoc(val)
print('loss is :')
# getDistanceFromDF compares end_lat/end_lon with end_lat_median12/
# end_lon_median12, so the chosen prediction is exposed under those names.
# NOTE(review): the score is computed on the best candidate per trip
# (val_result), mirroring how the submission below is built - confirm.
getDistanceFromDF(val_result.rename(columns={
    'predict_end_lat': 'end_lat_median12',
    'predict_end_lon': 'end_lon_median12'}))

subPre = lr.predict_proba(test[feature].fillna(-1).values)[:,1]
test_result = test[['r_key','endGo']].copy()
test_result['predict'] = subPre

test_result = test_result.sort_values(['predict'],ascending=False)
test_result = test_result.drop_duplicates(['r_key'])
test_result = geoHashToLatLoc(test_result)

submit = test_result[['r_key','predict_end_lat','predict_end_lon']]
submit.columns = ['r_key','end_lat','end_lon']
submit.to_csv('./result.csv',index=False)