In [1]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,7)

In [9]:
# Load the raw inputs. Every file is cp1251-encoded; the two merchant tables
# are ';'-separated while the transactions table is ','-separated.
dtrain = pd.read_csv(
    'data/merchants_train.csv',
    delimiter=';',
    encoding='cp1251',
)
dtest = pd.read_csv(
    'data/merchants_test.csv',
    delimiter=';',
    encoding='cp1251',
)
tract = pd.read_csv(
    'data/transactions.csv',
    delimiter=',',
    encoding='cp1251',
)

In [6]:
# Most frequent latitude and longitude values (each column's mode is computed
# independently, on the table as it is before any row is dropped here).
lat_mode = tract.latitude.mode().iloc[0]
lon_mode = tract.longitude.mode().iloc[0]

# Drop rows located exactly at (0, 0) or exactly at (lat_mode, lon_mode).
# NOTE(review): this reassigns `tract`, so re-running the cell recomputes the
# modes on already-filtered data and removes a further point.
at_origin = (tract.latitude == 0) & (tract.longitude == 0)
at_mode = (tract.latitude == lat_mode) & (tract.longitude == lon_mode)
tract = tract[~(at_origin | at_mode)]

In [8]:
def lexsort_based_compressor(data):
    """Return the unique rows of a 2-D array in lexsort order.

    Rows are ordered with ``np.lexsort(data.T)``, i.e. the LAST column is the
    primary sort key and the first column is the least significant one.
    Consecutive identical rows are then collapsed to a single row.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_cols)
        Input rows. ``n_rows`` may be 0, in which case an empty array with the
        same number of columns is returned.

    Returns
    -------
    numpy.ndarray of shape (n_unique_rows, n_cols)
        A new array; the input is not modified.
    """
    data = np.asarray(data)
    sorted_data = data[np.lexsort(data.T), :]
    # The first row is always kept; every later row is kept only if it differs
    # from its predecessor in at least one column. Rows are compared directly
    # instead of via np.diff so that equal infinities compare as equal and the
    # mask has the right length when there are no rows at all.
    row_mask = np.ones(sorted_data.shape[0], dtype=bool)
    row_mask[1:] = np.any(sorted_data[1:] != sorted_data[:-1], axis=1)
    return sorted_data[row_mask]

In [9]:
# Add intermediate points: for every merchant with at least two transactions,
# take its distinct coordinates in lexsort order and emit the midpoint of each
# pair of consecutive points as an extra candidate location.
new_rows = []
# sort=False keeps merchants in order of first appearance (the same order that
# merchant_id.unique() yields) while avoiding a full-table scan per merchant.
for ID, group in tract.groupby('merchant_id', sort=False):
    coords = group[['latitude', 'longitude']].values
    if coords.shape[0] < 2:
        continue
    coords = lexsort_based_compressor(coords)
    # If all points of the merchant coincide, one row remains after
    # de-duplication and this produces zero midpoints (an empty (0, 3) chunk).
    midpoints = (coords[1:] + coords[:-1]) / 2
    ids = np.full((midpoints.shape[0], 1), ID)
    new_rows.append(np.hstack((ids, midpoints)))

# Columns: merchant_id, latitude, longitude. np.vstack raises on an empty
# list, so fall back to an empty array when no merchant qualified.
new_rows = np.vstack(new_rows) if new_rows else np.empty((0, 3))

In [11]:
# Append the generated midpoints to the transactions table. Columns other than
# merchant_id / latitude / longitude are left missing for the new rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# NOTE: this reassigns `tract`, so re-running the cell appends the rows again.
tract = pd.concat(
    [tract, pd.DataFrame(new_rows, columns=['merchant_id', 'latitude', 'longitude'])],
    ignore_index=True,
)
# Stacking ids next to float coordinates turned them into floats; restore ints.
tract['merchant_id'] = tract['merchant_id'].astype(int)

In [20]:
vals = tract.merchant_id.values
train_id = dtrain.merchant_id.unique()

# Dense integer code per merchant, numbered in order of first appearance in
# `tract` (one vectorised pass instead of one full scan per merchant).
inds = pd.factorize(vals)[0]

# Boolean mask over `tract` rows: True where the merchant is in the train table.
train_inds = tract.merchant_id.isin(train_id).values

# For every train row of `tract`, the row position in `dtrain` of its merchant,
# so that dtrain.<column>.values[inds_tr] lines up with tract[train_inds].
# The first occurrence is used, which equals the position within `train_id`
# when dtrain.merchant_id has no duplicates and stays aligned with dtrain rows
# if it does.
train_vals = vals[train_inds]
sorted_ids, first_row = np.unique(dtrain.merchant_id.values, return_index=True)
inds_tr = first_row[np.searchsorted(sorted_ids, train_vals)]

In [21]:
# Count neighbours: for every point, the number of OTHER points of the same
# merchant whose latitude and longitude both differ by at most NEIGHBOR_TOL,
# plus the same count divided by the merchant's total number of points.
NEIGHBOR_TOL = 0.002  # per-axis tolerance, in the same units as the coordinates

lat_all = tract['latitude'].values
lon_all = tract['longitude'].values
neighbors_number = np.zeros(len(tract))
neighbors_number_norm = np.zeros(len(tract))

# `.indices` maps each merchant to the integer positions of its rows, so each
# merchant is handled without a full-table scan.
for rows in tract.groupby('merchant_id').indices.values():
    lat = lat_all[rows]
    lon = lon_all[rows]
    # Pairwise closeness matrix, built one axis at a time to limit peak memory.
    close = np.abs(lat[:, None] - lat[None, :]) <= NEIGHBOR_TOL
    close &= np.abs(lon[:, None] - lon[None, :]) <= NEIGHBOR_TOL
    # A point is not its own neighbour.
    np.fill_diagonal(close, False)
    counts = close.sum(axis=0).astype(float)
    neighbors_number[rows] = counts
    neighbors_number_norm[rows] = counts / len(rows)

tract['neighbors_number'] = neighbors_number
tract['neighbors_number_norm'] = neighbors_number_norm

In [22]:
# Per-merchant summary statistics of each coordinate, broadcast back onto
# every row of that merchant (lat_mean, lat_median, lat_min, lat_max, lon_*).
for prefix, column in (('lat', 'latitude'), ('lon', 'longitude')):
    per_merchant = tract.groupby(tract.merchant_id)[column].agg(['mean', 'median', 'min', 'max'])
    for stat in ('mean', 'median', 'min', 'max'):
        tract[prefix + '_' + stat] = tract.merchant_id.map(per_merchant[stat])

# Extent of the merchant's points along each axis.
tract['lat_max_min'] = tract['lat_max'] - tract['lat_min']
tract['lon_max_min'] = tract['lon_max'] - tract['lon_min']

# Absolute per-axis offset of each point from the merchant's median / mean.
tract['lat_median_dist'] = (tract['latitude'] - tract['lat_median']).abs()
tract['lat_mean_dist'] = (tract['latitude'] - tract['lat_mean']).abs()

tract['lon_median_dist'] = (tract['longitude'] - tract['lon_median']).abs()
tract['lon_mean_dist'] = (tract['longitude'] - tract['lon_mean']).abs()

# Sum of squared offsets from the median (no square root is taken).
tract['median_dist_l2'] = tract['lat_median_dist'].pow(2) + tract['lon_median_dist'].pow(2)

# Sum of absolute offsets from the median.
tract['median_dist_l1'] = tract['lat_median_dist'] + tract['lon_median_dist']

# Larger and smaller of the two per-axis offsets, for median and for mean.
median_offsets = tract[['lat_median_dist', 'lon_median_dist']]
mean_offsets = tract[['lat_mean_dist', 'lon_mean_dist']]

tract['median_dist_l1_max'] = median_offsets.max(axis=1)
tract['mean_dist_l1_max'] = mean_offsets.max(axis=1)

tract['median_dist_l1_min'] = median_offsets.min(axis=1)
tract['mean_dist_l1_min'] = mean_offsets.min(axis=1)

In [24]:
# Columns of `tract` fed to the classifier, in model input order.
feature_names = [
    # per-axis offsets from the merchant's median / mean
    'lat_median_dist',
    'lat_mean_dist',
    'lon_median_dist',
    'lon_mean_dist',
    # combined offsets from the median
    'median_dist_l2',
    'median_dist_l1',
    'median_dist_l1_max',
    # per-axis extent of the merchant's points
    'lat_max_min',
    'lon_max_min',
    # remaining max / min offset combinations
    'mean_dist_l1_max',
    'median_dist_l1_min',
    'mean_dist_l1_min',
    # neighbour counts
    'neighbors_number',
    'neighbors_number_norm',
]

In [25]:
# Feature matrices: rows of train merchants vs. all remaining rows.
X_train = tract.loc[train_inds, feature_names]
X_test = tract.loc[~train_inds, feature_names]

# Label is 1 when a candidate point lies within 0.002 of the merchant's
# coordinates from the train table on BOTH axes, else 0.
candidate_points = tract.loc[train_inds, ['latitude', 'longitude']].values
reference_points = dtrain[['latitude', 'longitude']].values[inds_tr]
y_train = np.all(np.abs(candidate_points - reference_points) <= 0.002, axis=1).astype(int)

In [27]:
# Candidate points of the non-train merchants (same rows as X_test). Taken as
# an explicit copy so that adding the prediction column later modifies this
# frame only and never triggers chained-assignment warnings against `tract`.
coord_test = tract.loc[np.logical_not(train_inds), ['merchant_id', 'latitude', 'longitude']].copy()

In [None]:
from xgboost.sklearn import XGBClassifier

In [None]:
# Binary classifier scoring each candidate point with the probability that it
# matches the merchant's location (label definition: see y_train).
# `eval_metric` is set on the estimator: passing it to fit() is deprecated and
# no longer accepted by xgboost 2.x. No eval_set is supplied, so the metric
# does not influence training.
xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=11,
    min_child_weight=1,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    eval_metric='auc',
    seed=27,
    nthread=4)

xgb.fit(X_train, y_train)
answer = xgb.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the positive class.
coord_test['p'] = answer[:, 1]

In [10]:
# For every test merchant take the candidate point with the highest predicted
# probability. Merchants without any candidate point (or whose best point has
# missing coordinates) get 0 for both coordinates.
dtest['_ID_'] = dtest.merchant_id.values

# One row per merchant: the row of `coord_test` where `p` is maximal.
best_rows = coord_test.loc[coord_test.groupby('merchant_id')['p'].idxmax()]
best_by_merchant = best_rows.set_index('merchant_id')

# Assign the filled result back instead of calling fillna(inplace=True) on an
# attribute-accessed column, which does not reliably update the parent frame.
dtest['_LAT_'] = dtest.merchant_id.map(best_by_merchant['latitude']).fillna(0)
dtest['_LON_'] = dtest.merchant_id.map(best_by_merchant['longitude']).fillna(0)

In [12]:
# Write the three output columns, without the index, to output.csv.
dtest.to_csv('output.csv', columns=['_ID_', '_LAT_', '_LON_'], index=False)

* public test: 0.2592
* private test: 0.2732