In [1]:
import pandas as pd
import numpy as np

In [2]:
from common.serialization import pickle_load, pickle_save

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,7)

In [5]:
# Train merchants: semicolon-separated, cp1251-encoded file.
dtrain = pd.read_csv(
    'data/merchants_train.csv',
    sep=';',
    encoding='cp1251',
)

In [6]:
# Test merchants: same layout as the train file (semicolon-separated, cp1251).
dtest = pd.read_csv(
    'data/merchants_test.csv',
    sep=';',
    encoding='cp1251',
)

In [7]:
# Transaction log (one row per transaction); unlike the merchant files it is comma-separated.
tract = pd.read_csv(
    'data/transactions.csv',
    sep=',',
    encoding='cp1251',
)

In [7]:
tract.describe()

Unnamed: 0,merchant_id,latitude,longitude
count,249353.0,249353.0,249353.0
mean,444104.7,49.761256,33.758102
std,250658.9,17.722934,17.774541
min,178.0,-83.445345,-176.15565
25%,363986.0,55.604737,30.382891
50%,410295.0,55.750347,37.594847
75%,587922.0,55.855153,37.661401
max,1934268.0,84.830761,171.866882


In [9]:
dtrain.describe()

Unnamed: 0,merchant_id,latitude,longitude
count,6482.0,6482.0,6482.0
mean,513126.5,56.15635,40.0825
std,272869.5,2.912782,12.433761
min,178.0,40.143636,20.463191
25%,403326.2,55.690821,37.402091
50%,423620.5,55.775864,37.604161
75%,669633.0,56.774367,37.844208
max,1869302.0,68.970401,142.730432


In [7]:
tract.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249353 entries, 0 to 249352
Data columns (total 5 columns):
merchant_id              249353 non-null int64
latitude                 249353 non-null float64
longitude                249353 non-null float64
real_transaction_dttm    249353 non-null object
record_date              249353 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 9.5+ MB


In [8]:
# Keep only transactions inside a coarse bounding box: 38 < latitude < 70, longitude > 19.
# NOTE(review): longitude has no upper bound here - confirm that this is intended.
# .copy() makes the filtered frame independent of the raw one, so the feature columns
# added to `tract` in later cells are not written to a slice (SettingWithCopyWarning).
# The original row labels are kept (no reset_index), later cells rely on them.
tract = tract[(tract.latitude > 38) & (tract.longitude > 19) & (tract.latitude < 70)].copy()

In [9]:
# Lookup tables linking every transaction row to its merchant.
#
#   inds       - for each row of `tract`, the rank of its merchant_id among the sorted
#                unique ids. groupby(merchant_id) emits its groups in that same sorted
#                order, so `<groupby aggregate>.values[inds]` broadcasts a per-merchant
#                statistic back onto the rows. (Numbering merchants by first appearance,
#                as this cell did before, only matches the groupby order when `tract`
#                happens to be sorted by merchant_id.)
#   train_inds - boolean mask over the rows of `tract`: merchant is present in dtrain.
#   inds_tr    - for each train row, the position of its merchant in `train_id`.
#
# All three are computed vectorised instead of scanning the whole table once per merchant.
vals = tract.merchant_id.values
train_id = dtrain.merchant_id.unique()

_, inds = np.unique(vals, return_inverse=True)
train_inds = tract.merchant_id.isin(train_id).values

train_vals = vals[train_inds]
# Every value of train_vals is in train_id by construction, so get_indexer never returns -1.
inds_tr = pd.Index(train_id).get_indexer(train_vals)

In [29]:
def f(x):
    """Collapse a colon-separated triple of integers into one number.

    The three fields are weighted by 24 * 60, 60 and 1 respectively and summed.
    NOTE(review): these weights turn a "days:hours:minutes" string into minutes;
    if the input is meant to be "HH:MM:SS" the weights would need to be
    3600, 60, 1 - confirm against the timestamp format before using this.
    """
    fields = np.array(x.split(':')).astype(int)
    weights = np.array([24 * 60, 60, 1])
    return fields.dot(weights)

In [30]:
len(tract.groupby(tract.merchant_id).latitude.std().values)

9929

In [32]:
len(tract.groupby(tract.merchant_id).latitude.min().values)

9929

In [10]:
# Per-merchant summary statistics of the transaction coordinates, repeated on every
# row of that merchant. Creates lat_mean, lat_median, lat_min, lat_max, lat_std and
# the matching lon_* columns (in that order).
#
# transform() returns a result aligned with tract's own index, so the broadcast no
# longer depends on a hand-built position array matching the sorted group order.
# std is NaN for merchants with a single transaction.
for coord_col, prefix in (('latitude', 'lat'), ('longitude', 'lon')):
    for stat in ('mean', 'median', 'min', 'max', 'std'):
        tract[prefix + '_' + stat] = tract.groupby('merchant_id')[coord_col].transform(stat)

In [11]:
#tract = tract[np.logical_and((np.abs(tract.latitude.values - tract.lat_median.values) < 3 * tract.lat_std.values),
#              (np.abs(tract.longitude.values - tract.lon_median.values) < 3 * tract.lon_std.values))]

In [12]:
# Rebuild the merchant lookup tables and the per-merchant statistics. This repeats the
# two earlier cells and only changes anything if `tract` was filtered in between
# (the outlier filter in the cell above is currently commented out).
#
#   inds       - for each row, rank of its merchant_id among the sorted unique ids,
#                i.e. the position of its group in a groupby(merchant_id) result.
#                (Numbering by first appearance only matches that order when `tract`
#                is sorted by merchant_id.)
#   train_inds - boolean mask over rows: merchant is present in dtrain.
#   inds_tr    - for each train row, position of its merchant in `train_id`.
vals = tract.merchant_id.values
train_id = dtrain.merchant_id.unique()

_, inds = np.unique(vals, return_inverse=True)
train_inds = tract.merchant_id.isin(train_id).values

train_vals = vals[train_inds]
# Every value of train_vals is in train_id by construction, so get_indexer never returns -1.
inds_tr = pd.Index(train_id).get_indexer(train_vals)

# Per-merchant statistics broadcast to rows (lat_mean ... lat_std, lon_mean ... lon_std).
# transform() is index-aligned, so it does not rely on `inds` at all.
for coord_col, prefix in (('latitude', 'lat'), ('longitude', 'lon')):
    for stat in ('mean', 'median', 'min', 'max', 'std'):
        tract[prefix + '_' + stat] = tract.groupby('merchant_id')[coord_col].transform(stat)

In [11]:
# Per-merchant coordinate spread (max - min). A spread of exactly 0 is replaced by 1
# so that the normalised distances below never divide by zero.
lat_max_min = tract['lat_max'] - tract['lat_min']
lon_max_min = tract['lon_max'] - tract['lon_min']
lat_max_min = lat_max_min.where(lat_max_min != 0, 1)
lon_max_min = lon_max_min.where(lon_max_min != 0, 1)

# Absolute distance of every transaction from its merchant's median / mean coordinate,
# raw and divided by the merchant's spread along the same axis.
for axis, coord_col, spread in (('lat', 'latitude', lat_max_min),
                                ('lon', 'longitude', lon_max_min)):
    for center in ('median', 'mean'):
        dist_col = axis + '_' + center + '_dist'
        tract[dist_col] = np.abs(tract[coord_col].values - tract[axis + '_' + center].values)
        tract[dist_col + '_norm'] = tract[dist_col] / spread

# Combined distance to the median point. Note that "l2" is the *squared* Euclidean
# distance (no square root is taken); "l1" is the sum of the two absolute offsets.
tract['median_dist_l2'] = tract['lat_median_dist'] ** 2 + tract['lon_median_dist'] ** 2
tract['median_dist_l2_norm'] = tract['median_dist_l2'] / (lat_max_min ** 2 + lon_max_min ** 2)

tract['median_dist_l1'] = tract['lat_median_dist'] + tract['lon_median_dist']
tract['median_dist_l1_norm'] = tract['median_dist_l1'] / (lat_max_min + lon_max_min)

# Flags: transaction lies strictly inside a square of half-width 0.002 / 0.004
# around the merchant's median coordinate.
for suffix, half_width in (('2', 0.002), ('4', 0.004)):
    tract['in_median_rec' + suffix] = (
        (tract['lat_median_dist'] < half_width) & (tract['lon_median_dist'] < half_width)
    )

In [46]:
y_train[-30:]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [43]:
len(y_train)

142762

In [40]:
np.corrcoef(tract.loc[train_inds].in_median_rec2.values, y_train)

array([[ 1.        ,  0.12108473],
       [ 0.12108473,  1.        ]])

In [44]:
tract.loc[train_inds, 'in_median_rec2']

1          True
3          True
4          True
7          True
8         False
9         False
10        False
11        False
12        False
13         True
14        False
16        False
18        False
19         True
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29         True
30        False
31        False
32        False
33         True
34        False
35        False
          ...  
249309    False
249310    False
249311    False
249312    False
249313    False
249315    False
249316    False
249317    False
249318    False
249319    False
249320    False
249321    False
249322     True
249323    False
249324     True
249325     True
249326     True
249327     True
249328     True
249329     True
249330     True
249331     True
249332     True
249333     True
249334    False
249335    False
249336    False
249338    False
249339    False
249340    False
Name: in_median_rec2, dt

In [12]:
# Columns of `tract` used as model inputs, grouped by kind:
# per-axis distances, combined distances, in-rectangle flags, per-merchant spread.
feature_names = (
    ['lat_median_dist', 'lat_mean_dist', 'lon_median_dist', 'lon_mean_dist']
    + ['median_dist_l2', 'median_dist_l1']
    + ['in_median_rec2', 'in_median_rec4']
    + ['lat_std', 'lon_std']
)

In [13]:
# Feature matrices: rows of train merchants vs. all remaining rows.
X_train = tract.loc[train_inds, feature_names]
X_test = tract.loc[np.logical_not(train_inds), feature_names]

# Binary target: 1 when the transaction lies within 0.002 (inclusive) of the merchant's
# true coordinate on both axes, else 0.
#
# `inds_tr` holds positions in dtrain.merchant_id.unique(), so the truth must be looked
# up in a frame with exactly one row per merchant in that same first-appearance order.
# Indexing dtrain itself by these positions is only right when dtrain has no repeated
# merchant_id; drop_duplicates(keep first) gives the same result in that case and keeps
# the lookup aligned otherwise.
train_truth = dtrain.drop_duplicates(subset='merchant_id')
lat_error = np.abs(tract.loc[train_inds, 'latitude'].values - train_truth.latitude.values[inds_tr])
lon_error = np.abs(tract.loc[train_inds, 'longitude'].values - train_truth.longitude.values[inds_tr])
y_train = np.logical_and(lat_error <= 0.002, lon_error <= 0.002).astype(int)

In [14]:
# Merchant id and raw coordinates of every non-train transaction.
# Explicit copy: a later cell adds a score column `p` to this frame, which must not be
# treated as a write into a slice of `tract`.
coord_test = tract.loc[np.logical_not(train_inds), ['merchant_id', 'latitude', 'longitude']].copy()

In [14]:
# Persist the model inputs. The last file holds the merchant_id of every non-train
# row, in the same row order as X_test.
test_row_ids = tract.loc[np.logical_not(train_inds), 'merchant_id']
for payload, target_path in (
    (X_train, 'data/task2_data/dtrain.pkl'),
    (X_test, 'data/task2_data/dtest.pkl'),
    (y_train, 'data/task2_data/y_train.pkl'),
    (test_row_ids, 'data/task2_data/_ID_.pkl'),
):
    pickle_save(payload, target_path)

In [15]:
# Feature-importance table of run 2_2; the file has no header row, so the
# columns are numbered 0 (feature name) and 1 (importance).
feat_imp = pd.read_csv(
    'task2_outputs/feat_imp2_2.csv',
    header=None,
)

In [16]:
feat_imp.head(10)

Unnamed: 0,0,1
0,in_median_rec2,14.098823
1,median_dist_l1,9.302887
2,median_dist_l2,5.921181
3,lon_median_dist,4.867092
4,lat_median_dist,4.860608
5,lon_std,4.667937
6,lat_std,4.207646
7,lat_mean_dist,4.049809
8,lon_mean_dist,3.949497
9,in_median_rec4,3.447172


In [17]:
# Per-row model output of run 2_2; later cells read its _ID_ and _VAL_ columns.
prob = pd.read_csv(
    'task2_outputs/output_xgboost2_2.csv',
)

In [20]:
# Attach the model score to every test transaction. `.values` makes this a purely
# positional assignment (index labels are ignored).
# NOTE(review): assumes the rows of `prob` are in the same order as the rows of
# coord_test - confirm against the script that produced the output file.
coord_test['p'] = prob._VAL_.values

In [18]:
# Highest score per merchant; only the index (the set of scored merchant ids)
# is consumed by the prediction loop further down.
a = prob.groupby('_ID_')['_VAL_'].max()

In [19]:
len(prob.groupby(prob._ID_))

3500

In [28]:
len(lat_med)

3524

In [21]:
# Build the submission columns on dtest.
#
# 1) Fallback: the per-merchant median of the transaction coordinates.
#    reindex() yields NaN for test merchants that have no rows in `tract`; selecting
#    with a list of labels that contains missing keys raises KeyError on current pandas.
test_ids = dtest.merchant_id.values
lat_med = tract.groupby(tract.merchant_id).latitude.median().reindex(test_ids).values
lon_med = tract.groupby(tract.merchant_id).longitude.median().reindex(test_ids).values
dtest['_ID_'] = test_ids
dtest['_LAT_'] = lat_med
dtest['_LON_'] = lon_med

# 2) For every scored merchant, overwrite the fallback with the coordinates of its
#    highest-scoring transaction. idxmax() returns the row *label*, which is what .loc
#    needs; Series.argmax() returns a position on current pandas and would pick the
#    wrong row (or fail), because coord_test keeps tract's original row labels.
for ID in a.index.values:
    merchant_scores = coord_test.loc[coord_test.merchant_id == ID, 'p']
    best_label = merchant_scores.idxmax()
    best_lat, best_lon = coord_test.loc[best_label, ['latitude', 'longitude']].values
    is_this_merchant = dtest.merchant_id == ID
    dtest.loc[is_this_merchant, '_LAT_'] = best_lat
    dtest.loc[is_this_merchant, '_LON_'] = best_lon

# 3) Merchants with neither a score nor a median get 0. Assigning the result back
#    (instead of an in-place fillna on an attribute access) guarantees dtest is updated.
dtest['_LAT_'] = dtest['_LAT_'].fillna(0)
dtest['_LON_'] = dtest['_LON_'].fillna(0)

In [27]:
# Earlier submission file, loaded to compare against the predictions held in dtest.
dtest2 = pd.read_csv(
    'task2_outputs/output1.csv',
)

In [29]:
dtest2._LAT_ != dtest._LAT_

0        True
1       False
2        True
3        True
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13       True
14      False
15      False
16      False
17      False
18       True
19      False
20       True
21       True
22       True
23      False
24       True
25      False
26      False
27      False
28      False
29       True
        ...  
3494    False
3495     True
3496    False
3497    False
3498     True
3499     True
3500    False
3501    False
3502     True
3503     True
3504    False
3505    False
3506    False
3507     True
3508    False
3509    False
3510    False
3511    False
3512    False
3513     True
3514     True
3515     True
3516    False
3517     True
3518     True
3519    False
3520     True
3521     True
3522    False
3523     True
Name: _LAT_, dtype: bool

In [35]:
dtest.head(10)

Unnamed: 0,merchant_id,latitude,longitude,_ID_,_LAT_,_LON_
0,361,,,361,56.824988,60.675723
1,428,,,428,55.69175,37.535619
2,490,,,490,59.925744,30.31449
3,1175,,,1175,55.740985,37.626249
4,1239,,,1239,55.800972,37.543166
5,1476,,,1476,59.908242,30.483399
6,1601,,,1601,59.944429,30.27326
7,1628,,,1628,43.446689,39.950644
8,1676,,,1676,46.339428,48.023002
9,1916,,,1916,56.303634,43.946093


In [36]:
dtest2.head(10)

Unnamed: 0,_ID_,_LAT_,_LON_
0,361,56.807831,60.611228
1,428,55.69175,37.535619
2,490,59.875699,30.361785
3,1175,55.743663,37.565374
4,1239,55.800972,37.543166
5,1476,59.908242,30.483399
6,1601,59.944429,30.27326
7,1628,43.446689,39.950644
8,1676,46.339428,48.023002
9,1916,56.303634,43.946094


In [22]:
dtest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3524 entries, 0 to 3523
Data columns (total 6 columns):
merchant_id    3524 non-null int64
latitude       0 non-null float64
longitude      0 non-null float64
_ID_           3524 non-null int64
_LAT_          3524 non-null float64
_LON_          3524 non-null float64
dtypes: float64(4), int64(2)
memory usage: 165.3 KB


In [None]:
dtest

In [47]:
# Write only the three submission columns, without the row index.
dtest.to_csv(
    'task2_outputs/output2_2.csv',
    columns=['_ID_', '_LAT_', '_LON_'],
    index=False,
)

In [64]:
# Mean transaction coordinate per train merchant, in dtrain row order.
# reindex() gives NaN for merchants without any row in `tract`; selecting with a list
# of labels that contains missing keys raises KeyError on current pandas.
train_ids = dtrain.merchant_id.values
lat_mean = tract.groupby(tract.merchant_id).latitude.mean().reindex(train_ids).values
lon_mean = tract.groupby(tract.merchant_id).longitude.mean().reindex(train_ids).values

In [65]:
# Median transaction coordinate per train merchant, in dtrain row order.
# NOTE: this reuses the names lat_med / lon_med that other cells fill with the
# test-merchant medians.
# reindex() gives NaN for merchants without any row in `tract`; selecting with a list
# of labels that contains missing keys raises KeyError on current pandas.
train_ids = dtrain.merchant_id.values
lat_med = tract.groupby(tract.merchant_id).latitude.median().reindex(train_ids).values
lon_med = tract.groupby(tract.merchant_id).longitude.median().reindex(train_ids).values

In [82]:
# Median-only baseline: predict each test merchant's location as the median of its
# transaction coordinates, or 0 when the merchant has no rows in `tract`.
dtest['_ID_'] = dtest.merchant_id
test_ids = dtest.merchant_id.values
# reindex() yields NaN for missing merchants; list-of-labels selection with missing
# keys raises KeyError on current pandas.
lat_med = tract.groupby(tract.merchant_id).latitude.median().reindex(test_ids).values
lon_med = tract.groupby(tract.merchant_id).longitude.median().reindex(test_ids).values
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute access, which is not guaranteed to modify dtest.
dtest['_LAT_'] = pd.Series(lat_med, index=dtest.index).fillna(0)
dtest['_LON_'] = pd.Series(lon_med, index=dtest.index).fillna(0)
# Leave only the submission columns. This is destructive: once merchant_id is gone,
# this cell (and any other cell reading dtest.merchant_id) cannot be re-run without
# reloading dtest.
for dropped_col in ('merchant_id', 'latitude', 'longitude'):
    del dtest[dropped_col]

In [84]:
# Write the median baseline; every column currently in dtest is written (the cell
# above leaves only _ID_, _LAT_, _LON_), without the row index.
dtest.to_csv('task2_outputs/output_median.csv', index=False)

In [19]:
# Copy each train merchant's true latitude onto all of its transactions; rows whose
# merchant is not in dtrain get 0.
#
# One map() over the column replaces a full scan of `tract` per dtrain row, and the
# column is float from the start instead of an int column that is upcast on first write.
# keep='last' mirrors the loop it replaces, where a later dtrain row for the same
# merchant_id overwrote an earlier one.
true_lat_by_id = (
    dtrain.drop_duplicates(subset='merchant_id', keep='last')
          .set_index('merchant_id')['latitude']
)
has_truth = tract.merchant_id.isin(true_lat_by_id.index)
tract['real_latitude'] = tract.merchant_id.map(true_lat_by_id).where(has_truth, 0.0)

In [21]:
?np.corrcoef

* сводить к бинарной классификации
* выкинуть самое частое

1
* ['lat_median_dist', 'lat_mean_dist', 'lon_median_dist', 'lon_mean_dist',
   'median_dist_l2', 'median_dist_l1']
* {'seed': 27, 'min_child_weight': 1, 'gamma': 0.0, 'learning_rate': 0.01, 'reg_lambda': 1, 'colsample_bytree': 0.8, 'subsample': 0.8, 'base_score': 0.5, 'max_delta_step': 0, 'silent': 1, 'colsample_bylevel': 1, 'reg_alpha': 0, 'max_depth': 5, 'nthread': 4, 'missing': None, 'objective': 'binary:logistic', 'scale_pos_weight': 1, 'n_estimators': 5000}
* max_depth можно побольше
* validation - 0.158518 (logloss предсказания попадания в прямоугольник)
* test - 0.1902


2
* ['lat_median_dist', 'lat_mean_dist', 'lon_median_dist', 'lon_mean_dist','median_dist_l2', 
   'median_dist_l1', 'in_median_rec2', 'in_median_rec4', 'lat_std', 'lon_std']
* tract = tract[np.logical_and((np.abs(tract.latitude.values - tract.lat_median.values) < 3 *  tract.lat_std.values),
 (np.abs(tract.longitude.values - tract.lon_median.values) < 3 * tract.lon_std.values))]

* {'colsample_bylevel': 1, 'reg_alpha': 0, 'base_score': 0.5, 'min_child_weight': 1, 'nthread': 4, 'colsample_bytree': 0.8, 'silent': 1, 'learning_rate': 0.01, 'reg_lambda': 1, 'max_delta_step': 0, 'max_depth': 9, 'subsample': 0.8, 'objective': 'binary:logistic', 'missing': None, 'scale_pos_weight': 1, 'n_estimators': 5000, 'gamma': 0.0, 'seed': 27}
* max_depth можно больше
* validation - 0.105192 0.002144
* test - 0.1873

2_2
* ['lat_median_dist', 'lat_mean_dist', 'lon_median_dist', 'lon_mean_dist','median_dist_l2', 
   'median_dist_l1', 'in_median_rec2', 'in_median_rec4', 'lat_std', 'lon_std']
* {'colsample_bylevel': 1, 'reg_alpha': 0, 'base_score': 0.5, 'min_child_weight': 1, 'nthread': 4, 'colsample_bytree': 0.8, 'silent': 1, 'learning_rate': 0.01, 'reg_lambda': 1, 'max_delta_step': 0, 'max_depth': 9, 'subsample': 0.8, 'objective': 'binary:logistic', 'missing': None, 'scale_pos_weight': 1, 'n_estimators': 5000, 'gamma': 0.0, 'seed': 27}
* max_depth можно больше
* validation - 0.102746 0.002802
* test - 0.1854