In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import geopandas # Геоданные
from shapely import wkt, wkb # Для перевода геоданных из 16-ричного формата
from shapely.geometry import Point, Polygon # импорт геометрии
from matplotlib import pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


/kaggle/input/train-test/xakaton_train.csv
/kaggle/input/train-test/xakaton_test.csv
/kaggle/input/train-new/train_new.csv
/kaggle/input/all-files/velocity.csv
/kaggle/input/all-files/set.csv
/kaggle/input/all-files/remont.csv
/kaggle/input/all-files/camers.csv
/kaggle/input/final-train-test/final_test_with_all_features.csv
/kaggle/input/final-train-test/final_train_with_all_features.csv


# Import data

In [2]:
# The velocity table is ';'-separated and uses ',' as the decimal mark.
velocity = pd.read_csv(
    '../input/all-files/velocity.csv',
    sep=';',
    decimal=",",
)
# The other reference tables share the ';' separator and need no further options.
sets, remont, camers = (
    pd.read_csv(table_path, sep=';')
    for table_path in (
        '../input/all-files/set.csv',
        '../input/all-files/remont.csv',
        '../input/all-files/camers.csv',
    )
)
# low_memory=False makes pandas read the file in one pass instead of chunk by
# chunk, so column dtypes are inferred from the whole column.
train_new = pd.read_csv('../input/train-new/train_new.csv', low_memory=False)

In [3]:
# Load the train and test tables with pandas' default CSV settings.
xakaton_train = pd.read_csv('../input/train-test/xakaton_train.csv')
xakaton_test = pd.read_csv('../input/train-test/xakaton_test.csv')

# Merge data

In [4]:
# Train: replace missing values with 0, then attach the `train_new` columns.
# This is an inner join on `id`, so train rows with no match in `train_new`
# are dropped.
xakaton_train_with_graf = xakaton_train.fillna(0).merge(train_new, on='id')
print('xakaton_train_with_graf: ', xakaton_train_with_graf.shape)

# Attach the `sets` columns by matching `id` against `EDGEID`.
# `right_index=True` is intentionally NOT passed: together with `right_on` it
# either raises a MergeError or (depending on the pandas version) discards
# `right_on` and joins `id` against the row number of `sets`.
# validate='many_to_one' makes the merge fail loudly if `EDGEID` is duplicated
# in `sets`; duplicates would multiply rows and break the row-for-row
# correspondence that later cells rely on.
xakaton_train_with_graf_with_set = xakaton_train_with_graf.merge(
    sets,
    left_on='id',
    right_on='EDGEID',
    how='left',
    validate='many_to_one',
)
print('xakaton_train_with_graf_with_set: ', xakaton_train_with_graf_with_set.shape)

# Test: left joins only, so every test row is kept even without a match.
xakaton_test_with_graf = xakaton_test.merge(train_new, on='id', how='left')
print('xakaton_test_with_graf: ', xakaton_test_with_graf.shape)
xakaton_test_with_graf_with_set = xakaton_test_with_graf.merge(
    sets,
    left_on='id',
    right_on='EDGEID',
    how='left',
    validate='many_to_one',
)
print('xakaton_test_with_graf_with_set: ', xakaton_test_with_graf_with_set.shape)

xakaton_train_with_graf:  (875478, 36)
xakaton_train_with_graf_with_set:  (875478, 39)
xakaton_test_with_graf:  (87330, 35)
xakaton_test_with_graf_with_set:  (87330, 38)


# Создадим необходимые столбцы в обучающей и тестовой выборках

In [5]:
# Add the three placeholder feature columns, initialised to 0, to both the
# train and the test frame.
for frame in (xakaton_train_with_graf_with_set, xakaton_test_with_graf_with_set):
    for column in ('avg_check_speed', 'cnt_fix', 'closed_lines'):
        frame[column] = 0

# Для работы с координатами используем geopandas, shapely 

In [7]:
# Three independent, empty GeoDataFrames that will hold the geometry columns.
dataset_train_geom, dataset_test_geom, remont_geom = (
    geopandas.GeoDataFrame() for _ in range(3)
)

In [8]:
# Parse the text in `geom_buffer` (read as WKT) into shapely geometries and
# store them as a GeoSeries so that spatial predicates such as `.contains`
# are available.
# Both geometry columns are built from the `*_with_graf_with_set` frames, i.e.
# the same frames the camera statistics are written to, so geometry row i
# always describes feature row i.
dataset_train_geom['geometry_road'] = geopandas.GeoSeries(
    xakaton_train_with_graf_with_set['geom_buffer'].apply(wkt.loads)
)
dataset_test_geom['geometry_road'] = geopandas.GeoSeries(
    xakaton_test_with_graf_with_set['geom_buffer'].apply(wkt.loads)
)

# Merge данных по камерам и создание столбца с геометрией камер

In [9]:
# Inner-join the camera table with the velocity table
# (`camers.camera` == `velocity.camera_id`).
data_camera = pd.merge(camers, velocity, left_on='camera', right_on='camera_id')

# Collapse to one row per camera: mean of the speed and fixation columns, and
# the first seen x / y value.
all_camers = data_camera.groupby(['camera'], as_index=False).agg(
    {
        'avg_check_speed': 'mean',
        'cnt_fix': 'mean',
        'x': 'first',
        'y': 'first',
    }
)


In [10]:
# One shapely Point per aggregated camera row. The `y` column is passed as the
# first coordinate and `x` as the second.
# NOTE(review): presumably `x` holds latitude and `y` longitude - confirm against the data.
lst_of_camers = [
    Point(y_value, x_value)
    for y_value, x_value in zip(all_camers.y, all_camers.x)
]

cam = geopandas.GeoDataFrame()
cam['geometry_cams'] = geopandas.GeoSeries(lst_of_camers)
    

# Проверяем, содержится ли геометрия камеры (точка) в геометрии дорог (полигон), если содержится, приписываем данные с камер к соответствующей дороге

In [11]:
%%time
for i in range(len(cam['geometry_cams'])):
    xakaton_train_with_graf_with_set['avg_check_speed'].loc[dataset_train_geom['geometry_road'].contains(cam['geometry_cams'][i])] = all_camers['avg_check_speed'][i]
    xakaton_train_with_graf_with_set['cnt_fix'].loc[dataset_train_geom['geometry_road'].contains(cam['geometry_cams'][i])] = all_camers['cnt_fix'][i]
    xakaton_test_with_graf_with_set['avg_check_speed'].loc[dataset_test_geom['geometry_road'].contains(cam['geometry_cams'][i])] = all_camers['avg_check_speed'][i]
    xakaton_test_with_graf_with_set['cnt_fix'].loc[dataset_test_geom['geometry_road'].contains(cam['geometry_cams'][i])] = all_camers['cnt_fix'][i]
    if i % 25 == 0:
        print(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


0
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575
600
625
650
675
700
725
750
775
800
825
850
875
900
925
950
975
1000
1025
1050
1075
1100
1125
1150
1175
CPU times: user 3h 39min 47s, sys: 3.32 s, total: 3h 39min 51s
Wall time: 3h 39min 52s


In [15]:
# Persist the enriched train frame to the working directory (the row index is
# written as the first, unnamed CSV column).
xakaton_train_with_graf_with_set.to_csv(
    path_or_buf='xakaton_train_with_graf_with_set_vrycnyu.csv',
)

In [16]:
# Persist the enriched test frame to the working directory (the row index is
# written as the first, unnamed CSV column).
xakaton_test_with_graf_with_set.to_csv(
    path_or_buf='xakaton_test_with_graf_with_set_vrycnyu.csv',
)