In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.metrics import mean_squared_error


In [2]:
df = pd.read_csv('data/train.csv')
df.shape

(4219370, 21)

In [3]:
# filter incorrect
df = df[(df['distance'] > 0) & (df['y'] > 0)]
df['avg'] = df['distance'] / df['y']
df = df[df['avg'] <= 100]

In [4]:
df.head()

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,vidsobst,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg
0,wagonwagonwagontutu:)wagon,wagondigitalnewyearrailway:)happy,2021,11,45,12,14,3399.0,4.0,1,...,102.0,930.0,1098,5476,13,13,78,75,185.2,5.021598
1,railwaypgkhappyrailway:)railway,digitalhappynewyeardigital:)wagon,2021,4,15,17,16,2261.0,3.0,0,...,111.0,16.0,2974,6682,21,21,111,111,0.25,64.0
2,railwaytutupgkpgk:)digital,pgkdigitalrailway:):)happy,2021,5,17,2,7,1089.0,3.0,1,...,111.0,879.0,2082,10729,9,9,67,23,109.133333,8.054368
3,tuturailwaydigital2022:)digital,newyearnewyear20222022:)2022,2020,11,47,17,11,2261.0,4.0,0,...,103.0,8569.0,2974,10571,28,28,115,112,214.093056,40.024652
4,railwaytutupgkpgk:)digital,pgkhappyrailwayhack:)railway,2020,12,52,25,5,1666.0,4.0,1,...,101.0,656.0,2082,9225,9,9,67,16,104.1,6.301633


In [5]:
df.columns

Index(['st_code_snd', 'st_code_rsv', 'date_depart_year', 'date_depart_month',
       'date_depart_week', 'date_depart_day', 'date_depart_hour', 'fr_id',
       'route_type', 'is_load', 'rod', 'common_ch', 'vidsobst', 'distance',
       'snd_org_id', 'rsv_org_id', 'snd_roadid', 'rsv_roadid', 'snd_dp_id',
       'rsv_dp_id', 'y', 'avg'],
      dtype='object')

In [6]:
def join_s_d(row):
    return f"{row['snd_dp_id']} {row['rsv_dp_id']} {row['distance']}"

df['reg_s_d'] = df.apply(join_s_d, axis=1)

In [7]:
region_s_d_unique = df['reg_s_d'].unique().tolist()
region_s_d_unique

['78 75 930.0',
 '111 111 16.0',
 '67 23 879.0',
 '115 112 8569.0',
 '67 16 656.0',
 '10 67 1021.0',
 '59 111 4480.0',
 '67 67 209.0',
 '126 125 1680.0',
 '121 1 3832.0',
 '29 70 1676.0',
 '67 67 276.0',
 '66 104 2827.0',
 '111 111 51.0',
 '22 22 76.0',
 '4 75 2641.0',
 '67 67 15.0',
 '29 111 3626.0',
 '16 19 270.0',
 '101 101 123.0',
 '67 67 304.0',
 '30 123 1091.0',
 '75 71 409.0',
 '111 111 40.0',
 '111 117 5900.0',
 '3 3 393.0',
 '111 4 4820.0',
 '29 104 1583.0',
 '29 31 1886.0',
 '1 111 4060.0',
 '117 17 9237.0',
 '105 102 945.0',
 '111 111 10.0',
 '30 7 1894.0',
 '110 111 204.0',
 '84 106 1102.0',
 '115 115 136.0',
 '29 29 5.0',
 '31 123 317.0',
 '117 121 5936.0',
 '73 73 5.0',
 '105 102 986.0',
 '102 102 72.0',
 '113 131 1044.0',
 '67 67 17.0',
 '121 111 302.0',
 '111 19 3667.0',
 '102 111 2102.0',
 '66 66 314.0',
 '29 28 535.0',
 '123 60 3525.0',
 '59 67 1215.0',
 '101 101 5.0',
 '67 67 4.0',
 '135 63 1235.0',
 '73 72 363.0',
 '67 67 7.0',
 '68 67 187.0',
 '4 4 262.0',
 '0 67 1

In [8]:
len(region_s_d_unique)

138505

In [9]:
region_s_d_unique = sorted(region_s_d_unique)

In [10]:
df = df.sort_values(['reg_s_d'])

In [11]:
df

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg,reg_s_d
1801904,railwaypgkpgknewyear:)happy,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,106.0,2974,1323,0,0,0,0,18.916667,5.603524,0 0 106.0
1711209,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2021,5,17,1,17,2261.0,4.0,0,...,106.0,2974,1323,0,0,0,0,31.766667,3.336831,0 0 106.0
1129045,happyrailwaynewyear:)tutu,newyearhappyrailway:)hack,2021,11,47,27,18,2261.0,4.0,0,...,106.0,1099,1099,0,0,0,0,28.450000,3.725835,0 0 106.0
2925222,pgk:)newyearpgk:)hack,pgkhack2022pgk:):),2022,3,9,6,6,2261.0,3.0,0,...,106.0,2974,1323,0,0,0,0,18.916667,5.603524,0 0 106.0
2961123,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,106.0,2974,1323,0,0,0,0,69.966667,1.515007,0 0 106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2481890,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,25,17,2261.0,4.0,0,...,3650.0,17401,1259,17,17,99,70,525.916667,6.940263,99 70 3650.0
777626,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,24,11,16,2261.0,4.0,0,...,3650.0,17401,2974,17,17,99,70,330.500000,11.043873,99 70 3650.0
3841305,wagonhappyrailwayhack:)tutu,newyearwagondigitaltutu:)pgk,2020,7,28,12,17,2261.0,4.0,0,...,3650.0,17401,1259,17,17,99,70,323.383333,11.286914,99 70 3650.0
3909129,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,22,16,2261.0,4.0,0,...,3650.0,17401,1259,17,17,99,70,289.116667,12.624661,99 70 3650.0


In [12]:
region_s_d_unique

['0 0 106.0',
 '0 0 109.0',
 '0 0 118.0',
 '0 0 119.0',
 '0 0 122.0',
 '0 0 125.0',
 '0 0 133.0',
 '0 0 146.0',
 '0 0 149.0',
 '0 0 159.0',
 '0 0 160.0',
 '0 0 164.0',
 '0 0 166.0',
 '0 0 168.0',
 '0 0 17.0',
 '0 0 172.0',
 '0 0 178.0',
 '0 0 184.0',
 '0 0 185.0',
 '0 0 187.0',
 '0 0 19.0',
 '0 0 190.0',
 '0 0 193.0',
 '0 0 197.0',
 '0 0 199.0',
 '0 0 202.0',
 '0 0 206.0',
 '0 0 213.0',
 '0 0 214.0',
 '0 0 221.0',
 '0 0 23.0',
 '0 0 230.0',
 '0 0 236.0',
 '0 0 238.0',
 '0 0 24.0',
 '0 0 244.0',
 '0 0 248.0',
 '0 0 26.0',
 '0 0 262.0',
 '0 0 272.0',
 '0 0 275.0',
 '0 0 276.0',
 '0 0 278.0',
 '0 0 296.0',
 '0 0 299.0',
 '0 0 3.0',
 '0 0 30.0',
 '0 0 306.0',
 '0 0 308.0',
 '0 0 313.0',
 '0 0 315.0',
 '0 0 316.0',
 '0 0 319.0',
 '0 0 32.0',
 '0 0 322.0',
 '0 0 329.0',
 '0 0 336.0',
 '0 0 341.0',
 '0 0 342.0',
 '0 0 35.0',
 '0 0 353.0',
 '0 0 359.0',
 '0 0 376.0',
 '0 0 385.0',
 '0 0 408.0',
 '0 0 41.0',
 '0 0 42.0',
 '0 0 428.0',
 '0 0 440.0',
 '0 0 449.0',
 '0 0 45.0',
 '0 0 454.0',
 '0 0

In [13]:
len(region_s_d_unique)

138505

In [14]:
df = df.reset_index()

In [15]:
df = df.drop(['index'], axis=1)
df

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg,reg_s_d
0,railwaypgkpgknewyear:)happy,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,106.0,2974,1323,0,0,0,0,18.916667,5.603524,0 0 106.0
1,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2021,5,17,1,17,2261.0,4.0,0,...,106.0,2974,1323,0,0,0,0,31.766667,3.336831,0 0 106.0
2,happyrailwaynewyear:)tutu,newyearhappyrailway:)hack,2021,11,47,27,18,2261.0,4.0,0,...,106.0,1099,1099,0,0,0,0,28.450000,3.725835,0 0 106.0
3,pgk:)newyearpgk:)hack,pgkhack2022pgk:):),2022,3,9,6,6,2261.0,3.0,0,...,106.0,2974,1323,0,0,0,0,18.916667,5.603524,0 0 106.0
4,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,106.0,2974,1323,0,0,0,0,69.966667,1.515007,0 0 106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4139686,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,25,17,2261.0,4.0,0,...,3650.0,17401,1259,17,17,99,70,525.916667,6.940263,99 70 3650.0
4139687,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,24,11,16,2261.0,4.0,0,...,3650.0,17401,2974,17,17,99,70,330.500000,11.043873,99 70 3650.0
4139688,wagonhappyrailwayhack:)tutu,newyearwagondigitaltutu:)pgk,2020,7,28,12,17,2261.0,4.0,0,...,3650.0,17401,1259,17,17,99,70,323.383333,11.286914,99 70 3650.0
4139689,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,22,16,2261.0,4.0,0,...,3650.0,17401,1259,17,17,99,70,289.116667,12.624661,99 70 3650.0


In [16]:
df['bad'] = False

In [17]:
avg_speed = {}

up_ind = 0
for ind in tqdm(range(1, df.shape[0])):
    if df.iloc[ind-1]['reg_s_d'] != df.iloc[ind]['reg_s_d']:
        down_ind = ind
        t = df[up_ind: down_ind]
        reg_s_d = df.iloc[ind-1]['reg_s_d']
        try:
            filtered_t = t[(t.y <= np.percentile(t.y,95))&(t.y >= np.percentile(t.y,5))]

            if filtered_t.shape[0] > 0:
                y_test = filtered_t['y']
                speed = filtered_t['avg'].mean()
                avg_speed[reg_s_d] = speed
            else:
                avg_speed[reg_s_d] = t['avg'].mean()

            df[up_ind: down_ind].loc[((t.y > np.percentile(t.y,95))|(t.y < np.percentile(t.y,5))), 'bad'] = True
        except:
            avg_speed[reg_s_d] = t['avg'].mean()
        up_ind = ind

100%|██████████| 4139690/4139690 [05:40<00:00, 12167.79it/s]


In [18]:
df[df['bad']==False]

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg,reg_s_d,bad
0,railwaypgkpgknewyear:)happy,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,2974,1323,0,0,0,0,18.916667,5.603524,0 0 106.0,False
1,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2021,5,17,1,17,2261.0,4.0,0,...,2974,1323,0,0,0,0,31.766667,3.336831,0 0 106.0,False
2,happyrailwaynewyear:)tutu,newyearhappyrailway:)hack,2021,11,47,27,18,2261.0,4.0,0,...,1099,1099,0,0,0,0,28.450000,3.725835,0 0 106.0,False
3,pgk:)newyearpgk:)hack,pgkhack2022pgk:):),2022,3,9,6,6,2261.0,3.0,0,...,2974,1323,0,0,0,0,18.916667,5.603524,0 0 106.0,False
5,newyearnewyear2022wagon:)happy,newyeartutuhappy2022:)newyear,2021,11,47,27,18,2261.0,4.0,0,...,1099,1099,0,0,0,0,28.450000,3.725835,0 0 106.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4139685,wagonhappy:)digital:)wagon,wagonrailwaywagon:):)hack,2020,7,28,12,17,2261.0,4.0,0,...,17401,1259,17,17,99,70,323.383333,11.286914,99 70 3650.0,False
4139687,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,24,11,16,2261.0,4.0,0,...,17401,2974,17,17,99,70,330.500000,11.043873,99 70 3650.0,False
4139688,wagonhappyrailwayhack:)tutu,newyearwagondigitaltutu:)pgk,2020,7,28,12,17,2261.0,4.0,0,...,17401,1259,17,17,99,70,323.383333,11.286914,99 70 3650.0,False
4139689,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,22,16,2261.0,4.0,0,...,17401,1259,17,17,99,70,289.116667,12.624661,99 70 3650.0,False


In [19]:
df[df['bad']==True]


Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg,reg_s_d,bad
4,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,2974,1323,0,0,0,0,69.966667,1.515007,0 0 106.0,True
7,newyearnewyearnewyear:):)digital,newyearnewyear2022wagon:)happy,2021,10,42,21,13,2261.0,4.0,0,...,2974,2678,0,0,0,0,11.766667,9.008499,0 0 106.0,True
16,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2021,5,21,30,14,2261.0,4.0,0,...,2974,3406,0,0,0,0,2.883333,37.803468,0 0 109.0,True
24,wagonpgk2022:)pgk,wagonhackrailway:)pgk,2020,11,46,10,10,2261.0,4.0,0,...,2974,3406,0,0,0,0,38.150000,3.119266,0 0 119.0,True
26,wagonpgk2022:)pgk,wagonhackrailway:)pgk,2020,6,26,22,21,2261.0,3.0,0,...,2974,3406,0,0,0,0,2.000000,59.500000,0 0 119.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4139668,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,25,18,16,2261.0,4.0,0,...,17401,1259,17,17,99,70,380.583333,8.676155,99 70 3302.0,True
4139673,newyear20222022happy:):),wagonhackpgk:):)happy,2020,7,30,25,17,2261.0,4.0,0,...,17401,2974,17,17,99,70,239.533333,14.770387,99 70 3538.0,True
4139674,newyear20222022happy:):),wagonhackpgk:):)happy,2020,7,28,12,17,2261.0,4.0,0,...,17401,1259,17,17,99,70,423.833333,8.347621,99 70 3538.0,True
4139677,newyear20222022happy:):),wagonhackpgk:):)happy,2020,7,30,25,17,2261.0,4.0,0,...,17401,2974,17,17,99,70,239.533333,15.237963,99 70 3650.0,True


In [25]:
avg_speed['99 74 4776.0'] = 11.894404781670262

In [26]:
avg_speed

{'0 0 106.0': 4.393731221830636,
 '0 0 109.0': 34.63699873360861,
 '0 0 118.0': 2.438856355494316,
 '0 0 119.0': 32.05105770964266,
 '0 0 122.0': 36.96969696969697,
 '0 0 125.0': 4.051002053560627,
 '0 0 133.0': 7.330647745926734,
 '0 0 146.0': 4.474810242407672,
 '0 0 149.0': 1.550737207285343,
 '0 0 159.0': 3.233898305084746,
 '0 0 160.0': 10.0418410041841,
 '0 0 164.0': 36.359653948369235,
 '0 0 166.0': 43.493449781659386,
 '0 0 168.0': 5.328566433566434,
 '0 0 17.0': 29.610890180483327,
 '0 0 172.0': 7.277856135401975,
 '0 0 178.0': 7.644953471725125,
 '0 0 184.0': 12.920480741254671,
 '0 0 185.0': 2.781257830117765,
 '0 0 187.0': 5.10611972020715,
 '0 0 19.0': 45.6,
 '0 0 190.0': 9.836065573770494,
 '0 0 193.0': 34.87827112827113,
 '0 0 197.0': 6.324237560192617,
 '0 0 199.0': 3.1385703403563294,
 '0 0 202.0': 5.45105563255935,
 '0 0 206.0': 34.499772925166525,
 '0 0 213.0': 8.073236644504913,
 '0 0 214.0': 8.396608638206985,
 '0 0 221.0': 14.617059495287215,
 '0 0 23.0': 40.54359

In [27]:
df.tail()

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg,reg_s_d,bad,avg_speed
4139686,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,25,17,2261.0,4.0,0,...,1259,17,17,99,70,525.916667,6.940263,99 70 3650.0,True,11.384481
4139687,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,24,11,16,2261.0,4.0,0,...,2974,17,17,99,70,330.5,11.043873,99 70 3650.0,False,11.384481
4139688,wagonhappyrailwayhack:)tutu,newyearwagondigitaltutu:)pgk,2020,7,28,12,17,2261.0,4.0,0,...,1259,17,17,99,70,323.383333,11.286914,99 70 3650.0,False,11.384481
4139689,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,22,16,2261.0,4.0,0,...,1259,17,17,99,70,289.116667,12.624661,99 70 3650.0,False,11.384481
4139690,wagonrailwayhackrailway:)wagon,wagonrailwaypgkwagon:)pgk,2021,3,12,24,11,2261.0,4.0,0,...,1350,17,17,99,74,401.533333,11.894405,99 74 4776.0,False,


In [28]:
# set avg speed in segment (station A to station B)

def set_speed(val):
    if val in avg_speed:
        return avg_speed[val]
    return None

df['avg_speed'] = df['reg_s_d'].apply(set_speed)

In [29]:
df.head()

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg,reg_s_d,bad,avg_speed
0,railwaypgkpgknewyear:)happy,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,1323,0,0,0,0,18.916667,5.603524,0 0 106.0,False,4.393731
1,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2021,5,17,1,17,2261.0,4.0,0,...,1323,0,0,0,0,31.766667,3.336831,0 0 106.0,False,4.393731
2,happyrailwaynewyear:)tutu,newyearhappyrailway:)hack,2021,11,47,27,18,2261.0,4.0,0,...,1099,0,0,0,0,28.45,3.725835,0 0 106.0,False,4.393731
3,pgk:)newyearpgk:)hack,pgkhack2022pgk:):),2022,3,9,6,6,2261.0,3.0,0,...,1323,0,0,0,0,18.916667,5.603524,0 0 106.0,False,4.393731
4,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,1323,0,0,0,0,69.966667,1.515007,0 0 106.0,True,4.393731


In [30]:
df.tail()

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg,reg_s_d,bad,avg_speed
4139686,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,25,17,2261.0,4.0,0,...,1259,17,17,99,70,525.916667,6.940263,99 70 3650.0,True,11.384481
4139687,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,24,11,16,2261.0,4.0,0,...,2974,17,17,99,70,330.5,11.043873,99 70 3650.0,False,11.384481
4139688,wagonhappyrailwayhack:)tutu,newyearwagondigitaltutu:)pgk,2020,7,28,12,17,2261.0,4.0,0,...,1259,17,17,99,70,323.383333,11.286914,99 70 3650.0,False,11.384481
4139689,newyear20222022happy:):),wagonhackpgk:):)happy,2020,6,26,22,16,2261.0,4.0,0,...,1259,17,17,99,70,289.116667,12.624661,99 70 3650.0,False,11.384481
4139690,wagonrailwayhackrailway:)wagon,wagonrailwaypgkwagon:)pgk,2021,3,12,24,11,2261.0,4.0,0,...,1350,17,17,99,74,401.533333,11.894405,99 74 4776.0,False,11.894405


In [31]:
df['predictions'] = df['distance'] / df['avg_speed']

In [32]:
df.head(100)

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,y,avg,reg_s_d,bad,avg_speed,predictions
0,railwaypgkpgknewyear:)happy,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,0,0,0,0,18.916667,5.603524,0 0 106.0,False,4.393731,24.125281
1,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2021,5,17,1,17,2261.0,4.0,0,...,0,0,0,0,31.766667,3.336831,0 0 106.0,False,4.393731,24.125281
2,happyrailwaynewyear:)tutu,newyearhappyrailway:)hack,2021,11,47,27,18,2261.0,4.0,0,...,0,0,0,0,28.450000,3.725835,0 0 106.0,False,4.393731,24.125281
3,pgk:)newyearpgk:)hack,pgkhack2022pgk:):),2022,3,9,6,6,2261.0,3.0,0,...,0,0,0,0,18.916667,5.603524,0 0 106.0,False,4.393731,24.125281
4,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2022,3,9,6,6,2261.0,3.0,0,...,0,0,0,0,69.966667,1.515007,0 0 106.0,True,4.393731,24.125281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,happy:)pgkwagon:):),pgknewyeartutudigital:)wagon,2021,1,2,11,14,2261.0,3.0,0,...,0,0,0,0,18.841111,7.059032,0 0 133.0,False,7.330648,18.143008
96,newyeartutu:)hack:)newyear,newyearnewyeartutu:):)2022,2021,5,17,2,8,2261.0,4.0,0,...,0,0,0,0,30.333333,4.384615,0 0 133.0,False,7.330648,18.143008
97,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2021,5,19,12,19,2261.0,4.0,0,...,0,0,0,0,13.433333,9.900744,0 0 133.0,False,7.330648,18.143008
98,happy:)pgkwagon:):),newyearwagondigitalnewyear:)tutu,2022,3,13,28,8,2261.0,4.0,0,...,0,0,0,0,217.266667,0.612151,0 0 133.0,True,7.330648,18.143008


In [33]:
cleared_df = df[df['bad']==False]

In [34]:
print(mean_squared_error(cleared_df['predictions'], cleared_df['y'], squared = False))

43.320459146527945


In [35]:
cleared_df.to_csv('data/train_with_avg_road_speed_filtered.csv')

# Predict test

In [49]:
df_test = pd.read_csv('data/test_with_nan.csv')

In [50]:
df_test['reg_s_d'] = df_test.apply(join_s_d, axis=1)

In [51]:
df_test.head(5)

Unnamed: 0.1,Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions
0,0,happy:)pgkwagon:):),pgk2022newyear:)pgk,2022,9,35,1,9,2261.0,3.0,...,2930.0,2974,2847,28,28,134,125,134 125 2930.0,18.085222,162.01073
1,1,2022newyear:)newyear:):),hacktutu:)newyear:)2022,2022,8,34,22,14,1653.0,4.0,...,1728.0,2957,6340,18,18,102,16,102 16 1728.0,13.400721,128.948285
2,2,happyrailwaytutu:)pgk,newyeartutu:)hack:)newyear,2022,5,21,26,16,643.0,3.0,...,4393.0,1664,2128,0,0,1,124,1 124 4393.0,22.343906,196.608414
3,3,2022tuturailwayhack:):),2022newyear:)happy:)pgk,2022,11,44,4,4,2261.0,4.0,...,1111.0,2974,2468,32,32,29,31,29 31 1111.0,16.31429,68.099805
4,4,happyrailwaytutu:)pgk,pgk2022newyear:)pgk,2022,5,20,17,6,2261.0,3.0,...,19.0,2974,1098,29,29,117,117,117 117 19.0,0.197711,96.099722


In [52]:
def set_speed(val):
    if val in avg_speed:
        return avg_speed[val]
    return 16.036287769360637

df_test['avg_speed'] = df_test['reg_s_d'].apply(set_speed)

In [53]:
df_test[df_test['avg_speed'] > 0]

Unnamed: 0.1,Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions
0,0,happy:)pgkwagon:):),pgk2022newyear:)pgk,2022,9,35,1,9,2261.0,3.0,...,2930.0,2974,2847,28,28,134,125,134 125 2930.0,17.507887,162.010730
1,1,2022newyear:)newyear:):),hacktutu:)newyear:)2022,2022,8,34,22,14,1653.0,4.0,...,1728.0,2957,6340,18,18,102,16,102 16 1728.0,13.490347,128.948285
2,2,happyrailwaytutu:)pgk,newyeartutu:)hack:)newyear,2022,5,21,26,16,643.0,3.0,...,4393.0,1664,2128,0,0,1,124,1 124 4393.0,22.328235,196.608414
3,3,2022tuturailwayhack:):),2022newyear:)happy:)pgk,2022,11,44,4,4,2261.0,4.0,...,1111.0,2974,2468,32,32,29,31,29 31 1111.0,16.459609,68.099805
4,4,happyrailwaytutu:)pgk,pgk2022newyear:)pgk,2022,5,20,17,6,2261.0,3.0,...,19.0,2974,1098,29,29,117,117,117 117 19.0,0.197711,96.099722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182898,1182898,pgknewyeardigitalhack:)hack,hack:)tutu:)railway,2022,6,26,27,19,539.0,4.0,...,1627.0,1416,14179,32,32,123,3,123 3 1627.0,8.405567,187.920563
1182899,1182899,tutupgkdigitalrailway:)hack,happyrailwaytutu:)pgk,2022,9,39,27,16,1677.0,3.0,...,5096.0,2129,1664,27,27,125,1,125 1 5096.0,20.245887,254.821252
1182900,1182900,digitalhappynewyeardigital:)wagon,railwaytutupgkhappy:)newyear,2022,7,30,27,23,618.0,1.0,...,276.0,1316,2082,9,9,67,67,67 67 276.0,45.651310,5.994804
1182901,1182901,hacknewyear:)pgk:)happy,hacktutupgkwagon:)railway,2022,7,30,26,14,100056.0,4.0,...,408.0,2974,3392,0,0,1,0,1 0 408.0,9.764659,41.783333


In [54]:
df_test.shape

(1182903, 24)

In [55]:
df['avg_speed'].mean()

16.050409913475267

In [56]:
df_test['predictions'] = df_test['distance'] / df_test['avg_speed']


In [57]:
df_test['predictions'].mean()

85.22917332680507

In [58]:
df_test['predictions'].min()


0.0

In [59]:
df_test[df_test['predictions'] == df_test['predictions'].min()]

Unnamed: 0.1,Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions
367,367,happy:):):)wagon,2022hackdigital:):),2022,8,33,17,10,2261.0,3.0,...,0.0,2974,1983,21,21,111,111,111 111 0.0,16.036288,0.0
747,747,tutu2022digitaldigital:)2022,newyearwagonhappynewyear:)wagon,2022,9,36,9,1,2261.0,3.0,...,0.0,2974,1356,32,32,29,29,29 29 0.0,16.036288,0.0
1468,1468,newyearhappypgkdigital:)happy,newyearwagonrailwayhack:)wagon,2022,9,39,30,23,2261.0,3.0,...,0.0,2974,1356,32,32,29,29,29 29 0.0,16.036288,0.0
1600,1600,happy:)railwayhack:):),happy:)2022railway:):),2022,11,45,7,7,2261.0,3.0,...,0.0,2974,1356,32,32,29,29,29 29 0.0,16.036288,0.0
2459,2459,railwaydigitalhappyhappy:)hack,railwaypgktutunewyear:)pgk,2022,11,47,25,17,100056.0,3.0,...,0.0,2974,1316,9,9,67,67,67 67 0.0,16.036288,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1177666,1177666,pgk:):)pgk:)2022,hackrailway:)railway:)tutu,2022,11,45,7,11,2261.0,4.0,...,0.0,2974,1356,32,32,29,29,29 29 0.0,16.036288,0.0
1177680,1177680,tutunewyearpgkhappy:):),tutunewyearpgkpgk:)wagon,2022,10,42,21,6,2261.0,3.0,...,0.0,2974,1356,32,32,29,29,29 29 0.0,16.036288,0.0
1177770,1177770,railwayhack:)digital:)hack,railway:):):)tutu,2022,10,42,18,20,100056.0,4.0,...,0.0,2974,8961,32,32,29,29,29 29 0.0,16.036288,0.0
1179666,1179666,tutunewyear2022happy:)happy,tutunewyear2022:):)pgk,2022,11,47,22,10,100056.0,4.0,...,0.0,2974,8852,9,9,67,67,67 67 0.0,16.036288,0.0


In [60]:
df_test['distance'].min()


0.0

In [201]:
df['y'].max()

10936.768055555556

In [61]:
sol = df_test[['predictions']]

In [62]:
sol.columns = ['time']

In [68]:
sol['time'].mean()

85.22917332680507

In [69]:
sol.to_csv('data/solution_5.csv', index=False)

In [131]:
df_test.shape

(1182903, 23)

In [85]:
df_test = df_test.fillna(np.nan)

In [86]:
df_test[df_test['distance'].isna()]


Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions
67,newyear:)hackpgk:)newyear,newyear:)railwaydigital:)wagon,2022,10,44,31,15,2261.0,4.0,0,...,,2974,2942,21,21,121,121,121 121 nan,16.036288,
123,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2022,11,45,12,12,2261.0,4.0,0,...,,2974,1356,32,32,29,29,29 29 nan,16.036288,
367,happy:):):)wagon,2022hackdigital:):),2022,8,33,17,10,2261.0,3.0,0,...,,2974,1983,21,21,111,111,111 111 nan,16.036288,
729,tutunewyearrailwaytutu:)wagon,tutunewyearwagonhack:)happy,2022,10,40,5,2,100056.0,3.0,0,...,,2974,3876,0,0,3,3,3 3 nan,16.036288,
747,tutu2022digitaldigital:)2022,newyearwagonhappynewyear:)wagon,2022,9,36,9,1,2261.0,3.0,0,...,,2974,1356,32,32,29,29,29 29 nan,16.036288,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182288,happy:)pgkwagon:):),pgknewyeartutudigital:)wagon,2022,10,44,31,15,2261.0,3.0,0,...,,8328,1293,30,30,19,19,19 19 nan,16.036288,
1182329,2022:):)hack:):),newyeardigitalhappydigital:)newyear,2022,10,43,25,19,2261.0,3.0,0,...,,2974,1664,0,0,1,1,1 1 nan,16.036288,
1182362,railwayhappyhappyhack:)pgk,newyearwagondigitalnewyear:)tutu,2022,10,43,29,11,2261.0,4.0,0,...,,2974,3102,21,21,111,111,111 111 nan,16.036288,
1182473,newyearwagonhappypgk:)digital,newyearwagon:)pgk:)wagon,2022,9,36,11,8,2261.0,3.0,0,...,,2974,1664,0,0,1,1,1 1 nan,16.036288,


In [31]:
df_with_nan = df_test[df_test['distance'].isna()]

In [32]:
dict_distance = {}

for ind in range(df_with_nan.shape[0]):
    row = df_with_nan.iloc[ind]
    dict_distance[f"{row['st_code_snd']}+{row['st_code_rsv']}"] = 0

In [33]:
for one in tqdm(dict_distance):
    st_code_snd, st_code_rsv = one.split('+')
    try:
        val = cleared_df[(cleared_df['st_code_snd'] == st_code_snd)&(cleared_df['st_code_rsv'] == st_code_rsv)]['distance'].value_counts().idxmax()
        dict_distance[one] = val
    except:
        pass

100%|██████████| 3810/3810 [16:13<00:00,  3.91it/s]


In [88]:
df_with_nan = df_test[df_test['distance'].isna()]
df_with_nan

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions
67,newyear:)hackpgk:)newyear,newyear:)railwaydigital:)wagon,2022,10,44,31,15,2261.0,4.0,0,...,,2974,2942,21,21,121,121,121 121 nan,16.036288,
123,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2022,11,45,12,12,2261.0,4.0,0,...,,2974,1356,32,32,29,29,29 29 nan,16.036288,
367,happy:):):)wagon,2022hackdigital:):),2022,8,33,17,10,2261.0,3.0,0,...,,2974,1983,21,21,111,111,111 111 nan,16.036288,
729,tutunewyearrailwaytutu:)wagon,tutunewyearwagonhack:)happy,2022,10,40,5,2,100056.0,3.0,0,...,,2974,3876,0,0,3,3,3 3 nan,16.036288,
747,tutu2022digitaldigital:)2022,newyearwagonhappynewyear:)wagon,2022,9,36,9,1,2261.0,3.0,0,...,,2974,1356,32,32,29,29,29 29 nan,16.036288,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182288,happy:)pgkwagon:):),pgknewyeartutudigital:)wagon,2022,10,44,31,15,2261.0,3.0,0,...,,8328,1293,30,30,19,19,19 19 nan,16.036288,
1182329,2022:):)hack:):),newyeardigitalhappydigital:)newyear,2022,10,43,25,19,2261.0,3.0,0,...,,2974,1664,0,0,1,1,1 1 nan,16.036288,
1182362,railwayhappyhappyhack:)pgk,newyearwagondigitalnewyear:)tutu,2022,10,43,29,11,2261.0,4.0,0,...,,2974,3102,21,21,111,111,111 111 nan,16.036288,
1182473,newyearwagonhappypgk:)digital,newyearwagon:)pgk:)wagon,2022,9,36,11,8,2261.0,3.0,0,...,,2974,1664,0,0,1,1,1 1 nan,16.036288,


In [120]:
def set_distance(row):
    if row['distance'] == 0:
        return 1441.4721009370023
    if not np.isnan(row['distance']) :
        return row['distance']

    key = f"{row['st_code_snd']}+{row['st_code_rsv']}"
    if key in dict_distance:
        return dict_distance[key]
    else:
        return 1441.4721009370023

In [121]:
df_test['distance'] = df_test.apply(set_distance, axis=1)


In [122]:
df_with_nan = df_test[df_test['distance'].isna()]
df_with_nan

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions


In [70]:
dict_distance

{'newyear:)hackpgk:)newyear+newyear:)railwaydigital:)wagon': 199.0,
 'railwaytutupgkhappy:)newyear+digitalhappynewyeardigital:)wagon': 276.0,
 'happy:):):)wagon+2022hackdigital:):)': 0,
 'tutunewyearrailwaytutu:)wagon+tutunewyearwagonhack:)happy': 5.0,
 'tutu2022digitaldigital:)2022+newyearwagonhappynewyear:)wagon': 0,
 '20222022happypgk:)railway+20222022railway2022:)tutu': 276.0,
 'newyearwagondigitalpgk:)2022+newyearwagonpgkhack:)newyear': 48.0,
 'pgkpgkrailwayhappy:):)+pgkpgktutu2022:)wagon': 230.0,
 '20222022railway:):)happy+20222022railwaydigital:)newyear': 33.0,
 'railwayhackhackrailway:)newyear+digitalhappynewyeardigital:)wagon': 925.0,
 'happy:)pgkrailway:)2022+hackwagonhappy:)newyear': 1528.0,
 'railwayhappyhappyhack:)pgk+railwaypgkrailwaydigital:)2022': 152.0,
 'tutunewyearrailway2022:)pgk+tutunewyeardigitalrailway:)pgk': 21.0,
 'newyearhappypgkdigital:)happy+newyearwagonrailwayhack:)wagon': 0,
 'happy:)pgkwagon:):)+pgknewyeartutudigital:)wagon': 1886.0,
 'happy:)railwayhack:

In [93]:
df_with_nan['distance'] = df_with_nan.apply(set_distance, axis=1)

In [94]:
df_with_nan

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions
67,newyear:)hackpgk:)newyear,newyear:)railwaydigital:)wagon,2022,10,44,31,15,2261.0,4.0,0,...,199.0,2974,2942,21,21,121,121,121 121 nan,16.036288,
123,railwaytutupgkhappy:)newyear,digitalhappynewyeardigital:)wagon,2022,11,45,12,12,2261.0,4.0,0,...,276.0,2974,1356,32,32,29,29,29 29 nan,16.036288,
367,happy:):):)wagon,2022hackdigital:):),2022,8,33,17,10,2261.0,3.0,0,...,0.0,2974,1983,21,21,111,111,111 111 nan,16.036288,
729,tutunewyearrailwaytutu:)wagon,tutunewyearwagonhack:)happy,2022,10,40,5,2,100056.0,3.0,0,...,5.0,2974,3876,0,0,3,3,3 3 nan,16.036288,
747,tutu2022digitaldigital:)2022,newyearwagonhappynewyear:)wagon,2022,9,36,9,1,2261.0,3.0,0,...,0.0,2974,1356,32,32,29,29,29 29 nan,16.036288,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182288,happy:)pgkwagon:):),pgknewyeartutudigital:)wagon,2022,10,44,31,15,2261.0,3.0,0,...,1886.0,8328,1293,30,30,19,19,19 19 nan,16.036288,
1182329,2022:):)hack:):),newyeardigitalhappydigital:)newyear,2022,10,43,25,19,2261.0,3.0,0,...,1179.0,2974,1664,0,0,1,1,1 1 nan,16.036288,
1182362,railwayhappyhappyhack:)pgk,newyearwagondigitalnewyear:)tutu,2022,10,43,29,11,2261.0,4.0,0,...,5.0,2974,3102,21,21,111,111,111 111 nan,16.036288,
1182473,newyearwagonhappypgk:)digital,newyearwagon:)pgk:)wagon,2022,9,36,11,8,2261.0,3.0,0,...,31.0,2974,1664,0,0,1,1,1 1 nan,16.036288,


In [96]:
df_test

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions
0,happy:)pgkwagon:):),pgk2022newyear:)pgk,2022,9,35,1,9,2261.0,3.0,0,...,2930.0,2974,2847,28,28,134,125,134 125 2930.0,18.085222,162.010730
1,2022newyear:)newyear:):),hacktutu:)newyear:)2022,2022,8,34,22,14,1653.0,4.0,1,...,1728.0,2957,6340,18,18,102,16,102 16 1728.0,13.400721,128.948285
2,happyrailwaytutu:)pgk,newyeartutu:)hack:)newyear,2022,5,21,26,16,643.0,3.0,1,...,4393.0,1664,2128,0,0,1,124,1 124 4393.0,22.343906,196.608414
3,2022tuturailwayhack:):),2022newyear:)happy:)pgk,2022,11,44,4,4,2261.0,4.0,0,...,1111.0,2974,2468,32,32,29,31,29 31 1111.0,16.314290,68.099805
4,happyrailwaytutu:)pgk,pgk2022newyear:)pgk,2022,5,20,17,6,2261.0,3.0,0,...,19.0,2974,1098,29,29,117,117,117 117 19.0,0.197711,96.099722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182898,pgknewyeardigitalhack:)hack,hack:)tutu:)railway,2022,6,26,27,19,539.0,4.0,1,...,1627.0,1416,14179,32,32,123,3,123 3 1627.0,8.657914,187.920563
1182899,tutupgkdigitalrailway:)hack,happyrailwaytutu:)pgk,2022,9,39,27,16,1677.0,3.0,1,...,5096.0,2129,1664,27,27,125,1,125 1 5096.0,19.998332,254.821252
1182900,digitalhappynewyeardigital:)wagon,railwaytutupgkhappy:)newyear,2022,7,30,27,23,618.0,1.0,1,...,276.0,1316,2082,9,9,67,67,67 67 276.0,46.039868,5.994804
1182901,hacknewyear:)pgk:)happy,hacktutupgkwagon:)railway,2022,7,30,26,14,100056.0,4.0,0,...,408.0,2974,3392,0,0,1,0,1 0 408.0,9.764659,41.783333


In [97]:
df_with_nan = df_test[df_test['distance'].isna()]
df_with_nan

Unnamed: 0,st_code_snd,st_code_rsv,date_depart_year,date_depart_month,date_depart_week,date_depart_day,date_depart_hour,fr_id,route_type,is_load,...,distance,snd_org_id,rsv_org_id,snd_roadid,rsv_roadid,snd_dp_id,rsv_dp_id,reg_s_d,avg_speed,predictions


In [98]:
df_test.to_csv('data/test_with_nan.csv')

In [118]:
df['distance'].mean()

1441.4721009370023