In [1]:
import pandas as pd

In [5]:
df_prices = pd.read_csv("price.csv")

df_test = pd.read_csv("test.csv")

## Price dataset

In [3]:
df_prices

Unnamed: 0,City Code,Validfrom,Product,CRP
0,116,2016-07-27,Benzin,4.46
1,116,2016-07-28,Benzin,4.46
2,116,2016-07-29,Benzin,4.46
3,116,2016-07-30,Benzin,4.40
4,116,2016-07-31,Benzin,4.40
...,...,...,...,...
311739,990,2019-12-27,Motorin,6.74
311740,990,2019-12-28,Motorin,6.74
311741,990,2019-12-29,Motorin,6.74
311742,990,2019-12-30,Motorin,6.74


Sort by date

In [7]:
df_prices.sort_values(by="Validfrom", inplace=True)

Keep only one price data per week ( every sunday)
- 05/01/2014 (first sunday), 29/12/2019 (last sunday)
- I will just delete all non-sunday prices, alternatively we could have taken an average 

In [13]:
# Convert string dates to datetime objects
df_prices['Validfrom']= pd.to_datetime(df_prices['Validfrom'])


65110    2014-01-01
7729     2014-01-01
73761    2014-01-01
99716    2014-01-01
253425   2014-01-01
            ...    
201132   2019-12-31
198979   2019-12-31
196817   2019-12-31
229632   2019-12-31
311743   2019-12-31
Name: Validfrom, Length: 311744, dtype: datetime64[ns]

In [25]:
# Keep only prices on sundays
df_prices = df_prices[df_prices.Validfrom.dt.dayofweek == 6]
df_prices

Unnamed: 0,City Code,Validfrom,Product,CRP
18546,218,2014-01-05,Benzin,5.10
127759,884,2014-01-05,Benzin,5.07
216661,532,2014-01-05,Motorin,4.33
270732,812,2014-01-05,Motorin,4.33
255592,722,2014-01-05,Motorin,4.39
...,...,...,...,...
242607,657,2019-12-29,Motorin,6.83
209782,475,2019-12-29,Motorin,6.70
165761,172,2019-12-29,Motorin,6.79
296600,936,2019-12-29,Motorin,6.65


Rename columns and reindex 

In [28]:
df_prices.columns=["City","Date","Product","CRP"]
df_prices.reset_index(inplace=True)


In [34]:
df_prices.head()

Unnamed: 0,index,City,Date,Product,CRP
0,18546,218,2014-01-05,Benzin,5.1
1,127759,884,2014-01-05,Benzin,5.07
2,216661,532,2014-01-05,Motorin,4.33
3,270732,812,2014-01-05,Motorin,4.33
4,255592,722,2014-01-05,Motorin,4.39


## Train dataset

In [93]:
df_train = pd.read_csv("train.csv")

In [94]:
df_train.head()

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0


- Sum up all orders for a week
    I want to have a single entry summing up the orders placed for each week
    Will sum up orders for the same fuel across different plants

In [95]:
# rename columns for ease
df_train.columns=["Customer", "Area", "City", "Date", "Product", "Plant", "Litres"]

# convert to datetime, subtract 7 days so we can get the total for the following week
df_train['Date'] = pd.to_datetime(df_train['Date']) - pd.to_timedelta(7, unit='d')
# Sort
df_train.sort_values(by=["Date", "Customer", "Product"], inplace=True)

In [96]:
df_train.head()

Unnamed: 0,Customer,Area,City,Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2013-12-25,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2013-12-25,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2013-12-25,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2013-12-25,Motorin,T155,0.0
2283,10048264,Karadeniz Bölgesi,883,2013-12-25,Motorin,T105,0.0


- Below I sum up orders across week, and drop the city and plant code

In [97]:
df_t = df_train.groupby(['Customer', "Area","Product", pd.Grouper(key='Date', freq='W-MON')])['Litres'].sum().reset_index().sort_values('Date')

In [102]:
#sort values to inspect
df_t.sort_values(by=["Date","Customer", "Product"], inplace=True)

In [103]:
df_t.head(20)

Unnamed: 0,Customer,Area,Product,Date,Litres
0,10048261,Karadeniz Bölgesi,Benzin,2013-12-30,5040.0
256,10048261,Karadeniz Bölgesi,Motorin,2013-12-30,28712.0
685,10048263,Karadeniz Bölgesi,Motorin,2013-12-30,34082.0
899,10048264,Karadeniz Bölgesi,Benzin,2013-12-30,3608.0
1069,10048264,Karadeniz Bölgesi,Motorin,2013-12-30,21325.0
1283,10048265,Karadeniz Bölgesi,Benzin,2013-12-30,0.0
1454,10048265,Karadeniz Bölgesi,Motorin,2013-12-30,0.0
1677,10048789,Karadeniz Bölgesi,Benzin,2013-12-30,0.0
1816,10048789,Karadeniz Bölgesi,Motorin,2013-12-30,0.0
1998,10048792,Marmara Bölgesi,Benzin,2013-12-30,27215.0


In [106]:
# Remove first week because the data for it is incomplete
df_t = df_t[df_t["Date"] != "2013-12-30" ]

In [109]:
# Rename date column to clarify meaning
df_t.rename(columns={"Date":"WeekStarting"}, inplace=True)

In [111]:
# reassign to original df
df_train = df_t.copy()

In [112]:
df_train

Unnamed: 0,Customer,Area,Product,WeekStarting,Litres
1,10048261,Karadeniz Bölgesi,Benzin,2014-01-06,4242.0
257,10048261,Karadeniz Bölgesi,Motorin,2014-01-06,29017.0
514,10048263,Karadeniz Bölgesi,Benzin,2014-01-06,4572.0
686,10048263,Karadeniz Bölgesi,Motorin,2014-01-06,30681.0
1070,10048264,Karadeniz Bölgesi,Motorin,2014-01-06,25009.0
...,...,...,...,...,...
480614,12663274,Doğu Anadolu Bölgesi,Motorin,2018-12-24,17102.0
480617,12663344,Doğu Anadolu Bölgesi,Benzin,2018-12-24,11111.0
480620,12663344,Doğu Anadolu Bölgesi,Motorin,2018-12-24,22882.0
480621,12664221,Marmara Bölgesi,Benzin,2018-12-24,6528.0
