In [1]:
import pandas as pd

In [2]:
df_prices = pd.read_csv("price.csv")


In [3]:
df_prices.columns=["City","Date","Product","CRP"]
df_prices.reset_index(inplace=True)
df_prices.drop(labels="index",axis=1,inplace=True)

## Ready Price Dataset for merge

In [4]:
# Convert string dates to datetime objects
df_prices['Date']= pd.to_datetime(df_prices['Date'])
df_prices.sort_values(by="Date", inplace=True)

df_prices.columns=["City","Date","Product","CRP"]
df_prices.reset_index(inplace=True)
df_prices.drop(labels="index",axis=1,inplace=True)


In [5]:
## Add one day so we can merge 
df_prices["Date"] = df_prices["Date"] + pd.to_timedelta(1, unit='d')

## Train dataset

In [6]:
df_train = pd.read_csv("train.csv")

In [7]:
df_train.head()

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0


- Sum up all orders for a week
    I want to have a single entry summing up the orders placed for each week
    Will sum up orders for the same fuel across different plants

In [8]:
# rename columns for ease
df_train.columns=["Customer", "Area", "City", "Date", "Product", "Plant", "Litres"]

# convert to datetime, subtract 7 days so we can get the total for the following week
df_train['Date'] = pd.to_datetime(df_train['Date']) - pd.to_timedelta(7, unit='d')
# Sort
df_train.sort_values(by=["Date", "Customer", "Product"], inplace=True)

In [9]:
df_train.head()

Unnamed: 0,Customer,Area,City,Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2013-12-25,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2013-12-25,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2013-12-25,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2013-12-25,Motorin,T155,0.0
2283,10048264,Karadeniz Bölgesi,883,2013-12-25,Motorin,T105,0.0


- Below I sum up orders across week, and drop the plant code

In [10]:
df_t = df_train.groupby(['Customer', "Area","City","Product", pd.Grouper(key='Date', freq='W-MON')])['Litres'].sum().reset_index().sort_values('Date')

In [11]:
#sort values to inspect
df_t.sort_values(by=["Date","Customer", "Product"], inplace=True)

In [12]:
df_t.head(5)

Unnamed: 0,Customer,Area,City,Product,Date,Litres
0,10048261,Karadeniz Bölgesi,883,Benzin,2013-12-30,5040.0
256,10048261,Karadeniz Bölgesi,883,Motorin,2013-12-30,28712.0
685,10048263,Karadeniz Bölgesi,966,Motorin,2013-12-30,34082.0
899,10048264,Karadeniz Bölgesi,883,Benzin,2013-12-30,3608.0
1069,10048264,Karadeniz Bölgesi,883,Motorin,2013-12-30,21325.0


In [13]:
# Remove first week because the data for it is incomplete
df_t = df_t[df_t["Date"] != "2013-12-30" ]

In [14]:
# reassign to original df
df_train = df_t.copy()

## Combine price and training datasets
 - For the week starting on 2014-01-06(Monday), the price data will be the price on 2014-01-05(Sunday)

In [15]:
# Merge datasets
df_merged = df_train.merge(df_prices,how="left")
# Rename column names for clarity
df_merged.rename(columns={"CRP":"Price(t-1)"}, inplace=True)


In [16]:
df_merged.head(5)

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.4
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.4


In [17]:
# Check for nan values
df_merged["Price(t-1)"].isna().any()

True

In [18]:
df_merged["Price(t-1)"].isna().sum()

8617

#### Fill in the Nan values in Price t-1

In [19]:
df_merged.head()

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.4
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.4


In [20]:
temp = df_merged.groupby(['Date', 'Product'])[['Price(t-1)']].mean().reset_index()

In [21]:
temp.rename(columns={'Price(t-1)': 'Price(t-1)_mean'}, inplace=True)

In [22]:
df_temp = pd.merge(df_merged, temp, how='left', on=['Date', 'Product'])

In [23]:
df_temp.loc[df_temp['Price(t-1)'].isna(),'Price(t-1)'] = df_temp['Price(t-1)_mean']

In [24]:
df_temp

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1),Price(t-1)_mean
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07,5.039854
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.40,4.346342
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04,5.039854
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34,4.346342
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.40,4.346342
...,...,...,...,...,...,...,...,...
478949,12663274,Doğu Anadolu Bölgesi,518,Motorin,2018-12-24,17102.0,6.10,6.012897
478950,12663344,Doğu Anadolu Bölgesi,218,Benzin,2018-12-24,11111.0,6.28,6.203699
478951,12663344,Doğu Anadolu Bölgesi,218,Motorin,2018-12-24,22882.0,6.11,6.012897
478952,12664221,Marmara Bölgesi,941,Benzin,2018-12-24,6528.0,6.19,6.203699


In [25]:
# show none values
df_temp[df_temp.isna().any(axis=1)]

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1),Price(t-1)_mean
94677,10048261,Karadeniz Bölgesi,883,Benzin,2015-01-05,4215.0,,
94678,10048261,Karadeniz Bölgesi,883,Motorin,2015-01-05,18791.0,,
94679,10048263,Karadeniz Bölgesi,966,Motorin,2015-01-05,23570.0,,
94680,10048264,Karadeniz Bölgesi,883,Benzin,2015-01-05,3609.0,,
94681,10048264,Karadeniz Bölgesi,883,Motorin,2015-01-05,21576.0,,
...,...,...,...,...,...,...,...,...
112078,12406270,Marmara Bölgesi,941,Motorin,2015-03-09,42012.0,,
112079,12412814,Ege Bölgesi,774,Benzin,2015-03-09,5806.0,,
112080,12412814,Ege Bölgesi,774,Motorin,2015-03-09,20684.0,,
112081,12415789,Marmara Bölgesi,835,Benzin,2015-03-09,10059.0,,


In [26]:
df_temp[df_temp.isna().any(axis=1)].Date.unique()

array(['2015-01-05T00:00:00.000000000', '2015-01-12T00:00:00.000000000',
       '2015-02-09T00:00:00.000000000', '2015-03-09T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [27]:
b1 = df_temp[(df_temp["Date"] == "2014-12-29") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()
b2= df_temp[(df_temp["Date"] == "2015-01-19") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()

b3= df_temp[(df_temp["Date"] == "2015-02-02") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()
b4= df_temp[(df_temp["Date"] == "2015-02-16") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()

b5= df_temp[(df_temp["Date"] == "2015-03-02") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()
b6= df_temp[(df_temp["Date"] == "2015-03-16") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()

print(b1,b2,b3,b4,b5,b6)

d1 = df_temp[(df_temp["Date"] == "2014-12-29") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()
d2= df_temp[(df_temp["Date"] == "2015-01-19") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()

d3 = df_temp[(df_temp["Date"] == "2015-02-02") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()
d4 = df_temp[(df_temp["Date"] == "2015-02-16") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()

d5= df_temp[(df_temp["Date"] == "2015-03-02") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()
d6= df_temp[(df_temp["Date"] == "2015-03-16") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()

print(d1,d2,d3,d4,d5,d6)

4.2814469453377315 4.0857515527950445 4.280414129110803 4.389144215530904 4.191827338129465 4.477158671586621
3.793650000000047 3.553205128205157 3.7930486486485675 3.9143798882681846 3.711831735889189 3.8571668415529596


In [28]:
df_temp.loc[(df_temp["Date"] == "2015-01-05") & (df_temp["Product"] == "Benzin"), "Price(t-1)"] = (b1 + b2) /2
df_temp.loc[(df_temp["Date"] == "2015-01-12") & (df_temp["Product"] == "Benzin"), "Price(t-1)"] = (b1 + b2) /2

df_temp.loc[(df_temp["Date"] == "2015-02-09") & (df_temp["Product"] == "Benzin"), "Price(t-1)"] = (b3 + b4) /2
df_temp.loc[(df_temp["Date"] == "2015-03-09") & (df_temp["Product"] == "Benzin"), "Price(t-1)"] = (b5 + b6) /2

#diesels
df_temp.loc[(df_temp["Date"] == "2015-01-05") & (df_temp["Product"] == "Motorin"), "Price(t-1)"] = (d1 + d2) /2
df_temp.loc[(df_temp["Date"] == "2015-01-12") & (df_temp["Product"] == "Motorin"), "Price(t-1)"] = (d1 + d2) /2

df_temp.loc[(df_temp["Date"] == "2015-02-09") & (df_temp["Product"] == "Motorin"), "Price(t-1)"] = (d3 + d4) /2
df_temp.loc[(df_temp["Date"] == "2015-03-09") & (df_temp["Product"] == "Motorin"), "Price(t-1)"] = (d5 + d6) /2



### Check how successfull the filling of data was

In [29]:
df_temp['Price(t-1)'].isna().any()

False

### Finish Merge

In [30]:
# Reassign to df_train
df_train = df_temp.copy()
df_train.drop(labels="Price(t-1)_mean",axis=1,inplace=True)
df_train.head()

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.4
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.4


In [31]:
# Drop city
df_train.drop(labels=["City"],axis=1,inplace=True)

In [32]:
df_train.head()

Unnamed: 0,Customer,Area,Product,Date,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,Motorin,2014-01-06,29017.0,4.4
2,10048263,Karadeniz Bölgesi,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,Motorin,2014-01-06,25009.0,4.4


In [33]:
df_train.isna().any()

Customer      False
Area          False
Product       False
Date          False
Litres        False
Price(t-1)    False
dtype: bool

In [41]:
## Rename Column for clarity
df_train.rename(columns={"Date":"WeekStarting"},inplace=True)

# Add in external data

### Car data

In [43]:
df_cars = pd.read_csv("CarRegistrationDataReady.csv")

In [44]:
df_cars.columns = ["WeekStarting", "Area","NewCars"]
df_cars["WeekStarting"]  = pd.to_datetime(df_cars['WeekStarting'])
df_cars

Unnamed: 0,WeekStarting,Area,NewCars
0,2014-01-06,Akdeniz Bölgesi,12007
1,2014-01-13,Akdeniz Bölgesi,12007
2,2014-01-20,Akdeniz Bölgesi,12007
3,2014-01-27,Akdeniz Bölgesi,12007
4,2014-02-03,Akdeniz Bölgesi,9999
...,...,...,...
2186,2019-12-02,İç Anadolu Bölgesi,9058
2187,2019-12-09,İç Anadolu Bölgesi,9058
2188,2019-12-16,İç Anadolu Bölgesi,9058
2189,2019-12-23,İç Anadolu Bölgesi,9058


In [45]:
df_train

Unnamed: 0,Customer,Area,Product,WeekStarting,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,Motorin,2014-01-06,29017.0,4.40
2,10048263,Karadeniz Bölgesi,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,Motorin,2014-01-06,25009.0,4.40
...,...,...,...,...,...,...
478949,12663274,Doğu Anadolu Bölgesi,Motorin,2018-12-24,17102.0,6.10
478950,12663344,Doğu Anadolu Bölgesi,Benzin,2018-12-24,11111.0,6.28
478951,12663344,Doğu Anadolu Bölgesi,Motorin,2018-12-24,22882.0,6.11
478952,12664221,Marmara Bölgesi,Benzin,2018-12-24,6528.0,6.19


In [46]:
df_train_merged = df_train.merge(df_cars,how="left")


In [47]:
df_train_merged[df_train_merged["Area"]=="Karadeniz Bölgesi"]

Unnamed: 0,Customer,Area,Product,WeekStarting,Litres,Price(t-1),NewCars
0,10048261,Karadeniz Bölgesi,Benzin,2014-01-06,4242.0,5.07,5015
1,10048261,Karadeniz Bölgesi,Motorin,2014-01-06,29017.0,4.40,5015
2,10048263,Karadeniz Bölgesi,Benzin,2014-01-06,4572.0,5.04,5015
3,10048263,Karadeniz Bölgesi,Motorin,2014-01-06,30681.0,4.34,5015
4,10048264,Karadeniz Bölgesi,Motorin,2014-01-06,25009.0,4.40,5015
...,...,...,...,...,...,...,...
478828,12505693,Karadeniz Bölgesi,Motorin,2018-12-24,51868.0,6.02,2087
478915,12627244,Karadeniz Bölgesi,Benzin,2018-12-24,5043.0,6.19,2087
478916,12627244,Karadeniz Bölgesi,Motorin,2018-12-24,59983.0,5.96,2087
478925,12633255,Karadeniz Bölgesi,Benzin,2018-12-24,2973.0,6.25,2087


In [48]:
df_train = df_train_merged.copy()
df_train[df_train['Price(t-1)'].isna()]

Unnamed: 0,Customer,Area,Product,WeekStarting,Litres,Price(t-1),NewCars


### Holiday Data

In [49]:
df_holidays = pd.read_csv("HolidayDataReady.csv")

In [50]:
df_holidays.columns = ["WeekStarting", "WeekendHolidays","WeekdayHolidays"]
df_holidays["WeekStarting"]  = pd.to_datetime(df_holidays['WeekStarting'])
df_holidays.head()

Unnamed: 0,WeekStarting,WeekendHolidays,WeekdayHolidays
0,2013-12-30,0,1
1,2014-04-21,0,1
2,2014-04-28,0,1
3,2014-05-19,0,1
4,2014-07-21,1,0


In [51]:
df_train_merged = df_train.merge(df_holidays,how="left")

In [52]:
df_train = df_train_merged.copy()

### GDP Growth

In [53]:
df_gdp = pd.read_csv("GDPDataReady.csv")

In [54]:
df_gdp.columns = ["WeekStarting", "GDPGrowth%"]
df_gdp["WeekStarting"]  = pd.to_datetime(df_gdp['WeekStarting'])
df_gdp.head()

Unnamed: 0,WeekStarting,GDPGrowth%
0,2014-01-06,0.044
1,2014-01-13,0.044
2,2014-01-20,0.044
3,2014-01-27,0.044
4,2014-02-03,0.044


In [55]:
df_train_merged = df_train.merge(df_gdp,how="left")

In [56]:
df_train = df_train_merged.copy()

## InflationData

In [57]:
df_inflation = pd.read_csv("InflationDataReady.csv")

In [58]:
df_inflation.columns = ["WeekStarting", "MoMInflation%"]
df_inflation["WeekStarting"]  = pd.to_datetime(df_inflation['WeekStarting'])
df_inflation.head()

Unnamed: 0,WeekStarting,MoMInflation%
0,2014-01-06,0.01
1,2014-01-13,0.46
2,2014-01-20,0.46
3,2014-01-27,0.46
4,2014-02-03,0.46


In [59]:
df_train_merged = df_train.merge(df_inflation,how="left")

In [60]:
df_train = df_train_merged.copy()

### USD TRY

In [61]:
df_usd = pd.read_csv("USDDataReady.csv")

In [62]:
df_usd.columns = ["WeekStarting", "WoWUSDChange%"]
df_usd["WeekStarting"]  = pd.to_datetime(df_usd['WeekStarting'])
df_usd.head()

Unnamed: 0,WeekStarting,WoWUSDChange%
0,2014-01-06,-0.62%
1,2014-01-13,3.08%
2,2014-01-20,4.47%
3,2014-01-27,"-3,04%"
4,2014-02-03,"-1,80%"


In [63]:
df_train_merged = df_train.merge(df_usd,how="left")

In [64]:
df_train = df_train_merged.copy()

In [65]:
df_train

Unnamed: 0,Customer,Area,Product,WeekStarting,Litres,Price(t-1),NewCars,WeekendHolidays,WeekdayHolidays,GDPGrowth%,MoMInflation%,WoWUSDChange%
0,10048261,Karadeniz Bölgesi,Benzin,2014-01-06,4242.0,5.07,5015,,,0.044,0.01,-0.62%
1,10048261,Karadeniz Bölgesi,Motorin,2014-01-06,29017.0,4.40,5015,,,0.044,0.01,-0.62%
2,10048263,Karadeniz Bölgesi,Benzin,2014-01-06,4572.0,5.04,5015,,,0.044,0.01,-0.62%
3,10048263,Karadeniz Bölgesi,Motorin,2014-01-06,30681.0,4.34,5015,,,0.044,0.01,-0.62%
4,10048264,Karadeniz Bölgesi,Motorin,2014-01-06,25009.0,4.40,5015,,,0.044,0.01,-0.62%
...,...,...,...,...,...,...,...,...,...,...,...,...
478949,12663274,Doğu Anadolu Bölgesi,Motorin,2018-12-24,17102.0,6.10,1521,,,0.016,-1.44,"-0,86%"
478950,12663344,Doğu Anadolu Bölgesi,Benzin,2018-12-24,11111.0,6.28,1521,,,0.016,-1.44,"-0,86%"
478951,12663344,Doğu Anadolu Bölgesi,Motorin,2018-12-24,22882.0,6.11,1521,,,0.016,-1.44,"-0,86%"
478952,12664221,Marmara Bölgesi,Benzin,2018-12-24,6528.0,6.19,16306,,,0.016,-1.44,"-0,86%"


## Fill Nan Values

In [68]:
df_train.isna().any()


Customer           False
Area               False
Product            False
WeekStarting       False
Litres             False
Price(t-1)         False
NewCars            False
WeekendHolidays     True
WeekdayHolidays     True
GDPGrowth%         False
MoMInflation%      False
WoWUSDChange%      False
dtype: bool

In [69]:
df_train = df_train.fillna(int(0))

In [71]:
df_train.WeekendHolidays = df_train.WeekendHolidays.astype('int')
df_train.WeekdayHolidays = df_train.WeekdayHolidays.astype('int')

In [73]:
df_train.isna().any()

Customer           False
Area               False
Product            False
WeekStarting       False
Litres             False
Price(t-1)         False
NewCars            False
WeekendHolidays    False
WeekdayHolidays    False
GDPGrowth%         False
MoMInflation%      False
WoWUSDChange%      False
dtype: bool

In [75]:
#Save data
df_train.to_csv("TrainingDataWithExternalData.csv",index=False)

### Add time series data as new features
- Order of previous week and the week before that
- Order for the previous week last year
- Order for the current week last year
