In [726]:
import pandas as pd

### Read in price dataset and ready it for merge

In [727]:
df_prices = pd.read_csv("price.csv")

In [728]:
# Give the price columns short, merge-friendly names and rebuild a clean
# 0..n-1 index. drop=True discards the old index directly instead of first
# materialising it as an "index" column and then dropping that column.
df_prices.columns = ["City", "Date", "Product", "CRP"]
df_prices = df_prices.reset_index(drop=True)

## Ready Price Dataset for merge

In [729]:
# Convert string dates to datetime objects and order the prices chronologically.
# The columns already carry the names City/Date/Product/CRP at this point
# (the 'Date' lookup below depends on it), so they are not renamed again;
# only the index is rebuilt after the sort.
df_prices["Date"] = pd.to_datetime(df_prices["Date"])
df_prices = df_prices.sort_values(by="Date").reset_index(drop=True)



In [730]:
# Shift every price forward by one day: the price recorded on day d is then
# keyed by d + 1, so merging on Date attaches the previous day's price.
df_prices["Date"] = df_prices["Date"] + pd.Timedelta(days=1)

## Train dataset

In [731]:
df_train = pd.read_csv("train.csv")

In [732]:
df_train.head()

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0


- Sum up all orders for a week
    I want to have a single entry summing up the orders placed for each week
    Will sum up orders for the same fuel across different plants

In [733]:
# Short column names, assigned positionally to the seven columns of the file.
df_train.columns = ["Customer", "Area", "City", "Date", "Product", "Plant", "Litres"]

# Parse the billing dates and move each one back by 7 days, in preparation
# for the weekly aggregation that follows.
billing_dates = pd.to_datetime(df_train["Date"])
df_train["Date"] = billing_dates - pd.Timedelta(days=7)

# Order rows by (shifted) date, then customer, then product.
df_train = df_train.sort_values(by=["Date", "Customer", "Product"])

In [734]:
df_train.head()

Unnamed: 0,Customer,Area,City,Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2013-12-25,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2013-12-25,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2013-12-25,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2013-12-25,Motorin,T155,0.0
2283,10048264,Karadeniz Bölgesi,883,2013-12-25,Motorin,T105,0.0


- Below I sum up orders across week, and drop the plant code

In [735]:
# Weekly litres per customer/area/city/product; Plant is not a grouping key,
# so orders for the same fuel from different plants are added together.
# NOTE(review): W-MON bins are right-closed and right-labelled by default, so
# on the 7-day-shifted dates the bin labelled Monday M holds billing dates
# from the Tuesday after M through the following Monday. Confirm this matches
# the week definition used by the test set before relying on the label as a
# "week starting" date.
weekly = pd.Grouper(key="Date", freq="W-MON")
group_keys = ["Customer", "Area", "City", "Product", weekly]
df_t = df_train.groupby(group_keys)["Litres"].sum().reset_index()
df_t = df_t.sort_values("Date")

In [736]:
#sort values to inspect
df_t.sort_values(by=["Date","Customer", "Product"], inplace=True)

In [737]:
df_t.head(5)

Unnamed: 0,Customer,Area,City,Product,Date,Litres
0,10048261,Karadeniz Bölgesi,883,Benzin,2013-12-30,5040.0
256,10048261,Karadeniz Bölgesi,883,Motorin,2013-12-30,28712.0
685,10048263,Karadeniz Bölgesi,966,Motorin,2013-12-30,34082.0
899,10048264,Karadeniz Bölgesi,883,Benzin,2013-12-30,3608.0
1069,10048264,Karadeniz Bölgesi,883,Motorin,2013-12-30,21325.0


In [738]:
# Remove first week because the data for it is incomplete
df_t = df_t[df_t["Date"] != "2013-12-30" ]

In [739]:
# reassign to original df
df_train = df_t.copy()

### Get Test Dataset

In [740]:
# Load the test rows and discard only the surrogate "index" column.
# The previous version also built a parsed "Date" column and dropped it again
# in the same statement, which had no effect on the result; 'Week Start Date'
# stays as read here (it is renamed and parsed in later cells).
df_test = pd.read_csv("test.csv")
df_test = df_test.drop(columns="index")

In [741]:
df_test.columns = ["Customer", "Area", "City","Product","Date"]

In [742]:
df_test.head()

Unnamed: 0,Customer,Area,City,Product,Date
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04


In [743]:
df = pd.concat([df_train,df_test])
df["Date"] = pd.to_datetime(df['Date']) 

## Combine price and training datasets
 - For the week starting on 2014-01-06(Monday), the price data will be the price on 2014-01-05(Sunday)

In [744]:
# Attach a price to every train/test row. The join keys are spelled out
# (they are the columns the two frames share) so that a column added to
# either frame later cannot silently become part of the join.
# df_prices dates were shifted forward by one day earlier, hence the
# "Price(t-1)" name given to the CRP column.
df_merged = (
    df.merge(df_prices, how="left", on=["City", "Date", "Product"])
      .rename(columns={"CRP": "Price(t-1)"})
)


In [745]:
df_merged

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.40
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.40
...,...,...,...,...,...,...,...
587421,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,,7.14
587422,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,,7.14
587423,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,,7.14
587424,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,,7.14


In [746]:
# Check for nan values
df_merged["Price(t-1)"].isna().any()

True

In [747]:
df_merged["Price(t-1)"].isna().sum()

8721

#### Fill in the Nan values in Price t-1

In [748]:
df_merged.head()

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.4
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.4


In [749]:
temp = df_merged.groupby(['Date', 'Product'])[['Price(t-1)']].mean().reset_index()

In [750]:
temp.rename(columns={'Price(t-1)': 'Price(t-1)_mean'}, inplace=True)

In [751]:
df_temp = pd.merge(df_merged, temp, how='left', on=['Date', 'Product'])

In [752]:
# Rows without a matched price fall back to the mean price of the same
# week and product (the "Price(t-1)_mean" column merged in above).
df_temp["Price(t-1)"] = df_temp["Price(t-1)"].fillna(df_temp["Price(t-1)_mean"])

In [753]:
df_temp

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1),Price(t-1)_mean
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07,5.039854
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.40,4.346342
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04,5.039854
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34,4.346342
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.40,4.346342
...,...,...,...,...,...,...,...,...
587421,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,,7.14,6.963877
587422,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,,7.14,6.963877
587423,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,,7.14,6.963877
587424,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,,7.14,6.965134


In [754]:
# show none values
df_temp[df_temp.isna().any(axis=1)]

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1),Price(t-1)_mean
94677,10048261,Karadeniz Bölgesi,883,Benzin,2015-01-05,4215.0,,
94678,10048261,Karadeniz Bölgesi,883,Motorin,2015-01-05,18791.0,,
94679,10048263,Karadeniz Bölgesi,966,Motorin,2015-01-05,23570.0,,
94680,10048264,Karadeniz Bölgesi,883,Benzin,2015-01-05,3609.0,,
94681,10048264,Karadeniz Bölgesi,883,Motorin,2015-01-05,21576.0,,
...,...,...,...,...,...,...,...,...
587421,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,,7.14,6.963877
587422,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,,7.14,6.963877
587423,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,,7.14,6.963877
587424,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,,7.14,6.965134


In [755]:
df_temp[df_temp['Price(t-1)'].isna()].Date.unique()

array(['2015-01-05T00:00:00.000000000', '2015-01-12T00:00:00.000000000',
       '2015-02-09T00:00:00.000000000', '2015-03-09T00:00:00.000000000'],
      dtype='datetime64[ns]')

Manually impute prices from the preceding and next week

In [756]:
def _mean_price(week, product):
    """Return the mean of "Price(t-1)_mean" over df_temp rows of one week and product."""
    rows = (df_temp["Date"] == week) & (df_temp["Product"] == product)
    return df_temp.loc[rows, "Price(t-1)_mean"].mean()


# Benzin: mean prices of the weeks on either side of each gap.
b1 = _mean_price("2014-12-29", "Benzin")
b2 = _mean_price("2015-01-19", "Benzin")

b3 = _mean_price("2015-02-02", "Benzin")
b4 = _mean_price("2015-02-16", "Benzin")

b5 = _mean_price("2015-03-02", "Benzin")
b6 = _mean_price("2015-03-16", "Benzin")

print(b1, b2, b3, b4, b5, b6)

# Motorin: same weeks as above.
d1 = _mean_price("2014-12-29", "Motorin")
d2 = _mean_price("2015-01-19", "Motorin")

d3 = _mean_price("2015-02-02", "Motorin")
d4 = _mean_price("2015-02-16", "Motorin")

d5 = _mean_price("2015-03-02", "Motorin")
d6 = _mean_price("2015-03-16", "Motorin")

print(d1, d2, d3, d4, d5, d6)

4.2814469453377315 4.0857515527950445 4.280414129110803 4.389144215530904 4.191827338129465 4.477158671586621
3.793650000000047 3.553205128205157 3.7930486486485675 3.9143798882681846 3.711831735889189 3.8571668415529596


In [757]:
# Price written to each gap week: the midpoint of the mean prices of the
# two neighbouring weeks computed above. 2015-01-05 and 2015-01-12 are
# adjacent gaps and share the same pair of neighbours.
gap_prices = {
    "Benzin": {
        "2015-01-05": (b1 + b2) / 2,
        "2015-01-12": (b1 + b2) / 2,
        "2015-02-09": (b3 + b4) / 2,
        "2015-03-09": (b5 + b6) / 2,
    },
    # diesels
    "Motorin": {
        "2015-01-05": (d1 + d2) / 2,
        "2015-01-12": (d1 + d2) / 2,
        "2015-02-09": (d3 + d4) / 2,
        "2015-03-09": (d5 + d6) / 2,
    },
}

for product, week_prices in gap_prices.items():
    for week, price in week_prices.items():
        rows = (df_temp["Date"] == week) & (df_temp["Product"] == product)
        df_temp.loc[rows, "Price(t-1)"] = price



### Check how successful the filling of data was

In [758]:
df_temp['Price(t-1)'].isna().any()

False

### Finish Merge

In [759]:
# Carry on with the imputed frame as `df`, without the helper mean column.
# drop() returns a new frame, so df_temp itself is left untouched.
df = df_temp.drop(columns="Price(t-1)_mean")
df.head()

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.4
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.4


In [760]:
df.head()

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1)
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.4
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.4


In [761]:
df.isna().any()
# we expect litres to be nan for 2019 data

Customer      False
Area          False
City          False
Product       False
Date          False
Litres         True
Price(t-1)    False
dtype: bool

In [762]:
## Rename Column for clarity
df.rename(columns={"Date":"WeekStarting"},inplace=True)

# Add in external data

### Car data

In [763]:
df_cars = pd.read_csv("CarRegistrationDataReady.csv")

In [764]:
df_cars.columns = ["WeekStarting", "Area","NewCars"]
df_cars["WeekStarting"]  = pd.to_datetime(df_cars['WeekStarting'])
df_cars

Unnamed: 0,WeekStarting,Area,NewCars
0,2014-01-06,Akdeniz Bölgesi,12007
1,2014-01-13,Akdeniz Bölgesi,12007
2,2014-01-20,Akdeniz Bölgesi,12007
3,2014-01-27,Akdeniz Bölgesi,12007
4,2014-02-03,Akdeniz Bölgesi,9999
...,...,...,...
2186,2019-12-02,İç Anadolu Bölgesi,9058
2187,2019-12-09,İç Anadolu Bölgesi,9058
2188,2019-12-16,İç Anadolu Bölgesi,9058
2189,2019-12-23,İç Anadolu Bölgesi,9058


In [765]:
df_merged = df.merge(df_cars,how="left")

In [766]:
# Check one area
df_merged[df_merged["Area"]=="Karadeniz Bölgesi"]

Unnamed: 0,Customer,Area,City,Product,WeekStarting,Litres,Price(t-1),NewCars
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07,5015
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.40,5015
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04,5015
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34,5015
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.40,5015
...,...,...,...,...,...,...,...,...
587109,12690825,Karadeniz Bölgesi,966,Benzin,2019-12-02,,7.00,3598
587110,12690825,Karadeniz Bölgesi,966,Benzin,2019-12-09,,7.00,3598
587111,12690825,Karadeniz Bölgesi,966,Benzin,2019-12-16,,7.00,3598
587112,12690825,Karadeniz Bölgesi,966,Benzin,2019-12-23,,7.00,3598


In [767]:
df = df_merged.copy()

### Holiday Data

In [768]:
df_holidays = pd.read_csv("HolidayDataReady.csv")

In [769]:
df_holidays.columns = ["WeekStarting", "WeekendHolidays","WeekdayHolidays"]
df_holidays["WeekStarting"]  = pd.to_datetime(df_holidays['WeekStarting'])
df_holidays.head()

Unnamed: 0,WeekStarting,WeekendHolidays,WeekdayHolidays
0,2013-12-30,0,1
1,2014-04-21,0,1
2,2014-04-28,0,1
3,2014-05-19,0,1
4,2014-07-21,1,0


In [770]:
df_merged = df.merge(df_holidays,how="left")

In [771]:
df = df_merged.copy()

### GDP Growth

In [772]:
df_gdp = pd.read_csv("GDPDataReady.csv")

In [773]:
df_gdp.columns = ["WeekStarting", "GDPGrowth%"]
df_gdp["WeekStarting"]  = pd.to_datetime(df_gdp['WeekStarting'])
df_gdp.head()

Unnamed: 0,WeekStarting,GDPGrowth%
0,2014-01-06,0.044
1,2014-01-13,0.044
2,2014-01-20,0.044
3,2014-01-27,0.044
4,2014-02-03,0.044


In [774]:
df_merged = df.merge(df_gdp,how="left")

In [775]:
df = df_merged.copy()

## InflationData

In [776]:
df_inflation = pd.read_csv("InflationDataReady.csv")

In [777]:
df_inflation.columns = ["WeekStarting", "MoMInflation%"]
df_inflation["WeekStarting"]  = pd.to_datetime(df_inflation['WeekStarting'])
df_inflation.head()

Unnamed: 0,WeekStarting,MoMInflation%
0,2014-01-06,0.01
1,2014-01-13,0.46
2,2014-01-20,0.46
3,2014-01-27,0.46
4,2014-02-03,0.46


In [778]:
df_merged = df.merge(df_inflation,how="left")

In [779]:
df = df_merged.copy()

### USD TRY

In [780]:
df_usd = pd.read_csv("USDDataReady.csv")

In [781]:
df_usd.columns = ["WeekStarting", "WoWUSDChange%"]
df_usd["WeekStarting"]  = pd.to_datetime(df_usd['WeekStarting'])
df_usd.head()

Unnamed: 0,WeekStarting,WoWUSDChange%
0,2014-01-06,-0.62%
1,2014-01-13,3.08%
2,2014-01-20,4.47%
3,2014-01-27,"-3,04%"
4,2014-02-03,"-1,80%"


In [782]:
df_merged = df.merge(df_usd,how="left")

In [783]:
df = df_merged.copy()

In [784]:
df

Unnamed: 0,Customer,Area,City,Product,WeekStarting,Litres,Price(t-1),NewCars,WeekendHolidays,WeekdayHolidays,GDPGrowth%,MoMInflation%,WoWUSDChange%
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07,5015,,,0.044,0.01,-0.62%
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.40,5015,,,0.044,0.01,-0.62%
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04,5015,,,0.044,0.01,-0.62%
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34,5015,,,0.044,0.01,-0.62%
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.40,5015,,,0.044,0.01,-0.62%
...,...,...,...,...,...,...,...,...,...,...,...,...,...
587421,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,,7.14,2004,,,-0.015,2.00,"0,52%"
587422,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,,7.14,2004,,,0.009,2.00,"0,55%"
587423,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,,7.14,2004,,,0.009,0.38,"2,21%"
587424,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,,7.14,2004,,,0.009,0.38,"0,23%"


## Fill Nan Values

In [785]:
df.isna().any()

Customer           False
Area               False
City               False
Product            False
WeekStarting       False
Litres              True
Price(t-1)         False
NewCars            False
WeekendHolidays     True
WeekdayHolidays     True
GDPGrowth%         False
MoMInflation%      False
WoWUSDChange%      False
dtype: bool

In [786]:
# Weeks with no row in the holiday table came out of the left merge as NaN;
# treat them as zero holidays.
holiday_columns = ["WeekendHolidays", "WeekdayHolidays"]
df[holiday_columns] = df[holiday_columns].fillna(0)

In [787]:
# The USD change arrives as text such as "3.08%" or "-3,04%": normalise the
# decimal comma to a dot, strip the trailing percent sign, then cast to float.
usd_text = df["WoWUSDChange%"].str.replace(",", ".", regex=False)
df["WoWUSDChange%"] = usd_text.str.rstrip("%").astype("float")
df["WoWUSDChange%"]

0        -0.62
1        -0.62
2        -0.62
3        -0.62
4        -0.62
          ... 
587421    0.52
587422    0.55
587423    2.21
587424    0.23
587425    0.37
Name: WoWUSDChange%, Length: 587426, dtype: float64

Add Season data

In [788]:
# Season of each week, derived from the month of WeekStarting.
# The previous labels were shifted (March-May was called "Summer",
# June-August "Autumn", September-November "Fall"); they now follow the
# usual northern-hemisphere convention. The number of distinct categories
# is unchanged, only their names are corrected.
_SEASON_BY_MONTH = {
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Autumn", 10: "Autumn", 11: "Autumn",
}
# Vectorised month lookup; anything without a valid month keeps the old
# "Unknown" fallback.
df = df.assign(
    Season=df["WeekStarting"].dt.month.map(_SEASON_BY_MONTH).fillna("Unknown")
)



In [789]:
# Store the discrete columns with pandas' category dtype. No one-hot
# encoding is done here; that is left to the modelling step.
category_columns = ["Customer", "Area", "Product", "Season"]
df = df.astype({column: "category" for column in category_columns})

Check column types

In [790]:
# Check dtypes correctness here
df.dtypes

Customer                 category
Area                     category
City                        int64
Product                  category
WeekStarting       datetime64[ns]
Litres                    float64
Price(t-1)                float64
NewCars                     int64
WeekendHolidays           float64
WeekdayHolidays           float64
GDPGrowth%                float64
MoMInflation%             float64
WoWUSDChange%             float64
Season                   category
dtype: object

Check NaN

In [791]:
df.isna().any()

Customer           False
Area               False
City               False
Product            False
WeekStarting       False
Litres              True
Price(t-1)         False
NewCars            False
WeekendHolidays    False
WeekdayHolidays    False
GDPGrowth%         False
MoMInflation%      False
WoWUSDChange%      False
Season             False
dtype: bool

### Add time series data as new features
- Order of previous week (week t-1)
- Order of week t-2
- Order for the previous week last year ( week t-1, y-1)
- Order for the current week last year 

Other todo:
- Drop first year data before training
- Drop Customer and Area (not sure about this — shouldn't we have these encoded as categories as well?)
- Encode petrol/diesel, categorize ettim (Done)
- Add seasonality (Done)
- Change usd% to float (Done)



In [792]:
# Function to add week lag

def add_time_lag(week_lag,df):
    """Add the litres ordered `week_lag` weeks earlier as a new column.

    For every row, looks up the "Litres" value of the same Customer and
    Product whose WeekStarting is `week_lag` weeks before the row's own
    WeekStarting and stores it in a column named "Litres(w-<week_lag>)".
    Rows with no such earlier record get NaN.

    Parameters
    ----------
    week_lag : int
        Number of weeks to look back.
    df : pandas.DataFrame
        Must contain "Customer", "Product", "WeekStarting" (datetime) and
        "Litres".

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag column appended; `df` is not modified.
    """
    keys = ["Customer", "Product", "WeekStarting"]
    lag_col = f"Litres(w-{week_lag})"

    # Moving the source rows forward by the lag makes a row dated w line up
    # with the target row dated w + lag when joining on WeekStarting.
    df_lag = (
        df[keys + ["Litres"]]
        .rename(columns={"Litres": lag_col})
        .assign(WeekStarting=lambda lagged: lagged["WeekStarting"] + pd.Timedelta(weeks=week_lag))
    )

    # Join on explicit keys rather than on "whatever columns are shared", and
    # drop a lag column left over from a previous run of the cell so that
    # re-running recomputes it instead of using it as a join key.
    base = df.drop(columns=lag_col, errors="ignore")
    return base.merge(df_lag, how="left", on=keys)


In [793]:
# Per-customer lag features for 52, 53 and 54 weeks back.
for lag_weeks in (52, 53, 54):
    df = add_time_lag(lag_weeks, df)

Check which customers are unique to train/test

In [794]:
# Distinct customer ids seen in each split.
a = df_test.Customer.unique()
b = df_train.Customer.unique()

test_ids, train_ids = set(a), set(b)

# Ids that appear in exactly one of the two splits.
only_test = [customer for customer in test_ids if customer not in train_ids]
only_train = [customer for customer in train_ids if customer not in test_ids]
print(len(only_test), " customers in test set only")
print(len(only_train), " customers in train set only")

36  customers in test set only
229  customers in train set only


In [795]:
df[(df["WeekStarting"] > '2015-01-12') & (df["WeekStarting"] < '2019-01-01') ].isna().sum()

Customer               0
Area                   0
City                   0
Product                0
WeekStarting           0
Litres                 0
Price(t-1)             0
NewCars                0
WeekendHolidays        0
WeekdayHolidays        0
GDPGrowth%             0
MoMInflation%          0
WoWUSDChange%          0
Season                 0
Litres(w-52)       40095
Litres(w-53)       41452
Litres(w-54)       42458
dtype: int64

For the customers that are in the test set only, we need to set the w-52,53,54 values to the week average of that city

In [796]:
# Function to add, per city and product, the average litres ordered `week_lag` weeks earlier

def add_time_lag_average_city(week_lag,df):
    """Add the city-level mean of "Litres" from `week_lag` weeks earlier.

    For every row, takes the mean of "Litres" over all rows with the same
    City and Product whose WeekStarting is `week_lag` weeks before the row's
    own WeekStarting, and stores it in "Litres(w-<week_lag>_average_city)".
    Rows with no matching earlier data get NaN.

    Parameters
    ----------
    week_lag : int
        Number of weeks to look back.
    df : pandas.DataFrame
        Must contain "City", "Product", "WeekStarting" (datetime) and "Litres".

    Returns
    -------
    pandas.DataFrame
        A new frame with the average column appended; `df` is not modified.
    """
    keys = ["WeekStarting", "Product", "City"]
    avg_col = f"Litres(w-{week_lag}_average_city)"

    # Shift the source weeks forward so week w lines up with week w + lag.
    df_lag = (
        df[keys + ["Litres"]]
        .rename(columns={"Litres": avg_col})
        .assign(WeekStarting=lambda lagged: lagged["WeekStarting"] + pd.Timedelta(weeks=week_lag))
    )

    # observed=True keeps only key combinations that occur in the data. With
    # categorical keys the default would also emit every unobserved
    # combination as an all-NaN row, which the left merge below would ignore.
    df_lag = df_lag.groupby(keys, observed=True)[[avg_col]].mean().reset_index()

    # Explicit join keys; drop a stale column from a previous run so that
    # re-running recomputes it rather than using it as a join key.
    base = df.drop(columns=avg_col, errors="ignore")
    return base.merge(df_lag, how="left", on=keys)

In [797]:
# City-level average lag features for 52, 53 and 54 weeks back.
for lag_weeks in (52, 53, 54):
    df = add_time_lag_average_city(lag_weeks, df)

In [798]:
df[(df["WeekStarting"] > '2015-01-12') & (df["WeekStarting"] < '2019-01-01') ]

Unnamed: 0,Customer,Area,City,Product,WeekStarting,Litres,Price(t-1),NewCars,WeekendHolidays,WeekdayHolidays,GDPGrowth%,MoMInflation%,WoWUSDChange%,Season,Litres(w-52),Litres(w-53),Litres(w-54),Litres(w-52_average_city),Litres(w-53_average_city),Litres(w-54_average_city)
98068,10048261,Karadeniz Bölgesi,883,Benzin,2015-01-19,9929.0,4.11,4995,0.0,0.0,0.017,-0.44,1.09,Winter,14733.0,13886.0,4242.0,8133.666667,7694.666667,4242.000000
98069,10048261,Karadeniz Bölgesi,883,Motorin,2015-01-19,29208.0,3.58,4995,0.0,0.0,0.017,-0.44,1.09,Winter,41549.0,55606.0,29017.0,42240.500000,41317.000000,28331.750000
98070,10048263,Karadeniz Bölgesi,966,Benzin,2015-01-19,10848.0,4.08,4995,0.0,0.0,0.017,-0.44,1.09,Winter,,,4572.0,5201.636364,5403.090909,6440.777778
98071,10048263,Karadeniz Bölgesi,966,Motorin,2015-01-19,9442.0,3.55,4995,0.0,0.0,0.017,-0.44,1.09,Winter,,35583.0,30681.0,37268.538462,32558.266667,30124.076923
98072,10048264,Karadeniz Bölgesi,883,Benzin,2015-01-19,3570.0,4.11,4995,0.0,0.0,0.017,-0.44,1.09,Winter,4656.0,,,8133.666667,7694.666667,4242.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478949,12663274,Doğu Anadolu Bölgesi,518,Motorin,2018-12-24,17102.0,6.10,1521,0.0,0.0,0.016,-1.44,-0.86,Winter,,,,,,
478950,12663344,Doğu Anadolu Bölgesi,218,Benzin,2018-12-24,11111.0,6.28,1521,0.0,0.0,0.016,-1.44,-0.86,Winter,,,,6551.200000,7402.333333,9803.400000
478951,12663344,Doğu Anadolu Bölgesi,218,Motorin,2018-12-24,22882.0,6.11,1521,0.0,0.0,0.016,-1.44,-0.86,Winter,,,,39078.000000,43062.500000,56666.500000
478952,12664221,Marmara Bölgesi,941,Benzin,2018-12-24,6528.0,6.19,16306,0.0,0.0,0.016,-1.44,-0.86,Winter,,,,19794.473684,17088.684211,16836.473684


In [799]:
# Function to add, per area and product, the average litres ordered `week_lag` weeks earlier

def add_time_lag_average_area(week_lag,df):
    """Add the area-level mean of "Litres" from `week_lag` weeks earlier.

    For every row, takes the mean of "Litres" over all rows with the same
    Area and Product whose WeekStarting is `week_lag` weeks before the row's
    own WeekStarting, and stores it in "Litres(w-<week_lag>_average_area)".
    Rows with no matching earlier data get NaN.

    Parameters
    ----------
    week_lag : int
        Number of weeks to look back.
    df : pandas.DataFrame
        Must contain "Area", "Product", "WeekStarting" (datetime) and "Litres".

    Returns
    -------
    pandas.DataFrame
        A new frame with the average column appended; `df` is not modified.
    """
    keys = ["WeekStarting", "Product", "Area"]
    avg_col = f"Litres(w-{week_lag}_average_area)"

    # Shift the source weeks forward so week w lines up with week w + lag.
    df_lag = (
        df[keys + ["Litres"]]
        .rename(columns={"Litres": avg_col})
        .assign(WeekStarting=lambda lagged: lagged["WeekStarting"] + pd.Timedelta(weeks=week_lag))
    )

    # observed=True keeps only key combinations that occur in the data. With
    # categorical keys the default would also emit every unobserved
    # combination as an all-NaN row, which the left merge below would ignore.
    df_lag = df_lag.groupby(keys, observed=True)[[avg_col]].mean().reset_index()

    # Explicit join keys; drop a stale column from a previous run so that
    # re-running recomputes it rather than using it as a join key.
    base = df.drop(columns=avg_col, errors="ignore")
    return base.merge(df_lag, how="left", on=keys)

In [800]:
# Area-level average lag features for 52, 53 and 54 weeks back.
for lag_weeks in (52, 53, 54):
    df = add_time_lag_average_area(lag_weeks, df)

Remove year 2014 and first two weeks of 2015

In [801]:
# Keep only weeks after 2015-01-12. The explicit copy makes `df` an
# independent frame, so the column assignments in the following cells do not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[df.WeekStarting > "2015-01-12"].copy()

In [802]:
df

Unnamed: 0,Customer,Area,City,Product,WeekStarting,Litres,Price(t-1),NewCars,WeekendHolidays,WeekdayHolidays,...,Season,Litres(w-52),Litres(w-53),Litres(w-54),Litres(w-52_average_city),Litres(w-53_average_city),Litres(w-54_average_city),Litres(w-52_average_area),Litres(w-53_average_area),Litres(w-54_average_area)
98068,10048261,Karadeniz Bölgesi,883,Benzin,2015-01-19,9929.0,4.11,4995,0.0,0.0,...,Winter,14733.0,13886.0,4242.0,8133.666667,7694.666667,4242.000000,7808.352113,7245.707692,7204.403509
98069,10048261,Karadeniz Bölgesi,883,Motorin,2015-01-19,29208.0,3.58,4995,0.0,0.0,...,Winter,41549.0,55606.0,29017.0,42240.500000,41317.000000,28331.750000,42001.600000,39898.258824,33080.654762
98070,10048263,Karadeniz Bölgesi,966,Benzin,2015-01-19,10848.0,4.08,4995,0.0,0.0,...,Winter,,,4572.0,5201.636364,5403.090909,6440.777778,7808.352113,7245.707692,7204.403509
98071,10048263,Karadeniz Bölgesi,966,Motorin,2015-01-19,9442.0,3.55,4995,0.0,0.0,...,Winter,,35583.0,30681.0,37268.538462,32558.266667,30124.076923,42001.600000,39898.258824,33080.654762
98072,10048264,Karadeniz Bölgesi,883,Benzin,2015-01-19,3570.0,4.11,4995,0.0,0.0,...,Winter,4656.0,,,8133.666667,7694.666667,4242.000000,7808.352113,7245.707692,7204.403509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587421,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,,7.14,2004,0.0,0.0,...,Winter,,,,8800.750000,7729.500000,8763.000000,8121.625000,7690.533333,8455.258065
587422,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,,7.14,2004,0.0,0.0,...,Winter,,,,8200.000000,8800.750000,7729.500000,9793.133333,8121.625000,7690.533333
587423,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,,7.14,2004,0.0,0.0,...,Winter,,,,11166.333333,8200.000000,8800.750000,7648.218750,9793.133333,8121.625000
587424,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,,7.14,2004,0.0,0.0,...,Winter,,,,6900.500000,11166.333333,8200.000000,8894.823529,7648.218750,9793.133333


For the customers that are only in the training set, set the w-52, w-53,w-54 data to the average

In [803]:
df.isna().sum()

Customer                          0
Area                              0
City                              0
Product                           0
WeekStarting                      0
Litres                       108472
Price(t-1)                        0
NewCars                           0
WeekendHolidays                   0
WeekdayHolidays                   0
GDPGrowth%                        0
MoMInflation%                     0
WoWUSDChange%                     0
Season                            0
Litres(w-52)                  54564
Litres(w-53)                  54117
Litres(w-54)                  55189
Litres(w-52_average_city)      2596
Litres(w-53_average_city)       528
Litres(w-54_average_city)       545
Litres(w-52_average_area)      2086
Litres(w-53_average_area)         0
Litres(w-54_average_area)         0
dtype: int64

In [804]:
df[(df["WeekStarting"] > '2015-01-12') & (df["WeekStarting"] < '2019-01-01') ].isna().sum()

Customer                         0
Area                             0
City                             0
Product                          0
WeekStarting                     0
Litres                           0
Price(t-1)                       0
NewCars                          0
WeekendHolidays                  0
WeekdayHolidays                  0
GDPGrowth%                       0
MoMInflation%                    0
WoWUSDChange%                    0
Season                           0
Litres(w-52)                 40095
Litres(w-53)                 41452
Litres(w-54)                 42458
Litres(w-52_average_city)      363
Litres(w-53_average_city)      377
Litres(w-54_average_city)      391
Litres(w-52_average_area)        0
Litres(w-53_average_area)        0
Litres(w-54_average_area)        0
dtype: int64

Fill in the w-52 values in year 2019 as w-53 values

Fill in Litres(w-52_average_area) for all customers as   Litres(w-53_average_area) 

In [805]:
# Missing 52-week-lag area averages take the 53-week-lag area average.
df["Litres(w-52_average_area)"] = df["Litres(w-52_average_area)"].fillna(df["Litres(w-53_average_area)"])

Fill in average cities for all customers

In [806]:
# Missing 54-week-lag city averages take the 54-week-lag area average.
df["Litres(w-54_average_city)"] = df["Litres(w-54_average_city)"].fillna(df["Litres(w-54_average_area)"])

In [807]:
# Missing 53-week-lag city averages take the (already filled) 54-week-lag city average.
df["Litres(w-53_average_city)"] = df["Litres(w-53_average_city)"].fillna(df["Litres(w-54_average_city)"])

In [808]:
# Missing 52-week-lag city averages take the 53-week-lag city average.
df["Litres(w-52_average_city)"] = df["Litres(w-52_average_city)"].fillna(df["Litres(w-53_average_city)"])

Fill in average cities as average areas if they are still missing

In [809]:
# Any 53-week-lag city average still missing takes the 53-week-lag area average.
df["Litres(w-53_average_city)"] = df["Litres(w-53_average_city)"].fillna(df["Litres(w-53_average_area)"])

In [810]:
# Any 52-week-lag city average still missing takes the 52-week-lag area average.
df["Litres(w-52_average_city)"] = df["Litres(w-52_average_city)"].fillna(df["Litres(w-52_average_area)"])

Fill in w-52,53,54 as city averages

In [811]:
# Customers without their own 52-week-lag value take the 52-week-lag city average.
df["Litres(w-52)"] = df["Litres(w-52)"].fillna(df["Litres(w-52_average_city)"])

In [812]:
# Customers without their own 53-week-lag value take the 53-week-lag city average.
df["Litres(w-53)"] = df["Litres(w-53)"].fillna(df["Litres(w-53_average_city)"])

In [813]:
# Customers without their own 54-week-lag value take the 54-week-lag city average.
df["Litres(w-54)"] = df["Litres(w-54)"].fillna(df["Litres(w-54_average_city)"])

Add week of year value

In [815]:
df["WeekNumber"] = df['WeekStarting'].dt.isocalendar().week.astype("category")

In [816]:
df.dtypes

Customer                           category
Area                               category
City                                  int64
Product                            category
WeekStarting                 datetime64[ns]
Litres                              float64
Price(t-1)                          float64
NewCars                               int64
WeekendHolidays                     float64
WeekdayHolidays                     float64
GDPGrowth%                          float64
MoMInflation%                       float64
WoWUSDChange%                       float64
Season                             category
Litres(w-52)                        float64
Litres(w-53)                        float64
Litres(w-54)                        float64
Litres(w-52_average_city)           float64
Litres(w-53_average_city)           float64
Litres(w-54_average_city)           float64
Litres(w-52_average_area)           float64
Litres(w-53_average_area)           float64
Litres(w-54_average_area)       

In [817]:
df.drop(labels="City",axis=1,inplace=True)

In [818]:
df.columns

Index(['Customer', 'Area', 'Product', 'WeekStarting', 'Litres', 'Price(t-1)',
       'NewCars', 'WeekendHolidays', 'WeekdayHolidays', 'GDPGrowth%',
       'MoMInflation%', 'WoWUSDChange%', 'Season', 'Litres(w-52)',
       'Litres(w-53)', 'Litres(w-54)', 'Litres(w-52_average_city)',
       'Litres(w-53_average_city)', 'Litres(w-54_average_city)',
       'Litres(w-52_average_area)', 'Litres(w-53_average_area)',
       'Litres(w-54_average_area)', 'WeekNumber'],
      dtype='object')

In [142]:
#df_train.to_csv("TrainingDataWithExternalData.csv")
df.to_pickle("Training_TestData.pkl")