Imports

In [1]:
import pandas as pd
import numpy as np

Fill missing weeks or not choose
- If True, it will assume customers which do not have any order data for a given week have ordered nothing in that week

In [2]:
fill_missing_weeks = True

### Read in price dataset and ready it for merge

In [3]:
df_prices = pd.read_csv("OriginalData\price.csv")
df_prices.head(1)

Unnamed: 0,City Code,Validfrom,Product,CRP
0,116,2016-07-27,Benzin,4.46


In [4]:
df_prices.columns=["City","Date","Product","CRP"]
df_prices.reset_index(inplace=True)
df_prices.drop(labels="index",axis=1,inplace=True)

## Ready Price Dataset for merge

Convert string dates to datetime objects

In [5]:
df_prices['Date']= pd.to_datetime(df_prices['Date'])
df_prices.sort_values(by=["Date","City"], inplace=True)
df_prices.columns=["City","Date","Product","CRP"]
df_prices.reset_index(inplace=True,drop=True)


Add one day so we can merge 

In [6]:
df_prices["Date"] = df_prices["Date"] + pd.to_timedelta(1, unit='d')
df_prices.head()

Unnamed: 0,City,Date,Product,CRP
0,125,2014-01-02,Benzin,4.95
1,125,2014-01-02,Motorin,4.57
2,126,2014-01-02,Benzin,4.95
3,126,2014-01-02,Motorin,4.56
4,172,2014-01-02,Benzin,5.03


Add Percentage Price Change WoW

In [7]:
df_prices["Price%Change"] = 0
cities = df_prices["City"].unique()
for city in cities:
    df_prices.loc[(df_prices["City"]==city)& (df_prices.Product == "Benzin"),"Price%Change" ] = df_prices[(df_prices.City == city)& (df_prices.Product == "Benzin")]["CRP"].pct_change()
    df_prices.loc[(df_prices["City"]==city)& (df_prices.Product == "Motorin"),"Price%Change"] = df_prices[(df_prices.City == city)& (df_prices.Product == "Motorin")]["CRP"].pct_change()

## Prep Train dataset

In [8]:
df_train = pd.read_csv(r"OriginalData\train.csv")
df_train.head()

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0


Sum up all orders for a week:
- I want to have a single entry summing up the orders placed for each week so will sum up orders for the same fuel across different plants

In [9]:
# rename columns for ease
df_train.columns=["Customer", "Area", "City", "Date", "Product", "Plant", "Litres"]

# convert to datetime, subtract 7 days so we can get the total for the following week
df_train['Date'] = pd.to_datetime(df_train['Date']) - pd.to_timedelta(7, unit='d')
# Sort
df_train.sort_values(by=["Date", "Customer", "Product"], inplace=True)

In [10]:
df_train.head()

Unnamed: 0,Customer,Area,City,Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2013-12-25,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2013-12-25,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2013-12-25,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2013-12-25,Motorin,T155,0.0
2283,10048264,Karadeniz Bölgesi,883,2013-12-25,Motorin,T105,0.0


- Below I sum up orders across week, and drop the plant code

In [11]:
df_t = df_train.groupby(['Customer', "Area","City","Product", pd.Grouper(key='Date', freq='W-MON')])['Litres'].sum().reset_index().sort_values('Date')
#sort values to inspect
df_t.sort_values(by=["Date","Customer", "Product"], inplace=True)

In [12]:
df_t.head(5)

Unnamed: 0,Customer,Area,City,Product,Date,Litres
0,10048261,Karadeniz Bölgesi,883,Benzin,2013-12-30,5040.0
256,10048261,Karadeniz Bölgesi,883,Motorin,2013-12-30,28712.0
685,10048263,Karadeniz Bölgesi,966,Motorin,2013-12-30,34082.0
899,10048264,Karadeniz Bölgesi,883,Benzin,2013-12-30,3608.0
1069,10048264,Karadeniz Bölgesi,883,Motorin,2013-12-30,21325.0


In [13]:
# Remove first week because the data for it is incomplete
df_t = df_t[df_t["Date"] != "2013-12-30" ]
# reassign to original df
df_train = df_t.copy()

## Fill in missing weeks in datasets

In [14]:
if fill_missing_weeks == True:
    idx = pd.date_range('2014-01-06', '2018-12-24',freq='W-MON')
    dates = (
        df_train.groupby(["Customer", "Product","City","Area"])["Date"]
        .apply(lambda s: pd.date_range('2014-01-06', '2018-12-24',freq='W-MON'))
        .explode()
        .reset_index()
    )
    df_train = df_train.merge(dates,how="outer").fillna(0)

### Get Test Dataset

For now I will merge the test and training dataset, so I can combine them with the external data. Then I will split and save them. No added features/columns will cause data leakage from the test to the training set

In [18]:
df_test = pd.read_csv(r"OriginalData\test.csv")
df_test["Date"] = pd.to_datetime(df_test['Week Start Date']) 
df_test.drop(labels=["index","Date"],axis=1,inplace=True)
df_test.columns = ["Customer", "Area", "City","Product","Date"]
df_test.head()

Unnamed: 0,Customer,Area,City,Product,Date
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04


In [19]:
df = pd.concat([df_train,df_test])
df.reset_index(inplace=True,drop=True)
df["Date"] = pd.to_datetime(df['Date']) 
df.head()

Unnamed: 0,Customer,Area,City,Product,Date,Litres
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0


## Combine price and training datasets
 - For the week starting on 2014-01-06(Monday), the price data will be the price on 2014-01-05(Sunday)

In [20]:
# Merge datasets
df_merged = df.merge(df_prices,how="left")
# Rename column names for clarity
df_merged.rename(columns={"CRP":"Price(t-1)"}, inplace=True)
df_merged.head()


Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1),Price%Change
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07,-0.005882
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.4,-0.011236
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04,-0.005917
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34,-0.01139
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.4,-0.011236


In [21]:
# Check for nan values
df_merged["Price(t-1)"].isna().any()

True

In [22]:
df_merged["Price(t-1)"].isna().sum()

13966

#### Fill in the Nan values in Price t-1 with the mean of that date

In [23]:
temp = df_merged.groupby(['Date', 'Product'])[['Price(t-1)']].mean().reset_index()
temp.rename(columns={'Price(t-1)': 'Price(t-1)_mean'}, inplace=True)
df_temp = pd.merge(df_merged, temp, how='left', on=['Date', 'Product'])
df_temp.loc[df_temp['Price(t-1)'].isna(),'Price(t-1)'] = df_temp['Price(t-1)_mean']

In [24]:
df_temp.head()

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1),Price%Change,Price(t-1)_mean
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07,-0.005882,5.042535
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.4,-0.011236,4.34786
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04,-0.005917,5.042535
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34,-0.01139,4.34786
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.4,-0.011236,4.34786


In [30]:
# show none values
df_temp[df_temp.isna().any(axis=1)]

Unnamed: 0,Customer,Area,City,Product,Date,Litres,Price(t-1),Price%Change,Price(t-1)_mean
1006,10050046,Karadeniz Bölgesi,947,Benzin,2014-01-06,8340.0,5.039854,,5.039854
1007,10050046,Karadeniz Bölgesi,947,Motorin,2014-01-06,16382.0,4.346342,,4.346342
1453,12122988,Doğu Anadolu Bölgesi,116,Motorin,2014-01-06,23142.0,4.346342,,4.346342
3114,12122988,Doğu Anadolu Bölgesi,116,Benzin,2014-01-13,4836.0,4.469550,,4.469550
3115,12122988,Doğu Anadolu Bölgesi,116,Motorin,2014-01-13,26544.0,3.940996,,3.940996
...,...,...,...,...,...,...,...,...,...
587421,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,,7.140000,0.0,6.963877
587422,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,,7.140000,0.0,6.963877
587423,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,,7.140000,0.0,6.963877
587424,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,,7.140000,0.0,6.965134


There are 4 weeks which don't have any price data at all

In [25]:
df_temp[df_temp['Price(t-1)'].isna()].Date.unique()

array(['2015-01-05T00:00:00.000000000', '2015-01-12T00:00:00.000000000',
       '2015-02-09T00:00:00.000000000', '2015-03-09T00:00:00.000000000'],
      dtype='datetime64[ns]')

Manually impute prices from the preceeding and next week for these 4 weird dates

In [26]:
# Petrol
b1 = df_temp[(df_temp["Date"] == "2014-12-29") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()
b2= df_temp[(df_temp["Date"] == "2015-01-19") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()

b3= df_temp[(df_temp["Date"] == "2015-02-02") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()
b4= df_temp[(df_temp["Date"] == "2015-02-16") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()

b5= df_temp[(df_temp["Date"] == "2015-03-02") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()
b6= df_temp[(df_temp["Date"] == "2015-03-16") & (df_temp["Product"] == "Benzin")]["Price(t-1)_mean"].mean()

print(b1,b2,b3,b4,b5,b6)
# Diesel
d1 = df_temp[(df_temp["Date"] == "2014-12-29") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()
d2= df_temp[(df_temp["Date"] == "2015-01-19") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()

d3 = df_temp[(df_temp["Date"] == "2015-02-02") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()
d4 = df_temp[(df_temp["Date"] == "2015-02-16") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()

d5= df_temp[(df_temp["Date"] == "2015-03-02") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()
d6= df_temp[(df_temp["Date"] == "2015-03-16") & (df_temp["Product"] == "Motorin")]["Price(t-1)_mean"].mean()

print(d1,d2,d3,d4,d5,d6)

4.281553556827374 4.086721177432621 4.281553556827374 4.391218315617234 4.1927064595256756 4.479493049877403
3.793441822619967 3.5537672904801245 3.793441822619967 3.916118795768871 3.7118958502847637 3.857461350691659


In [27]:
df_temp.loc[(df_temp["Date"] == "2015-01-05") & (df_temp["Product"] == "Benzin"), "Price(t-1)"] = (b1 + b2) /2
df_temp.loc[(df_temp["Date"] == "2015-01-12") & (df_temp["Product"] == "Benzin"), "Price(t-1)"] = (b1 + b2) /2

df_temp.loc[(df_temp["Date"] == "2015-02-09") & (df_temp["Product"] == "Benzin"), "Price(t-1)"] = (b3 + b4) /2
df_temp.loc[(df_temp["Date"] == "2015-03-09") & (df_temp["Product"] == "Benzin"), "Price(t-1)"] = (b5 + b6) /2

#diesels
df_temp.loc[(df_temp["Date"] == "2015-01-05") & (df_temp["Product"] == "Motorin"), "Price(t-1)"] = (d1 + d2) /2
df_temp.loc[(df_temp["Date"] == "2015-01-12") & (df_temp["Product"] == "Motorin"), "Price(t-1)"] = (d1 + d2) /2

df_temp.loc[(df_temp["Date"] == "2015-02-09") & (df_temp["Product"] == "Motorin"), "Price(t-1)"] = (d3 + d4) /2
df_temp.loc[(df_temp["Date"] == "2015-03-09") & (df_temp["Product"] == "Motorin"), "Price(t-1)"] = (d5 + d6) /2


### Check how successfull the filling of data was

In [28]:
df_temp.isna().any()

Customer           False
Area               False
City               False
Product            False
Date               False
Litres              True
Price(t-1)         False
Price%Change        True
Price(t-1)_mean     True
dtype: bool

Price(t-1) has no Nans so we have succeeded (we expect litres (label) to be nan for 2019 data)

In [29]:
df = df_temp.copy()
df.drop(labels="Price(t-1)_mean",axis=1,inplace=True)
df["Price%Change"] = df["Price%Change"].fillna(0)

In [31]:
# Rename Column for clarity
df.rename(columns={"Date":"WeekStarting"},inplace=True)

# Add in external data one by one and merge

### Car data

In [32]:
df_cars = pd.read_csv("ExternalDataFormatted/CarRegistrationDataReady.csv")
df_cars.columns = ["WeekStarting", "Area","NewCars"]
df_cars["WeekStarting"]  = pd.to_datetime(df_cars['WeekStarting'])
df_cars.fillna(0)
df_cars.head()

Unnamed: 0,WeekStarting,Area,NewCars
0,2014-01-06,Akdeniz Bölgesi,
1,2014-01-13,Akdeniz Bölgesi,
2,2014-01-20,Akdeniz Bölgesi,
3,2014-01-27,Akdeniz Bölgesi,
4,2014-02-03,Akdeniz Bölgesi,


In [33]:
# Merge
df_merged = df.merge(df_cars,how="left")
# Check one area
df_merged[df_merged["Area"]=="Karadeniz Bölgesi"]

Unnamed: 0,Customer,Area,City,Product,WeekStarting,Litres,Price(t-1),Price%Change,NewCars
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-06,4242.0,5.07,-0.005882,
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-06,29017.0,4.40,-0.011236,
2,10048263,Karadeniz Bölgesi,966,Benzin,2014-01-06,4572.0,5.04,-0.005917,
3,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-06,30681.0,4.34,-0.011390,
4,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-06,25009.0,4.40,-0.011236,
...,...,...,...,...,...,...,...,...,...
749315,12690825,Karadeniz Bölgesi,966,Benzin,2019-12-02,,7.00,0.000000,34697.0
749316,12690825,Karadeniz Bölgesi,966,Benzin,2019-12-09,,7.00,0.000000,34697.0
749317,12690825,Karadeniz Bölgesi,966,Benzin,2019-12-16,,7.00,0.000000,34697.0
749318,12690825,Karadeniz Bölgesi,966,Benzin,2019-12-23,,7.00,0.000000,34697.0


In [34]:
df = df_merged.copy()

### Holiday Data

In [35]:
df_holidays = pd.read_csv("ExternalDataFormatted/HolidayDataReady.csv")
df_holidays.columns = ["WeekStarting", "WeekendHolidays","WeekdayHolidays"]
df_holidays["WeekStarting"]  = pd.to_datetime(df_holidays['WeekStarting'])
df_holidays.head()

Unnamed: 0,WeekStarting,WeekendHolidays,WeekdayHolidays
0,2013-12-30,0,1
1,2014-04-21,0,1
2,2014-04-28,0,1
3,2014-05-19,0,1
4,2014-07-21,1,0


In [36]:
df_merged = df.merge(df_holidays,how="left")

In [37]:
df = df_merged.copy()

### GDP Growth

In [38]:
df_gdp = pd.read_csv("ExternalDataFormatted/GDPDataReady.csv")
df_gdp.columns = ["WeekStarting", "GDPGrowth%"]
df_gdp["WeekStarting"]  = pd.to_datetime(df_gdp['WeekStarting'])
df_gdp.head()

Unnamed: 0,WeekStarting,GDPGrowth%
0,2014-01-06,0.044
1,2014-01-13,0.044
2,2014-01-20,0.044
3,2014-01-27,0.044
4,2014-02-03,0.044


In [39]:
df_merged = df.merge(df_gdp,how="left")

In [40]:
df = df_merged.copy()

## InflationData

In [41]:
df_inflation = pd.read_csv("ExternalDataFormatted/InflationDataReady.csv")
df_inflation.columns = ["WeekStarting", "MoMInflation%"]
df_inflation["WeekStarting"]  = pd.to_datetime(df_inflation['WeekStarting'])
df_inflation.head()

Unnamed: 0,WeekStarting,MoMInflation%
0,2014-01-06,6.91
1,2014-01-13,7.4
2,2014-01-20,7.4
3,2014-01-27,7.4
4,2014-02-03,7.4


In [42]:
df_merged = df.merge(df_inflation,how="left")

In [43]:
df = df_merged.copy()

### USD TRY

In [44]:
df_usd = pd.read_csv("ExternalDataFormatted/USDDataReady.csv")
df_usd.columns = ["WeekStarting", "WoWUSDChange%"]
df_usd["WeekStarting"]  = pd.to_datetime(df_usd['WeekStarting'])
df_usd.head()

Unnamed: 0,WeekStarting,WoWUSDChange%
0,2014-01-06,-0.62%
1,2014-01-13,3.08%
2,2014-01-20,4.47%
3,2014-01-27,"-3,04%"
4,2014-02-03,"-1,80%"


In [45]:
df_merged = df.merge(df_usd,how="left")
df = df_merged.copy()

Fill Nan Values

In [46]:
df.isna().any()

Customer           False
Area               False
City               False
Product            False
WeekStarting       False
Litres              True
Price(t-1)         False
Price%Change       False
NewCars             True
WeekendHolidays     True
WeekdayHolidays     True
GDPGrowth%         False
MoMInflation%      False
WoWUSDChange%      False
dtype: bool

The Nan values in Holiday columns mean the upcoming week has no holidays 

In [47]:
df.WeekendHolidays = df.WeekendHolidays.fillna(int(0))
df.WeekdayHolidays = df.WeekdayHolidays.fillna(int(0))
df.NewCars = df.NewCars.fillna(0)

Change USDTRY WoW%Change data to float by replacing , with . then removing % then casting into float

In [48]:
df["WoWUSDChange%"] = df["WoWUSDChange%"].str.replace(",", ".")
df["WoWUSDChange%"] = df["WoWUSDChange%"].str.rstrip('%').astype('float')
df["WoWUSDChange%"]

0        -0.62
1        -0.62
2        -0.62
3        -0.62
4        -0.62
          ... 
749627    0.52
749628    0.55
749629    2.21
749630    0.23
749631    0.37
Name: WoWUSDChange%, Length: 749632, dtype: float64

Add Season data

In [49]:
df = df.assign(Season=lambda x: x['WeekStarting'].apply(lambda y: 'Winter' if y.month in [12, 1, 2] else 'Summer' if y.month in [
               3, 4, 5] else 'Autumn' if y.month in [6, 7, 8] else 'Fall' if y.month in [9, 10, 11] else 'Unknown'))

Categorize the columns

In [50]:
category_columns = ["Customer", "Area", "Product", "Season"]
df[category_columns] = df[category_columns].astype("category")

Check column types for correctness

In [51]:
df.dtypes

Customer                 category
Area                     category
City                        int64
Product                  category
WeekStarting       datetime64[ns]
Litres                    float64
Price(t-1)                float64
Price%Change              float64
NewCars                   float64
WeekendHolidays           float64
WeekdayHolidays           float64
GDPGrowth%                float64
MoMInflation%             float64
WoWUSDChange%             float64
Season                   category
dtype: object

Check NaN

In [52]:
df.isna().any()

Customer           False
Area               False
City               False
Product            False
WeekStarting       False
Litres              True
Price(t-1)         False
Price%Change       False
NewCars            False
WeekendHolidays    False
WeekdayHolidays    False
GDPGrowth%         False
MoMInflation%      False
WoWUSDChange%      False
Season             False
dtype: bool

### Add time series data as new features
- Order of next week last year (week t-51)
- Order of this week last year (week t-52)
- And so on similarly for t-53,54


Other todo:
- Drop first year data before training
- Drop Customer	and Area (Emin olamadım, bunları da encode ettirmicez mi category olarak)
- Encode petrol/diesel, categorize ettim (Done)
- Add seasonality (Done)
- Change usd% to float (Done)



Function to add week lag

In [54]:
def add_time_lag(week_lag,df):
    df_lag = df[["Customer","Product","WeekStarting", "Litres"]].copy()
    df_lag["WeekStarting"] = df_lag["WeekStarting"] + pd.to_timedelta(week_lag,unit="w")
    df_lag.rename(columns={"Litres":f"Litres(w-{week_lag})"}, inplace=True)
    df = df.merge(df_lag,how="left")
    return df


In [55]:
df = add_time_lag(51,df)
df = add_time_lag(52,df)
df = add_time_lag(53,df)
df = add_time_lag(54,df)

Check which customers are unique to train/test

In [56]:
a = df_test.Customer.unique()
b = df_train.Customer.unique()


only_test = list(set(a) - set(b))
only_train = list(set(b)- set(a))
print(len(only_test), " customers in test set only")
print(len(only_train), " customers in train set only")

36  customers in test set only
229  customers in train set only


Deal with nan values of the new features

In [57]:
df[(df["WeekStarting"] > '2015-01-12') & (df["WeekStarting"] < '2019-01-01') ].isna().sum()

Customer           0
Area               0
City               0
Product            0
WeekStarting       0
Litres             0
Price(t-1)         0
Price%Change       0
NewCars            0
WeekendHolidays    0
WeekdayHolidays    0
GDPGrowth%         0
MoMInflation%      0
WoWUSDChange%      0
Season             0
Litres(w-51)       0
Litres(w-52)       0
Litres(w-53)       0
Litres(w-54)       0
dtype: int64

In [58]:
df[(df["WeekStarting"] >'2019-01-01') ].isna().sum()

Customer                0
Area                    0
City                    0
Product                 0
WeekStarting            0
Litres             108472
Price(t-1)              0
Price%Change            0
NewCars                 0
WeekendHolidays         0
WeekdayHolidays         0
GDPGrowth%              0
MoMInflation%           0
WoWUSDChange%           0
Season                  0
Litres(w-51)         7822
Litres(w-52)         5809
Litres(w-53)         3796
Litres(w-54)         3796
dtype: int64

For the customers that are in the test set only, we can set the w-52,53,54 values to the week average of that city

In [59]:
# Function to add week lag of past 52 weej average for the given city

def add_time_lag_average_city(week_lag,df):
    df_lag = df[["Customer","City","Product","WeekStarting", "Litres"]].copy()
    df_lag["WeekStarting"] = df_lag["WeekStarting"] + pd.to_timedelta(week_lag,unit="w")
    df_lag.rename(columns={"Litres":f"Litres(w-{week_lag}_average_city)"}, inplace=True)
    df_lag = df_lag.groupby(['WeekStarting', 'Product','City'])[[f"Litres(w-{week_lag}_average_city)"]].mean().reset_index()
    df = df.merge(df_lag,how="left")
    return df

In [60]:
df = add_time_lag_average_city(51,df)
df = add_time_lag_average_city(52,df)
df = add_time_lag_average_city(53,df)
df = add_time_lag_average_city(54,df)

In [62]:
# For checking
df[(df["WeekStarting"] > '2015-01-12') & (df["WeekStarting"] < '2019-01-01') ]

Unnamed: 0,Customer,Area,City,Product,WeekStarting,Litres,Price(t-1),Price%Change,NewCars,WeekendHolidays,...,WoWUSDChange%,Season,Litres(w-51),Litres(w-52),Litres(w-53),Litres(w-54),Litres(w-51_average_city),Litres(w-52_average_city),Litres(w-53_average_city),Litres(w-54_average_city)
98068,10048261,Karadeniz Bölgesi,883,Benzin,2015-01-19,9929.0,4.11,0.0,50235.0,0.0,...,1.09,Winter,4853.0,14733.0,13886.0,4242.0,808.833333,4066.833333,3847.333333,707.000000
98069,10048261,Karadeniz Bölgesi,883,Motorin,2015-01-19,29208.0,3.58,0.0,50235.0,0.0,...,1.09,Winter,33879.0,41549.0,55606.0,29017.0,14761.166667,28160.333333,20658.500000,18887.833333
98070,10048263,Karadeniz Bölgesi,966,Benzin,2015-01-19,10848.0,4.08,0.0,50235.0,0.0,...,1.09,Winter,0.0,0.0,0.0,4572.0,3624.947368,3011.473684,3128.105263,3050.894737
98071,10048263,Karadeniz Bölgesi,966,Motorin,2015-01-19,9442.0,3.55,0.0,50235.0,0.0,...,1.09,Winter,0.0,0.0,35583.0,30681.0,23537.210526,25499.526316,25703.894737,20611.210526
98072,10048264,Karadeniz Bölgesi,883,Benzin,2015-01-19,3570.0,4.11,0.0,50235.0,0.0,...,1.09,Winter,0.0,4656.0,0.0,0.0,808.833333,4066.833333,3847.333333,707.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641155,12664221,Marmara Bölgesi,941,Motorin,2018-11-19,0.0,6.39,0.0,525415.0,0.0,...,-0.83,Fall,0.0,0.0,0.0,0.0,67690.000000,67252.437500,63493.250000,72023.729167
641156,12664221,Marmara Bölgesi,941,Motorin,2018-11-26,0.0,6.26,0.0,525415.0,0.0,...,-1.41,Fall,0.0,0.0,0.0,0.0,67346.791667,67690.000000,67252.437500,63493.250000
641157,12664221,Marmara Bölgesi,941,Motorin,2018-12-03,0.0,6.03,0.0,489128.0,0.0,...,1.73,Winter,0.0,0.0,0.0,0.0,67743.666667,67346.791667,67690.000000,67252.437500
641158,12664221,Marmara Bölgesi,941,Motorin,2018-12-10,0.0,5.96,0.0,489128.0,0.0,...,1.14,Winter,0.0,0.0,0.0,0.0,67204.208333,67743.666667,67346.791667,67690.000000


In [63]:
# Function to add week lag of past 52 weej average for the given city

def add_time_lag_average_area(week_lag,df):
    df_lag = df[["Customer","Area","Product","WeekStarting", "Litres"]].copy()
    df_lag["WeekStarting"] = df_lag["WeekStarting"] + pd.to_timedelta(week_lag,unit="w")
    df_lag.rename(columns={"Litres":f"Litres(w-{week_lag}_average_area)"}, inplace=True)
    df_lag = df_lag.groupby(['WeekStarting', 'Product','Area'])[[f"Litres(w-{week_lag}_average_area)"]].mean().reset_index()
    df = df.merge(df_lag,how="left")
    return df

In [64]:
df = add_time_lag_average_area(51,df)
df = add_time_lag_average_area(52,df)
df = add_time_lag_average_area(53,df)
df = add_time_lag_average_area(54,df)

Remove year 2014 and first two weeks of 2015

In [65]:
df = df[df.WeekStarting > "2015-01-12"]

For the customers that are only in the training set, set the w-52, w-53,w-54 data to the average of that date

In [66]:
df.isna().sum()

Customer                          0
Area                              0
City                              0
Product                           0
WeekStarting                      0
Litres                       108472
Price(t-1)                        0
Price%Change                      0
NewCars                           0
WeekendHolidays                   0
WeekdayHolidays                   0
GDPGrowth%                        0
MoMInflation%                     0
WoWUSDChange%                     0
Season                            0
Litres(w-51)                   7822
Litres(w-52)                   5809
Litres(w-53)                   3796
Litres(w-54)                   3796
Litres(w-51_average_city)      4172
Litres(w-52_average_city)      2086
Litres(w-53_average_city)         0
Litres(w-54_average_city)         0
Litres(w-51_average_area)      4172
Litres(w-52_average_area)      2086
Litres(w-53_average_area)         0
Litres(w-54_average_area)         0
dtype: int64

Fill in the w-52 values in year 2019 as w-53 values

Fill in Litres(w-52 and 2-51_average_area) for all customers as   Litres(w-53_average_area) 

In [67]:
df.loc[df["Litres(w-52_average_area)"].isna() ,"Litres(w-52_average_area)"] = df.loc[df["Litres(w-52_average_area)"].isna() ,"Litres(w-53_average_area)"] 
df.loc[df["Litres(w-51_average_area)"].isna() ,"Litres(w-51_average_area)"] = df.loc[df["Litres(w-51_average_area)"].isna() ,"Litres(w-52_average_area)"] 

Fill in average cities for all customers

In [68]:
# fill in w54 average city as average region
df.loc[df["Litres(w-54_average_city)"].isna() ,"Litres(w-54_average_city)"] = df.loc[df["Litres(w-54_average_city)"].isna() ,"Litres(w-54_average_area)"] 

In [69]:
df.loc[df["Litres(w-53_average_city)"].isna() ,"Litres(w-53_average_city)"] = df.loc[df["Litres(w-53_average_city)"].isna() ,"Litres(w-54_average_city)"] 

In [70]:
df.loc[df["Litres(w-52_average_city)"].isna() ,"Litres(w-52_average_city)"] = df.loc[df["Litres(w-52_average_city)"].isna() ,"Litres(w-53_average_city)"] 

In [71]:
df.loc[df["Litres(w-51_average_city)"].isna() ,"Litres(w-51_average_city)"] = df.loc[df["Litres(w-51_average_city)"].isna() ,"Litres(w-52_average_city)"] 

Fill in average cities as average areas if they are still missing

In [72]:
df.isna().sum()

Customer                          0
Area                              0
City                              0
Product                           0
WeekStarting                      0
Litres                       108472
Price(t-1)                        0
Price%Change                      0
NewCars                           0
WeekendHolidays                   0
WeekdayHolidays                   0
GDPGrowth%                        0
MoMInflation%                     0
WoWUSDChange%                     0
Season                            0
Litres(w-51)                   7822
Litres(w-52)                   5809
Litres(w-53)                   3796
Litres(w-54)                   3796
Litres(w-51_average_city)         0
Litres(w-52_average_city)         0
Litres(w-53_average_city)         0
Litres(w-54_average_city)         0
Litres(w-51_average_area)         0
Litres(w-52_average_area)         0
Litres(w-53_average_area)         0
Litres(w-54_average_area)         0
dtype: int64

In [107]:
# fill in w54 average city as average region
#df.loc[df["Litres(w-53_average_city)"].isna() ,"Litres(w-53_average_city)"] = df.loc[df["Litres(w-53_average_city)"].isna() ,"Litres(w-53_average_area)"] 

Fill in w-52,53,54 as city averages

In [73]:
# fill in w51 average city as average region
df.loc[df["Litres(w-51)"].isna() ,"Litres(w-51)"] = df.loc[df["Litres(w-51)"].isna() ,"Litres(w-51_average_city)"] 
# fill in w52 average city as average region
df.loc[df["Litres(w-52)"].isna() ,"Litres(w-52)"] = df.loc[df["Litres(w-52)"].isna() ,"Litres(w-52_average_city)"] 
# fill in w53 average city as average region
df.loc[df["Litres(w-53)"].isna() ,"Litres(w-53)"] = df.loc[df["Litres(w-53)"].isna() ,"Litres(w-53_average_city)"] 
# fill in w54 average city as average region
df.loc[df["Litres(w-54)"].isna() ,"Litres(w-54)"] = df.loc[df["Litres(w-54)"].isna() ,"Litres(w-54_average_city)"] 

In [77]:
df.isna().sum()

Customer                          0
Area                              0
City                              0
Product                           0
WeekStarting                      0
Litres                       108472
Price(t-1)                        0
Price%Change                      0
NewCars                           0
WeekendHolidays                   0
WeekdayHolidays                   0
GDPGrowth%                        0
MoMInflation%                     0
WoWUSDChange%                     0
Season                            0
Litres(w-51)                      0
Litres(w-52)                      0
Litres(w-53)                      0
Litres(w-54)                      0
Litres(w-51_average_city)         0
Litres(w-52_average_city)         0
Litres(w-53_average_city)         0
Litres(w-54_average_city)         0
Litres(w-51_average_area)         0
Litres(w-52_average_area)         0
Litres(w-53_average_area)         0
Litres(w-54_average_area)         0
dtype: int64

Add week of year value

In [78]:
df["WeekNumber"] = df['WeekStarting'].dt.isocalendar().week.astype("category")

In [79]:
df.dtypes

Customer                           category
Area                               category
City                                  int64
Product                            category
WeekStarting                 datetime64[ns]
Litres                              float64
Price(t-1)                          float64
Price%Change                        float64
NewCars                             float64
WeekendHolidays                     float64
WeekdayHolidays                     float64
GDPGrowth%                          float64
MoMInflation%                       float64
WoWUSDChange%                       float64
Season                             category
Litres(w-51)                        float64
Litres(w-52)                        float64
Litres(w-53)                        float64
Litres(w-54)                        float64
Litres(w-51_average_city)           float64
Litres(w-52_average_city)           float64
Litres(w-53_average_city)           float64
Litres(w-54_average_city)       

In [80]:
df.drop(labels="City",axis=1,inplace=True)

In [81]:
df.columns

Index(['Customer', 'Area', 'Product', 'WeekStarting', 'Litres', 'Price(t-1)',
       'Price%Change', 'NewCars', 'WeekendHolidays', 'WeekdayHolidays',
       'GDPGrowth%', 'MoMInflation%', 'WoWUSDChange%', 'Season',
       'Litres(w-51)', 'Litres(w-52)', 'Litres(w-53)', 'Litres(w-54)',
       'Litres(w-51_average_city)', 'Litres(w-52_average_city)',
       'Litres(w-53_average_city)', 'Litres(w-54_average_city)',
       'Litres(w-51_average_area)', 'Litres(w-52_average_area)',
       'Litres(w-53_average_area)', 'Litres(w-54_average_area)', 'WeekNumber'],
      dtype='object')

In [82]:
df.reset_index(inplace=True,drop=True)

Save

In [84]:
df.to_pickle("Training_TestDataNonFinal.pkl")