<img src="images/post_processor_overview.png">

In [1]:
import numpy as np
import pandas as pd
import json
import math
import warnings
warnings.filterwarnings("ignore")
import os
from os import listdir
from os.path import isfile, join
import calendar; 
import time;

In [2]:
def add_date(df):
    """Add a ``date`` column assembled from the ``year``, ``month`` and ``day`` columns.

    The frame is modified in place (a ``date`` column is added or overwritten)
    and is also returned, so both ``add_date(df)`` and ``df = add_date(df)``
    work. The ``year``, ``month`` and ``day`` columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``year``, ``month`` and ``day`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying a datetime ``date``
        column.
    """
    # pandas can assemble datetimes directly from year/month/day columns in a
    # single vectorised call, so no string formatting or category round-trip
    # is needed. The result is already a datetime column; the former
    # ``astype("datetime64")`` cast (rejected by pandas >= 2.0 because it has
    # no unit) is therefore dropped.
    df["date"] = pd.to_datetime(df[["year", "month", "day"]])
    return df

In [3]:
# Load the test set; the listed tokens (and empty fields) are read as missing values.
test = pd.read_csv(
    "test_data.csv",
    sep=",",
    na_values=["?", ",", "#", "NaN", "unknown", ""],
)

In [4]:
# Attach a datetime "date" column built from the year/month/day columns.
test = add_date(test)

In [5]:
def get_preds_for(pred_file, test_df=None):
    """Load one prediction CSV and keep only the rows that also occur in the test set.

    Parameters
    ----------
    pred_file : str
        Path to a CSV that has at least ``date``, ``city`` and ``medicine``
        columns.
    test_df : pandas.DataFrame, optional
        Frame to join against. It needs ``date`` (datetime), ``city`` and
        ``medicine`` columns. When omitted, the notebook-level ``test`` frame
        is used, which is what this function always did before the parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``test_df`` and the predictions on
        ``(date, city, medicine)``.
    """
    if test_df is None:
        # Fall back to the notebook global so existing one-argument calls keep working.
        test_df = test

    preds = pd.read_csv(pred_file)
    # Dates come back from CSV as strings; parse them so they compare equal
    # to the datetime ``date`` column of the test frame.
    preds["date"] = pd.to_datetime(preds["date"])
    return pd.merge(test_df, preds, how="inner", on=["date", "city", "medicine"])

def get_predictions(files):
    """Load every prediction file and stack the results into one frame.

    Parameters
    ----------
    files : iterable of str
        Paths handed one by one to ``get_preds_for``.

    Returns
    -------
    pandas.DataFrame
        All per-file frames concatenated with a fresh 0..n-1 index, or an
        empty ``DataFrame`` when ``files`` is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # re-copies everything accumulated so far on every iteration.
    frames = [get_preds_for(f) for f in files]
    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def get_files(files_dir, files_prefix):
    """List the regular files in ``files_dir`` whose name contains ``files_prefix``.

    Returns a list of paths (``files_dir`` joined with each matching name), in
    the order ``listdir`` yields them. Sub-directories are skipped.
    """
    # NOTE(review): despite the parameter name this is a substring test, not a
    # startswith test, so the text may appear anywhere in the file name.
    matches = []
    for entry in listdir(files_dir):
        path = join(files_dir, entry)
        if files_prefix in entry and isfile(path):
            matches.append(path)
    return matches


def get_ts():
    """Return the current UTC time as a whole-second Unix timestamp, as a string."""
    utc_now = time.gmtime()
    epoch_seconds = calendar.timegm(utc_now)
    return str(epoch_seconds)


In [18]:
def get_common_predictions_for(cities=()):
    """Merge the per-city prediction files with the test set and dump the result.

    For each city ``c`` the files in ``prophet_<c>/pred/`` whose name contains
    ``pred_<c>_`` are loaded through ``get_predictions``. The combined rows
    are sorted by ``id``, re-indexed and written to
    ``common_pred_<timestamp>.csv`` in the working directory.

    Parameters
    ----------
    cities : iterable, optional
        City identifiers; each one is converted with ``str`` to build the
        directory name and the file-name fragment.

    Returns
    -------
    pandas.DataFrame
        The combined predictions. When there is nothing to merge (no cities,
        or no prediction rows for any of them) an empty frame is returned and
        no CSV is written.
    """
    per_city = []
    for c in cities:
        file_dir = "prophet_" + str(c) + "/pred/"
        file_prefix = "pred_" + str(c) + "_"
        files = get_files(file_dir, file_prefix)
        per_city.append(get_predictions(files))
        print(f'city={c} merging done.')

    # One concat after the loop instead of re-copying the accumulated frame per city.
    pred_final = pd.concat(per_city) if per_city else pd.DataFrame()
    if pred_final.empty:
        # Without rows there is no "id" column to sort on, so stop here
        # rather than failing with a KeyError further down.
        print('no predictions found for the requested cities; nothing dumped.')
        return pd.DataFrame()

    # drop=True discards the old index directly instead of materialising it as
    # an "index" column and dropping that by name afterwards.
    pred_final = pred_final.sort_values("id").reset_index(drop=True)
    csv_dump = "common_pred_" + get_ts() + ".csv"
    print(f'dumping {csv_dump} with predictions of all city medicine combinations common to test and train')
    pred_final.to_csv(csv_dump, index=False)
    return pred_final

In [None]:
# Merge the prediction files of cities 1-10 with the test set; the call also
# writes the combined frame to a timestamped common_pred_*.csv file.
pred_final = get_common_predictions_for(cities = [1,2, 3, 4, 5, 6, 7, 8, 9, 10])

city=1 merging done.
city=2 merging done.
city=3 merging done.
city=4 merging done.
city=5 merging done.
city=6 merging done.
city=7 merging done.
city=8 merging done.
city=9 merging done.


<img src="images/prediction_combiner.png">

In [None]:
def get_complete_predictioins(test_data_csv, common_preds_csv,
                              new_city_meds_csv="new_city_meds6.csv",
                              full_pred_csv="full_pred6.csv"):
    """Fill in predictions for test rows that have none and write the submission.

    Test rows without a matching prediction receive the mean predicted
    ``sales`` of their ``(city, date)`` group. The function name keeps its
    historical spelling so existing callers continue to work.

    Parameters
    ----------
    test_data_csv : str
        Test-set CSV with ``id``, ``year``, ``month``, ``day``, ``city`` and
        ``medicine`` columns.
    common_preds_csv : str
        CSV of existing predictions; it is joined to the test set on ``id``,
        ``year``, ``month``, ``day``, ``date``, ``city`` and ``medicine`` and
        must provide a ``sales`` column.
    new_city_meds_csv : str, optional
        Where the test rows lacking a prediction are written.
    full_pred_csv : str, optional
        Where the completed prediction frame is written.

    Returns
    -------
    pandas.DataFrame
        The completed predictions sorted by ``id``. A two-column
        (``id``, ``sales``) copy is also written to
        ``sample_submission_<timestamp>.csv``.
    """
    na_tokens = ["?", ",", "#", "NaN", "unknown", ""]
    test_df = pd.read_csv(test_data_csv, sep=",", na_values=na_tokens)
    test_df = add_date(test_df)
    pred_final = pd.read_csv(common_preds_csv, sep=",", na_values=na_tokens)
    pred_final["date"] = pd.to_datetime(pred_final["date"])

    # Left join: every test row is kept; rows without a prediction get NaN sales.
    full_test = pd.merge(
        test_df, pred_final, how="left",
        on=["id", "year", "month", "day", "date", "city", "medicine"],
    )

    missing = full_test["sales"].isnull()
    new_city_meds = full_test[missing]
    new_city_meds.to_csv(new_city_meds_csv, index=False)

    # The mean skips NaN, so each (city, date) average is based only on the
    # rows that do have a prediction.
    # NOTE(review): a (city, date) group with no predictions at all still
    # yields NaN here, and that NaN reaches the submission file.
    avg_sales = full_test.groupby(["city", "date"]).agg({"sales": "mean"}).reset_index()
    new_city_meds = new_city_meds.drop("sales", axis=1)
    new_city_meds = pd.merge(new_city_meds, avg_sales, how="left", on=["date", "city"])

    old_city_meds = full_test[~missing]
    last = pd.concat([old_city_meds, new_city_meds])
    last = last.sort_values("id").reset_index(drop=True)
    last.to_csv(full_pred_csv, index=False)

    sample_submission6 = last[["id", "sales"]]
    dumpfile = "sample_submission_" + get_ts() + ".csv"
    sample_submission6.to_csv(dumpfile, index=False)
    return last

In [None]:
# NOTE(review): the common-predictions file name carries the timestamp of one
# particular earlier run; point it at the common_pred_*.csv written by
# get_common_predictions_for when re-running the notebook.
get_complete_predictioins("test_data.csv", "common_pred_1608299360.csv")

<img src="images/prediction_normaliser.png">

In [166]:
def normalise_predictions(salse_agg_csv, full_pred_csv):
    """Rescale row-level predictions so each (date, city) group sums to a target total.

    Parameters
    ----------
    salse_agg_csv : str
        CSV holding the target totals in a ``sales`` column. When it also has
        ``date`` and ``city`` columns the totals are matched to groups by
        those keys; otherwise they are matched by row position against the
        groups sorted by ``(date, city)``, as before.
    full_pred_csv : str
        CSV of row-level predictions with ``id``, ``date``, ``city`` and
        ``sales`` columns.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``id`` and the rescaled ``sales``. The same frame is
        written to ``sample_submission4.csv``; the full frame, including the
        ``normaliser`` and ``sales_normalised`` columns, is written to
        ``full_pred_normalised.csv``.

    Raises
    ------
    pandas.errors.MergeError
        If the aggregate file lists the same (date, city) more than once.
    """
    keys = ["date", "city"]
    sales_agg = pd.read_csv(salse_agg_csv)
    full_pred = pd.read_csv(full_pred_csv)

    group_totals = full_pred.groupby(keys)["sales"].sum().reset_index()

    if set(keys).issubset(sales_agg.columns):
        # Match targets to groups by key instead of by row position, so the
        # row order and row count of the aggregate file no longer matter.
        # Dates are compared as parsed datetimes so that differing text
        # formats in the two files still line up.
        targets = (
            sales_agg[keys + ["sales"]]
            .rename(columns={"sales": "pred_agg"})
            .assign(_date_key=lambda t: pd.to_datetime(t["date"]))
            .drop(columns="date")
        )
        group_totals = (
            group_totals
            .assign(_date_key=pd.to_datetime(group_totals["date"]))
            .merge(targets, how="left", on=["_date_key", "city"], validate="many_to_one")
            .drop(columns="_date_key")
        )
    else:
        # No keys to join on: keep the positional assignment.
        group_totals["pred_agg"] = sales_agg["sales"]

    group_totals["normaliser"] = group_totals["pred_agg"] / group_totals["sales"]

    full_pred_2 = pd.merge(full_pred, group_totals[["date", "city", "normaliser"]],
                           how="left", on=keys)
    # A zero group total gives an infinite (or 0/0) factor and a missing
    # target gives NaN; use a factor of 1 there so the prediction is left
    # unscaled instead of becoming NaN.
    factor = full_pred_2["normaliser"].replace([float("inf"), float("-inf")], float("nan"))
    full_pred_2["normaliser"] = factor.fillna(1.0)
    full_pred_2["sales_normalised"] = full_pred_2["sales"] * full_pred_2["normaliser"]
    full_pred_2.to_csv("full_pred_normalised.csv", index=False)

    sample_submission4 = full_pred_2[["id", "sales_normalised"]]
    sample_submission4 = sample_submission4.rename({"sales_normalised": "sales"}, axis='columns')
    sample_submission4.to_csv("sample_submission4.csv", index=False)
    return sample_submission4

In [168]:
# Rescale the predictions in full_pred6.csv (the default output of
# get_complete_predictioins) using the figures in the xgboost predictions file.
# NOTE(review): presumably that file holds one aggregate per (date, city) - confirm.
normalise_predictions("xgboost_predicitons.csv", "full_pred6.csv")