# Mini project ML

In [None]:
import pandas as pd
import numpy as np

## Loading Data

In [None]:
# Read the four input CSVs from the local data/ folder, in this order:
# daily sales, item categories, the test set and the item table.
sales, categories, test_set, items = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/sales_train.csv",
        "data/item_categories.csv",
        "data/test.csv",
        "data/items.csv",
    )
)

(Optional: load the prepared feature file from Google Drive when running in Google Colab)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# path = "G:\My Drive\AW_Academy\mini_project_week6\github repo\df_feat/"
# df=pd.read_csv(f'{path}df_feat.csv')

## Cleaning

In [None]:
# Cleaning of the daily sales rows: drop invalid values and outliers, keep
# only shops that also occur in the test set, then append the test rows (as
# date_block_num 34) so every later feature step runs on one single frame.
sales_shape_pre = sales.shape

# Removing unnecessary values (negative counts, non-positive prices)
sales = sales.query("item_cnt_day >= 0 and item_price > 0")

# Removing outliers: keep only rows strictly below the 99th percentile
sales = sales[sales["item_cnt_day"] < sales["item_cnt_day"].quantile(0.99)]
sales = sales[sales["item_price"] < sales["item_price"].quantile(0.99)]


# Removing rows with shop id not in test set.
# `shops` must be defined here: the query below references it via `@shops`
# and fails with an undefined-variable error otherwise.
sales_shops = sales["shop_id"].unique()
test_shops = test_set["shop_id"].unique()
shops = np.intersect1d(sales_shops, test_shops)

sales = sales.query("shop_id in @shops")

sales_shape_post = sales.shape


# Resetting index after removing rows
sales = sales.reset_index(drop=True)

# Converting date column to datetime. This is done before the test rows are
# appended: they carry no date, and filling that gap with 0 would put a
# non-date value into the column that is parsed here.
sales["date"] = pd.to_datetime(sales["date"], dayfirst=True)

# Adding test-data to main dataframe for easy transformation
test_set["date_block_num"] = 34
test_rows = test_set.drop(columns=["ID"])
# Give the test rows the first day of the month after the last training date
# so that date-derived features (e.g. month) are meaningful for them.
# NOTE(review): assumes block 34 is the month directly following the last
# training date - confirm against the data description.
test_rows["date"] = sales["date"].max() + pd.offsets.MonthBegin(1)

# ignore_index continues the row numbering after the training rows, which
# replaces the previous hard-coded index offset.
sales = pd.concat([sales, test_rows], ignore_index=True)

# Only the count and the price are unknown for the test rows; fill just
# those two columns so the datetime column is left untouched.
unknown_for_test = ["item_cnt_day", "item_price"]
sales[unknown_for_test] = sales[unknown_for_test].fillna(0)

print(sales_shape_pre)
print(sales_shape_post)
print(f"-{sales_shape_pre[0] - sales_shape_post[0]} rows")

## Features

In [None]:
# Calendar month (1-12) taken from the parsed date column
sales = sales.assign(month=sales["date"].dt.month)

In [None]:
# Collapse the daily rows into one row per (month block, shop, item), which
# is the level the prediction is made at. Counts are summed into
# item_cnt_month, prices averaged, and the month number carried over.
# The date column is not aggregated and therefore disappears here.
sales = sales.groupby(
    ["date_block_num", "shop_id", "item_id"], as_index=False
).agg(
    item_cnt_month=("item_cnt_day", "sum"),
    item_price=("item_price", "mean"),
    month=("month", "min"),
)

In [None]:
# Lag feature: each shop's total item count in the previous month block
prev_shop_totals = (
    sales.groupby(["date_block_num", "shop_id"], as_index=False)["item_cnt_month"]
    .sum()
    .rename(columns={"item_cnt_month": "shop_tot_month"})
)

# Moving every total one block forward makes the merge below attach the
# total of block m-1 to the rows of block m.
prev_shop_totals["date_block_num"] = prev_shop_totals["date_block_num"] + 1

sales = pd.merge(
    sales, prev_shop_totals, how="left", on=["date_block_num", "shop_id"]
)

# The helper frame is no longer needed
del prev_shop_totals

In [None]:
def add_cnt_offset(n_steps, df=None):
    """Return a copy of a frame with a lagged `item_cnt_month` column added.

    The new column `item_cnt_month_offset_{n_steps}` holds, for every
    (date_block_num, item_id, shop_id) row, the `item_cnt_month` of the same
    item and shop `n_steps` blocks earlier. Rows without such an earlier row
    get NaN.

    Parameters
    ----------
    n_steps : int
        Number of blocks to look back. Should be at least 1: with 0 the new
        column would simply duplicate the target.
    df : pandas.DataFrame, optional
        Frame to extend. It needs the columns `date_block_num`, `item_id`,
        `shop_id` and `item_cnt_month`. Defaults to the module-level `sales`
        frame, which keeps existing `add_cnt_offset(n)` calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    if df is None:
        df = sales

    keys = ["date_block_num", "item_id", "shop_id"]

    lagged = df[keys + ["item_cnt_month"]].copy()

    # Moving the block number forward by n_steps makes the merge below pick
    # up the value from n_steps blocks back.
    lagged["date_block_num"] += n_steps

    lagged = lagged.rename(
        columns={"item_cnt_month": f"item_cnt_month_offset_{n_steps}"}
    )

    return df.merge(lagged, how="left", on=keys)

# Offsets that gave the highest feature importance.
# An offset of 12 is intentionally left out.
for lag in (1, 2, 3, 4):
    sales = add_cnt_offset(lag)

In [None]:
# Bring item_category_id into the main frame. The item name is dropped
# first so that it is not merged in as well.
items = items.drop(columns=["item_name"])
sales = pd.merge(sales, items, how="left", on="item_id")

In [None]:
# Lag feature: each category's total item count in the previous month block
prev_category_totals = (
    sales.groupby(["date_block_num", "item_category_id"], as_index=False)[
        "item_cnt_month"
    ]
    .sum()
    .rename(columns={"item_cnt_month": "category_tot_count"})
)

# Shift one block forward so block m receives the total of block m-1
prev_category_totals["date_block_num"] = prev_category_totals["date_block_num"] + 1

sales = pd.merge(
    sales,
    prev_category_totals,
    how="left",
    on=["date_block_num", "item_category_id"],
)

# The helper frame is no longer needed
del prev_category_totals

In [None]:
# Earliest date_block_num in which each item occurs in `sales`.
# Open question from the authors: would the day of the year of the first
# sale work better as a feature?
first_seen = (
    sales.groupby("item_id")["date_block_num"]
    .min()
    .rename("first_sale_date_block")
    .reset_index()
)

sales = pd.merge(sales, first_seen, how="left", on="item_id")

# The helper frame is no longer needed
del first_seen

In [None]:
# Sin transform of date_block_num (did not improve performance of model)
# sales["date_block_num"] = np.sin(sales["date_block_num"])
# sales["date_block_num"].sample(10)

### Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import OPTICS

In [None]:
# Input for the shop clustering: total item_cnt_month per (month block, shop)
cluster_df = sales.groupby(["date_block_num", "shop_id"], as_index=False)["item_cnt_month"].sum()

In [None]:
# Cluster the (month block, shop, total count) rows.
# KMeans starts from randomly chosen centroids, so without a seed the labels
# change between runs; the seed is the same one the XGBRegressor cell uses.
kmeans = KMeans(random_state=420)

# Name the input columns explicitly so that re-running this cell does not
# feed the previously assigned cluster label back in as a feature.
cluster_features = ["date_block_num", "shop_id", "item_cnt_month"]
cluster_df["shop_cnt_cluster"] = kmeans.fit_predict(cluster_df[cluster_features])

In [None]:
# Attach the shop cluster as a *previous month* feature.
# The cluster of a block is computed from that block's item_cnt_month totals,
# i.e. from the prediction target itself (and from all-zero placeholders for
# the appended test block). Merging it onto the same block would leak the
# target, so - like shop_tot_month - it is shifted one block forward first.
shop_clusters = cluster_df[["shop_id", "shop_cnt_cluster", "date_block_num"]].copy()
shop_clusters["date_block_num"] += 1

sales = sales.merge(shop_clusters, how="left", on=["date_block_num", "shop_id"])

# Rows without a previous block get -1 so that the later fillna(0) cannot
# mistake them for members of KMeans cluster 0.
sales["shop_cnt_cluster"] = sales["shop_cnt_cluster"].fillna(-1).astype(int)

In [None]:
# Cluster the (category, month block, total count) rows with OPTICS and
# attach the label to `sales` as a previous-month feature.
optics = OPTICS()
cluster_df = sales[["item_category_id", "item_cnt_month", "date_block_num"]].groupby(by=["item_category_id", "date_block_num"], as_index=False).sum()
cluster_df["item_category_cluster"] = optics.fit_predict(cluster_df)

# The label of a block is derived from that block's item_cnt_month totals
# (the prediction target, all zeros for the appended test block), so it is
# shifted one block forward before merging - the same treatment
# category_tot_count gets - to avoid leaking the target.
category_clusters = cluster_df[["item_category_id", "item_category_cluster", "date_block_num"]].copy()
category_clusters["date_block_num"] += 1

sales = sales.merge(category_clusters, how="left", on=["date_block_num", "item_category_id"])

# Rows without a previous block get -1, the label OPTICS itself gives to
# samples it does not assign to any cluster, instead of being turned into
# cluster 0 by the later fillna(0).
sales["item_category_cluster"] = sales["item_category_cluster"].fillna(-1).astype(int)

In [None]:
# The price column is not needed from here on
sales = sales.drop(columns=["item_price"])

# Replace every remaining NaN with 0
sales = sales.fillna(0)

# Show ten random rows as a quick sanity check
sales.sample(10)

## Train/Test data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Time-based split: every block before `test_month` is used for training,
# block `test_month` itself for testing. Change the value to evaluate on a
# different month.
test_month = 33

train = sales[sales["date_block_num"] < test_month]
test = sales[sales["date_block_num"] == test_month]


# item_cnt_month is the prediction target; all other columns are features
x_train, y_train = train.drop(columns=["item_cnt_month"]), train["item_cnt_month"]
x_test, y_test = test.drop(columns=["item_cnt_month"]), test["item_cnt_month"]



# Random, test/train split
## y = sales["item_cnt_month"]
## x = sales.drop(columns=["item_cnt_month"])

# #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=420)

## Modelling
Testing with both `LinearRegression` and `XGBRegressor`.

In [None]:
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Defining util functions
import json
import os
import time

# Make sure the output folders used by the save_* helpers exist.
# makedirs(exist_ok=True) creates the folder only when it is missing, without
# the separate "list the directory, then create" step.
for output_dir in ("models", "predictions"):
    os.makedirs(output_dir, exist_ok=True)

# Short alias used by score_model
mse = mean_squared_error
def clp(x, lower=0, upper=20):
    """Clip values to the closed interval [lower, upper].

    Parameters
    ----------
    x : array-like or scalar
        Values to clip; passed straight to `numpy.clip`.
    lower, upper : number, optional
        Interval bounds. The defaults (0 and 20) are the range used for all
        scores and predictions in this notebook.

    Returns
    -------
    Whatever `numpy.clip` returns for `x`, with every value below `lower`
    replaced by `lower` and every value above `upper` replaced by `upper`.
    """
    return np.clip(x, lower, upper)

def score_model(model=None, metric=False):
    """Print the RMSE on the train and test split, after clipping.

    Both the true values and the predictions are passed through `clp`
    before the error is computed.

    Parameters
    ----------
    model : fitted estimator or None
        Object with a `predict` method. When `None`, a naive baseline is
        scored instead: last month's count (`item_cnt_month_offset_1`) is
        used as the prediction.
    metric : bool
        When true, the two scores are returned in addition to being printed.

    Returns
    -------
    tuple of (train_score, test_score) when `metric` is true, else `None`.
    """
    # `is None` rather than truthiness: some estimators define their own
    # length/boolean value and could be taken for "no model".
    if model is None:
        train_pred = x_train["item_cnt_month_offset_1"]
        test_pred = x_test["item_cnt_month_offset_1"]
    else:
        train_pred = model.predict(x_train)
        test_pred = model.predict(x_test)

    # The root is taken explicitly instead of passing `squared=False`, an
    # argument that newer scikit-learn releases no longer accept.
    train_score = np.sqrt(mse(clp(y_train), clp(train_pred)))
    test_score = np.sqrt(mse(clp(y_test), clp(test_pred)))

    print("train: ", train_score)
    print("test: ", test_score)

    if metric:
        return train_score, test_score
    return None


def save_final_pred(model):
    """Predict `test_set` (date_block_num 34) and write the result as CSV.

    The file is written to `predictions/final_pred_<ModelClass>.csv` with
    the columns `ID` and `item_cnt_month`; predictions are clipped with
    `clp`.

    Parameters
    ----------
    model : fitted estimator
        Object with a `predict` method accepting the feature columns of
        `sales`.

    Raises
    ------
    Exception
        If the number of predictions is not 214200, or if a row of
        `test_set` has no matching prediction.
    """
    test = sales.query("date_block_num == 34")

    x_test = test.drop(columns=["item_cnt_month"])

    pred = clp(model.predict(x_test))

    if pred.shape[0] != 214200:
        raise Exception(f"Prediction must be 214200 rows, is: {pred.shape[0]} rows")

    # The monthly groupby leaves `sales` ordered by (shop_id, item_id), which
    # need not be the row order of `test_set`. Pair every prediction with its
    # keys and look it up per test row, rather than lining the two up by
    # position, so that each ID receives its own prediction.
    keyed_pred = x_test[["shop_id", "item_id"]].assign(item_cnt_month=pred)
    df = test_set[["ID", "shop_id", "item_id"]].merge(
        keyed_pred, how="left", on=["shop_id", "item_id"]
    )

    if df["item_cnt_month"].isna().any():
        raise Exception("Some test_set rows have no matching prediction")

    df = df[["ID", "item_cnt_month"]]

    df.to_csv(f"predictions/final_pred_{model.__class__.__name__}.csv", index=False)

def _json_fallback(obj):
    """Convert a value `json` cannot encode itself into something it can."""
    if isinstance(obj, np.generic):
        # NumPy scalar -> equivalent Python scalar
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Anything else (e.g. a callable parameter) is stored as its repr
    return repr(obj)


def save_model_metrics(model):
    """Append the model's parameters, scores and feature list to its log.

    The log lives in `models/<ModelClass>.json` and maps the Unix timestamp
    of each run to a dict with the keys `scores`, `params` and `features`.
    Scores come from `score_model`, which also prints them.

    Parameters
    ----------
    model : fitted estimator
        Object providing `get_params()` and `predict()`.
    """
    # The folder may be missing when this is called on its own
    os.makedirs("models", exist_ok=True)

    path = f"models/{model.__class__.__name__}.json"

    # A missing file simply means there is no history for this model yet
    try:
        with open(path, "r") as file:
            model_metrics = json.load(file)
    except FileNotFoundError:
        model_metrics = {}

    model_params = model.get_params()

    train_score, test_score = score_model(model, metric=True)

    # JSON object keys are strings; using a string here keeps new entries
    # consistent with the ones loaded from the file.
    model_metrics[str(int(time.time()))] = {
        "scores": {"train": train_score, "test": test_score},
        "params": model_params,
        "features": list(x_train.columns),
    }

    # Serialize before the file is opened for writing: opening with "w"
    # truncates it, so an encoding error raised afterwards would wipe the
    # existing history.
    payload = json.dumps(model_metrics, default=_json_fallback)

    with open(path, "w") as file:
        file.write(payload)

def save_results(model):
    """Write both the final prediction file and the metrics log for `model`.

    Runs, in this order:
    - `save_final_pred()`
    - `save_model_metrics()`
    """
    for save_step in (save_final_pred, save_model_metrics):
        save_step(model)

### LinearRegression

In [None]:
# Linear baseline on all features; fit() hands back the fitted estimator
lr = LinearRegression().fit(x_train, y_train)

# Print clipped train/test RMSE
score_model(lr)

# Uncomment to automatically save results of each run
# save_results(lr)

### KNeighborsRegressor

In [None]:
# knr = KNeighborsRegressor(n_neighbors=15)
# knr.fit(x_train, y_train)

# score_model(knr)

# Uncomment to automatically save results/metrics of each run
## save_results(knr)

### XGBRegressor

In [None]:
# Gradient-boosted trees. RMSE is tracked on both evaluation sets while
# fitting, with early stopping after 8 rounds.
xgb_params = {
    "max_depth": 8,
    "n_estimators": 100,
    "early_stopping_rounds": 8,
    "eval_metric": "rmse",
    "gamma": 0.1,
    "random_state": 420,
}
xgbr = XGBRegressor(**xgb_params)

# First entry is the training split, second the held-out month
eval_sets = [(x_train, y_train), (x_test, y_test)]
xgbr.fit(x_train, y_train, eval_set=eval_sets, verbose=True)


# Uncomment to automatically save results of each run
# save_results(xgbr)

score_model(xgbr)

# Feature name -> importance, as reported by the fitted XGBRegressor
feature_importances = dict(zip(xgbr.feature_names_in_, xgbr.feature_importances_))


In [None]:
import matplotlib.pyplot as plt
# Per-round RMSE recorded during fitting: validation_0 is the first eval_set
# entry (training split), validation_1 the second (test split).
train_score, test_score = (
    np.array(xgbr.evals_result_[eval_name]["rmse"])
    for eval_name in ("validation_0", "validation_1")
)

In [None]:
# Horizontal bar chart of the feature importances on a logarithmic axis
feature_names = list(feature_importances)
plt.barh(feature_names, feature_importances.values(), log=True)
plt.grid()
plt.show()

In [None]:
# RMSE per boosting round for the test and the train split
for curve_label, curve in (("test_rmse", test_score), ("train_rmse", train_score)):
    plt.plot(np.arange(curve.shape[0]), curve, label=curve_label)
plt.legend()
plt.grid()
plt.show()

## Export data

In [None]:
# compression_opts = dict(method='zip',
#                         archive_name='df_feat.csv')  

# sales.to_csv('df_feat.zip', index=False,
#           compression=compression_opts)  

In [None]:
# x_test["pred_cnt_month"] = lr.predict(x_test.drop(columns=["item_cnt_actual", "pred_cnt_month"]))
# x_test["item_cnt_actual"] = y_test
# x_test.to_csv("data_XGBRegressor")

In [None]:
# from xgboost.plotting import plot_importance, plot_tree

In [None]:
# fig, ax = plt.subplots(figsize=(400, 400))
# plot_tree(xgbr, ax = ax)