In [1]:
import datetime as dt
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [2]:
items = pd.read_csv("./items.csv")
holiday_events = pd.read_csv("./holidays_events.csv", parse_dates=['date'])
stores = pd.read_csv("./stores.csv")
oil = pd.read_csv("./oil.csv", parse_dates=['date'])
transactions = pd.read_csv("./transactions.csv", parse_dates=['date'])
train = pd.read_csv("./train.csv", nrows=100000  , parse_dates=['date'])
# train_large = pd.read_csv('./train.csv', skiprows = 115000000, names = train.columns, parse_dates = ['date'])

In [3]:
test = pd.read_csv('./test.csv', parse_dates=['date'])

In [4]:

def merge_with_metadata(source, oil, stores, items, holidays):
    source.onpromotion = source.onpromotion.fillna(False)
    oil.dcoilwtico = oil.dcoilwtico.bfill()
    source = pd.merge(source, oil, how='left', left_on='date', right_on='date')
    source = pd.merge(source, items, how='left', left_on='item_nbr', right_on='item_nbr')
    source = pd.merge(source, holidays, how = 'left', left_on='date', right_on='date')
    source = pd.merge(source, stores, how = 'left', left_on='store_nbr', right_on='store_nbr')
    source['date'] = source['date'].map(dt.datetime.toordinal)
    source.type_x = source.type_x.fillna('Not a Holiday')
    source.locale = source.locale.fillna('NA')
    source.locale_name = source.locale_name.fillna('NA')
    source.transferred = source.transferred.astype(str).fillna('NA')
    source.onpromotion = source.onpromotion.astype(float)
    source.drop(columns = ['id'], inplace=True)
    numeric_columns = source._get_numeric_data().columns
    categorical_columns = np.setdiff1d(source.columns , numeric_columns)

    num_df = source[numeric_columns].drop(columns = ['date'])
    cat_df = source[categorical_columns]
    date_df = source['date']

    return num_df, cat_df, date_df

In [None]:
numerical_values, categorical_values, date_values = merge_with_metadata(train, oil, stores, items, holiday_events)
print(numerical_values.columns)
print(categorical_values.columns)

In [24]:
categorical_values.astype(float)

Unnamed: 0,city_Ambato,city_Babahoyo,city_Cayambe,city_Cuenca,city_Daule,city_El Carmen,city_Esmeraldas,city_Guaranda,city_Guayaquil,city_Ibarra,...,state_Tungurahua,transferred_False,transferred_nan,type_x_Holiday,type_x_Not a Holiday,type_y_A,type_y_B,type_y_C,type_y_D,type_y_E
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
99996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
99997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
99998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
num_df = pd.DataFrame(StandardScaler().fit_transform(numerical_values.values), columns = numerical_values.columns)


In [34]:
train_merged = pd.concat([num_df, categorical_values.astype(float), date_values], axis=1)

In [35]:

categorical_values.head()

Unnamed: 0,city_Ambato,city_Babahoyo,city_Cayambe,city_Cuenca,city_Daule,city_El Carmen,city_Esmeraldas,city_Guaranda,city_Guayaquil,city_Ibarra,...,state_Tungurahua,transferred_False,transferred_nan,type_x_Holiday,type_x_Not a Holiday,type_y_A,type_y_B,type_y_C,type_y_D,type_y_E
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [38]:
train_merged.head()

Unnamed: 0,store_nbr,item_nbr,unit_sales,onpromotion,dcoilwtico,class,perishable,cluster,city_Ambato,city_Babahoyo,...,transferred_False,transferred_nan,type_x_Holiday,type_x_Not a Holiday,type_y_A,type_y_B,type_y_C,type_y_D,type_y_E,date
0,0.08902,-1.681925,-0.173927,0.0,0.888042,0.760631,1.923917,-1.694159,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,734869
1,0.08902,-1.675633,-0.519334,0.0,0.888042,-0.833312,-0.519773,-1.694159,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,734869
2,0.08902,-1.67563,-0.461766,0.0,0.888042,-0.833312,-0.519773,-1.694159,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,734869
3,0.08902,-1.667376,-0.519334,0.0,0.888042,-0.847654,-0.519773,-1.694159,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,734869
4,0.08902,-1.665326,-0.519334,0.0,0.888042,0.695612,1.923917,-1.694159,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,734869


In [39]:
#X_tr = (train_merged[['date', 'onpromotion', 'item_nbr','locale', 'locale_name', 'transferred', 'store_nbr', 'dcoilwtico', 'family', 'class', 'perishable', 'cluster']]).values
X_tr = train_merged.drop(columns=['unit_sales']).values

In [40]:
from sklearn.preprocessing import StandardScaler

# X_tr = StandardScaler().fit_transform(X_tr)

In [41]:
X_tr = np.nan_to_num(X_tr).astype(float)
print(np.isnan(X_tr).any())

False


In [42]:
Y_tr = np.array(train_merged['unit_sales'])

In [43]:
'''validation on training set'''
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(X_tr, Y_tr, test_size=0.2)

In [44]:
print(train_x.shape, test_x.shape)

(80000, 77) (20000, 77)


In [52]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

In [53]:
rf = SVR()

In [51]:
rf.fit(train_x, train_y)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# print(train_x)
prediction = rf.predict(test_x)
#print(train_merged.head())

## Preprocessing Training Data


In [49]:
print(f'rmse = {np.sqrt(mean_squared_error(test_y, prediction))}')
print(f'mean_absolute_error = {mean_absolute_error(test_y, prediction)}')

rmse = 0.7050016817362958
mean_absolute_error = 0.3376136827328741


In [None]:
print(prediction, test_y)

In [None]:
df = pd.DataFrame(np.array([prediction, test_y]).T, columns = ['prediction', 'true_val'] )
df.to_csv('crossval.csv')

In [None]:
test_merged = merge_with_metadata(test, oil, stores, items, holiday_events)
test_merged = encode_data(test_merged, label_encoders, encodable_columns)