# Machine Learning

In [4]:
from sklearn.metrics import mean_squared_error

## Load data

In [82]:
import pandas as pd

data = pd.read_csv('../data_with_temp_processed.csv')

In [83]:
data.describe()

Unnamed: 0.1,Unnamed: 0,location,product,sa_quantity,temp_mean,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year
count,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0
mean,4941555.0,2272.938,185.0,8.502425,8.665875,11.73502,5.596728,362.9658,2.990399,0.2463504,3.009124,159.7263
std,2922881.0,2071.264,106.5207,12.94981,5.113955,5.717526,4.884786,181.2255,2.053613,0.430885,2.002259,106.6001
min,0.0,55.0,1.0,0.0,-6.545,-2.93,-10.91,0.0,0.0,0.0,0.0,0.0
25%,2426544.0,420.0,93.0,2.0,4.96,7.82,1.97,225.0,1.736562,0.0,1.0,72.75
50%,4853088.0,2058.0,185.0,5.0,8.51,11.42,5.56,338.0,2.490909,0.0,3.0,144.0
75%,7330184.0,3133.75,277.0,10.0,12.47,15.98,9.23,488.0,3.49,0.0,5.0,240.25
max,10110600.0,9137.0,369.0,819.0,25.63,30.28,20.98,1022.0,56.1,1.0,6.0,382.0


In [84]:
data = data.drop(columns=['Unnamed: 0'])

In [55]:
sample = data.sample(n=500000, random_state=91)
sample = sample.drop(columns=['temp_mean'])

In [56]:
sample.describe()

Unnamed: 0,location,product,sa_quantity,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,2272.482596,184.912128,8.496744,11.742038,5.59603,363.505322,2.989823,0.246172,3.007984,159.702884
std,2070.387745,106.367515,12.952539,5.720407,4.883897,181.290533,2.052074,0.43078,2.000224,106.520245
min,55.0,1.0,0.0,-2.93,-10.91,0.0,0.0,0.0,0.0,0.0
25%,369.0,93.0,2.0,7.81,1.97,225.0,1.733333,0.0,1.0,73.0
50%,2011.0,185.0,5.0,11.43,5.56,339.0,2.490625,0.0,3.0,144.0
75%,3175.0,277.0,10.0,15.98,9.24,489.0,3.490164,0.0,5.0,240.0
max,9137.0,369.0,457.0,30.28,20.98,1022.0,19.0,1.0,6.0,382.0


In [7]:
sample = sample[sample['sa_quantity'] <= 60]
sample.describe()

Unnamed: 0,location,product,sa_quantity,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year
count,495053.0,495053.0,495053.0,495053.0,495053.0,495053.0,495053.0,495053.0,495053.0,495053.0
mean,2276.500981,185.366516,7.647696,11.743395,5.596372,363.436121,2.995834,0.246596,3.002394,159.63801
std,2070.546384,106.360349,9.003645,5.722752,4.885619,181.276079,2.055851,0.43103,2.003288,106.501117
min,55.0,1.0,0.0,-2.93,-10.91,0.0,0.0,0.0,0.0,0.0
25%,437.0,93.0,2.0,7.82,1.97,225.0,1.75,0.0,1.0,73.0
50%,2105.0,186.0,5.0,11.43,5.56,339.0,2.491176,0.0,3.0,144.0
75%,3175.0,277.0,10.0,15.99,9.25,489.0,3.490909,0.0,5.0,240.0
max,9137.0,369.0,60.0,30.28,20.98,1022.0,19.0,1.0,6.0,382.0



## Train/Test split

In [8]:
from sklearn.model_selection import train_test_split

X = sample.drop(columns=['sa_quantity'])
y = sample['sa_quantity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=91)

## Scale data

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
from joblib import dump

dump(scaler, 'scaler.joblib')

['scaler.joblib']

## Reduce dimensions (not useful)

In [24]:
from sklearn.decomposition import FactorAnalysis

fa_transformer = FactorAnalysis(n_components=8, random_state=91)

fa_transformer.fit(X_train)
X_fa_train = fa_transformer.transform(X_train)
print(X_train.shape)
print(X_fa_train.shape)

(80000, 59)
(80000, 8)


## Spectral Embedding (not useful)

In [None]:
from sklearn.manifold import SpectralEmbedding

embedding = SpectralEmbedding(n_components=7)
embedding.fit(X_train)
X_e_train = embedding.transform(X_train)
print(X_train.shape)
print(X_e_train.shape)

## PCA

In [30]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9)
pca.fit(X_train)
print(X_train.shape)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
print(X_train.shape)

(80000, 59)
(80000, 49)


## PCA transpose (not useful)

In [23]:
tpca = PCA(n_components=10)
X_t_train = X_train.T
tpca.fit(X_t_train)
X_tpca_train = tpca.transform(X_t_train)
print(X_t_train.shape)
print(X_tpca_train.shape)

(10, 4853088)
(10, 10)


## Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=100, random_state=91)
clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

rf_mse = mean_squared_error(y_test, y_predict)
print(f'Random forest regressor error: {rf_mse}')

Random forest regressor error: 22.657005691286827


In [14]:
from joblib import dump
dump(clf, 'regressor.joblib')

['regressor.joblib']

In [23]:
outliers = sample[sample['sa_quantity'] > 60]
outliers

y_predict = clf.predict(outliers.drop(columns=['sa_quantity']))

mse = mean_squared_error(outliers['sa_quantity'], y_predict)
mse

8041.866525874267

## SGDRegressor

In [10]:
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor()
sgd.fit(X_train, y_train)
y_predict = sgd.predict(X_test)

sgd_mse = mean_squared_error(y_test, y_predict)
print(f'SGDRegressor error: {sgd_mse}')

SGDRegressor error: 75.60759167006607


## Gradient Boosting regressor

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(max_depth=11, learning_rate=0.18, min_samples_split=3, min_samples_leaf=3)
gbr.fit(X_train, y_train)
y_predict = gbr.predict(X_test)

gbr_mse = mean_squared_error(y_test, y_predict)
print(f'Gradient Boosting Regressor error: {gbr_mse}')

Gradient Boosting Regressor error: 73.53272893471


## KNeighbors Regressor

In [84]:
from sklearn.neighbors import KNeighborsRegressor

knr = KNeighborsRegressor(n_neighbors=40)
knr.fit(X_train, y_train)
y_predict = knr.predict(X_test)

knr_mse = mean_squared_error(y_test, y_predict)
print(f'KNeighbors Regressor error: {knr_mse}')

KNeighbors Regressor error: 70.9631651786165


In [71]:
print(y_predict[0:10])
print(y_test[0:10])

[ 2.8437298  15.52825683  2.68626225  9.26102274  4.61563325 33.96656564
  4.25642233  8.81544991 17.58307389  7.9229814 ]
1532048     3
4050172    25
3266929     1
836449      9
367902      4
664174     32
2913045     8
2975247     8
1768932    20
2950620     5
Name: sa_quantity, dtype: int64


## XGBoost

In [12]:
import xgboost as xgb

Xgb_train = xgb.DMatrix(X_train, y_train)
Xgb_test = xgb.DMatrix(X_test)

param = {
    'eta':0.1,
    'seed':0,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'objective':'reg:squarederror',
    'max_depth':6,
    'min_child_weight':1
}

xgb_reg = xgb.XGBRFRegressor().fit(X_train, y_train)
type(xgb_reg)

y_predict = xgb_reg.predict(X_test)
xgb_mse = mean_squared_error(y_test, y_predict)
print(f'XGBoost Regressor error: {xgb_mse}')

  if getattr(data, 'base', None) is not None and \


XGBoost Regressor error: 71.18173299962754


# Train outliers

In [49]:
outliers = sample
outliers['is_outlier'] = (outliers['sa_quantity'] > 30).astype(int)
outliers.describe()

Unnamed: 0,location,product,sa_quantity,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year,is_outlier
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,2272.482596,184.912128,8.496744,11.742038,5.59603,363.505322,2.989823,0.246172,3.007984,159.702884,0.044416
std,2070.387745,106.367515,12.952539,5.720407,4.883897,181.290533,2.052074,0.43078,2.000224,106.520245,0.206018
min,55.0,1.0,0.0,-2.93,-10.91,0.0,0.0,0.0,0.0,0.0,0.0
25%,369.0,93.0,2.0,7.81,1.97,225.0,1.733333,0.0,1.0,73.0,0.0
50%,2011.0,185.0,5.0,11.43,5.56,339.0,2.490625,0.0,3.0,144.0,0.0
75%,3175.0,277.0,10.0,15.98,9.24,489.0,3.490164,0.0,5.0,240.0,0.0
max,9137.0,369.0,457.0,30.28,20.98,1022.0,19.0,1.0,6.0,382.0,1.0


In [50]:
from sklearn.model_selection import train_test_split

X = outliers.drop(columns=['sa_quantity', 'is_outlier'])
y = outliers['is_outlier']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=91)

In [51]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

clf = RandomForestClassifier(n_estimators=100, random_state=91, class_weight={0:0.1, 1:100})
clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

rf_acc = accuracy_score(y_test, y_predict)
print(f'Random forest classifier accuracy: {rf_acc}')

Random forest classifier accuracy: 0.96357


In [54]:
from joblib import dump

dump(clf, 'outlier_classifier.joblib')

['outlier_classifier.joblib']

In [53]:
from sklearn.metrics import confusion_matrix

conf_m = confusion_matrix(y_test, y_predict)
print(conf_m)

[[95218   271]
 [ 3372  1139]]


# Train outlier

In [101]:
outliers = data[data['sa_quantity'] > 60]
outliers = outliers.drop(columns=['temp_mean'])
outliers.describe()

Unnamed: 0,location,product,sa_quantity,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year
count,96569.0,96569.0,96569.0,96569.0,96569.0,96569.0,96569.0,96569.0,96569.0,96569.0
mean,1841.813853,143.136296,92.894262,11.664922,5.649368,369.359329,2.401708,0.206101,3.532811,165.660926
std,2024.217934,98.410695,39.563077,5.464445,4.71305,180.6975,1.495309,0.404506,1.586188,108.405404
min,55.0,1.0,61.0,-2.93,-10.91,0.0,0.45,0.0,0.0,1.0
25%,246.0,64.0,68.0,7.78,2.23,234.0,1.25,0.0,3.0,77.0
50%,1258.0,118.0,80.0,11.23,5.54,343.0,2.289815,0.0,4.0,152.0
75%,2914.0,218.0,103.0,15.68,9.06,495.0,2.689655,0.0,5.0,247.0
max,9137.0,369.0,819.0,30.28,20.98,1022.0,15.019149,1.0,6.0,382.0


In [102]:
from sklearn.model_selection import train_test_split

X = outliers.drop(columns=['sa_quantity'])
y = outliers['sa_quantity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=91)

In [103]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [104]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=100, random_state=91)
clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

rf_mse = mean_squared_error(y_test, y_predict)
print(f'Random forest regressor error: {rf_mse}')

Random forest regressor error: 562.1442893134515


In [105]:
from joblib import dump
dump(clf, 'outlier_reg.joblib')

['outlier_reg.joblib']

# Train With no temp

In [71]:
data_no_temp = pd.read_csv('../data_no_temp_processed.csv')
data_no_temp = data_no_temp.drop(columns=['Unnamed: 0', 'temp_mean', 'temp_max', 'temp_min', 'sunshine_quant'])
data_no_temp.describe()

Unnamed: 0.1,Unnamed: 0,location,product,sa_quantity,temp_mean,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year
count,404424.0,404424.0,404424.0,404424.0,0.0,0.0,0.0,0.0,404424.0,404424.0,404424.0,404424.0
mean,7785162.0,5944.5,185.0,11.056448,,,,,2.990918,0.24635,3.009124,159.726277
std,408615.5,515.500637,106.520865,14.744311,,,,,2.058112,0.430886,2.002261,106.600201
min,7279632.0,5429.0,1.0,0.0,,,,,0.0,0.0,0.0,0.0
25%,7380738.0,5429.0,93.0,3.0,,,,,1.733333,0.0,1.0,72.75
50%,7785162.0,5944.5,185.0,7.0,,,,,2.490323,0.0,3.0,144.0
75%,8189585.0,6460.0,277.0,14.0,,,,,3.489286,0.0,5.0,240.25
max,8290691.0,6460.0,369.0,413.0,,,,,19.0,1.0,6.0,382.0


In [45]:
from sklearn.model_selection import train_test_split

X = data_no_temp.drop(columns=['sa_quantity'])
y = data_no_temp['sa_quantity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=91)

In [46]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [47]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=100, random_state=91)
clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

rf_mse = mean_squared_error(y_test, y_predict)
print(f'Random forest regressor error: {rf_mse}')

Random forest regressor error: 48.533221744452


In [48]:
from joblib import dump

dump(clf, 'no_temp_reg.joblib')

['no_temp_reg.joblib']

# Final Predictor

In [1]:
from joblib import load

clf = load('outlier_classifier.joblib')
scaler = load('scaler.joblib')
reg = load('regressor.joblib')
no_temp_reg = load('no_temp_reg.joblib')
outlier_reg = load('outlier_reg.joblib')

In [None]:
import numpy as np
import pandas as pd

data = pd.read_csv('../data_to_predict_processed.csv')
data = data.drop(columns=['Unnamed: 0'])

final_prediction = np.array([]);

for _, row in data.iterrows():
    tmp = pd.DataFrame(row).T
    has_no_temp = np.any(np.isnan(tmp['temp_mean']))
    tmp = tmp.drop(columns=['temp_mean'])
    if has_no_temp:
        tmp = tmp.drop(columns=['temp_max', 'temp_min', 'sunshine_quant'])
        tmp = scaler.transform(tmp)
        final_prediction = np.append(final_prediction, no_temp_reg.predict(row))
    else:
        tmp = scaler.transform(tmp)
        is_outlier = clf.predict(tmp)
        if is_outlier[0] == 0:
            prediction = reg.predict(tmp)
        else:
            prediction = outlier_reg.predict(tmp)
        final_prediction = np.append(final_prediction, prediction)

In [7]:
print(len(final_prediction))

[ 1.96  1.89  2.05 ... 11.62  2.77  9.27]
