# Machine Learning

In [98]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix

## Load data

In [99]:
import pandas as pd

data = pd.read_csv('../data_with_temp_processed.csv')

In [100]:
data.describe()

Unnamed: 0.1,Unnamed: 0,location,product,sa_quantity,temp_mean,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year
count,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0,9706176.0
mean,4941555.0,2272.938,185.0,8.502425,8.665875,11.73502,5.596728,362.9658,2.990399,0.2463504,3.009124,159.7263
std,2922881.0,2071.264,106.5207,12.94981,5.113955,5.717526,4.884786,181.2255,2.053613,0.430885,2.002259,106.6001
min,0.0,55.0,1.0,0.0,-6.545,-2.93,-10.91,0.0,0.0,0.0,0.0,0.0
25%,2426544.0,420.0,93.0,2.0,4.96,7.82,1.97,225.0,1.736562,0.0,1.0,72.75
50%,4853088.0,2058.0,185.0,5.0,8.51,11.42,5.56,338.0,2.490909,0.0,3.0,144.0
75%,7330184.0,3133.75,277.0,10.0,12.47,15.98,9.23,488.0,3.49,0.0,5.0,240.25
max,10110600.0,9137.0,369.0,819.0,25.63,30.28,20.98,1022.0,56.1,1.0,6.0,382.0


In [101]:
data = data.drop(columns=['Unnamed: 0'])

In [102]:
sample = data.sample(n=100000, random_state=91)

In [103]:
sample.describe()

Unnamed: 0,location,product,sa_quantity,temp_mean,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,2276.40389,184.70016,8.56144,8.666542,11.748732,5.584353,364.14294,2.984857,0.24738,3.0045,159.16103
std,2072.154291,106.230657,13.216803,5.108464,5.723401,4.86946,181.691462,2.047711,0.431492,1.99885,106.242247
min,55.0,1.0,0.0,-6.545,-2.93,-10.91,0.0,0.4,0.0,0.0,0.0
25%,437.0,93.0,2.0,4.96,7.82,1.99,226.0,1.721022,0.0,1.0,73.0
50%,2105.0,185.0,5.0,8.495,11.43,5.55,339.0,2.490104,0.0,3.0,143.0
75%,3120.0,277.0,10.0,12.455,15.95,9.21,490.0,3.49,0.0,5.0,239.0
max,9137.0,369.0,457.0,25.63,30.28,20.98,1022.0,19.0,1.0,6.0,382.0


In [104]:
sample = sample[sample['sa_quantity'] <= 60]
sample.describe()

Unnamed: 0,location,product,sa_quantity,temp_mean,temp_max,temp_min,sunshine_quant,price,is_special_event,weekday,day_of_year
count,98950.0,98950.0,98950.0,98950.0,98950.0,98950.0,98950.0,98950.0,98950.0,98950.0,98950.0
mean,2280.449823,185.190995,7.660627,8.668352,11.750787,5.585917,364.03809,2.991155,0.247954,2.998858,159.088752
std,2071.965936,106.228082,9.020229,5.112005,5.727409,4.872305,181.694086,2.051289,0.431827,2.002356,106.243756
min,55.0,1.0,0.0,-6.545,-2.93,-10.91,0.0,0.4,0.0,0.0,0.0
25%,437.0,93.0,2.0,4.96,7.83,1.99,225.0,1.745455,0.0,1.0,73.0
50%,2105.0,185.0,5.0,8.505,11.43,5.55,339.0,2.490909,0.0,3.0,143.0
75%,3120.0,277.0,10.0,12.455,15.96,9.21,490.0,3.490473,0.0,5.0,239.0
max,9137.0,369.0,60.0,25.63,30.28,20.98,1022.0,19.0,1.0,6.0,382.0



## Train/Test split

In [105]:
from sklearn.model_selection import train_test_split

X = sample.drop(columns=['sa_quantity'])
y = sample['sa_quantity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=91)

## Scale data

In [106]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Reduce dimensions

In [15]:
from sklearn.decomposition import FactorAnalysis

fa_transformer = FactorAnalysis(n_components=8, random_state=91)

fa_transformer.fit(X_train)
X_fa_train = fa_transformer.transform(X_train)
print(X_train.shape)
print(X_fa_train.shape)

KeyboardInterrupt: 

## Spectral Embedding

In [None]:
from sklearn.manifold import SpectralEmbedding

embedding = SpectralEmbedding(n_components=7)
embedding.fit(X_train)
X_e_train = embedding.transform(X_train)
print(X_train.shape)
print(X_e_train.shape)

## PCA

In [55]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9)
pca.fit(X_train)
print(X_train.shape)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
print(X_train.shape)

(80000, 10)
(80000, 7)


## PCA transpose

In [23]:
tpca = PCA(n_components=10)
X_t_train = X_train.T
tpca.fit(X_t_train)
X_tpca_train = tpca.transform(X_t_train)
print(X_t_train.shape)
print(X_tpca_train.shape)

(10, 4853088)
(10, 10)


## Random Forest

In [90]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=100, random_state=91)
clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

rf_mse = mean_squared_error(y_test, y_predict)
print(f'Random forest regressor error: {rf_mse}')

KeyboardInterrupt: 

## SGDRegressor

In [None]:
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor()
sgd.fit(X_train, y_train)
y_predict = sgd.predict(X_test)

sgd_mse = mean_squared_error(y_test, y_predict)
print(f'SGDRegressor error: {sgd_mse}')

## Gradient Boosting regressor

In [120]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(max_depth=11, learning_rate=0.18, min_samples_split=3, min_samples_leaf=3)
gbr.fit(X_train, y_train)
y_predict = gbr.predict(X_test)

gbr_mse = mean_squared_error(y_test, y_predict)
print(f'Gradient Boosting Regressor error: {gbr_mse}')

Gradient Boosting Regressor error: 28.5639461839219


## KNeighbors Regressor

In [84]:
from sklearn.neighbors import KNeighborsRegressor

knr = KNeighborsRegressor(n_neighbors=40)
knr.fit(X_train, y_train)
y_predict = knr.predict(X_test)

knr_mse = mean_squared_error(y_test, y_predict)
print(f'KNeighbors Regressor error: {knr_mse}')

KNeighbors Regressor error: 70.9631651786165


In [71]:
print(y_predict[0:10])
print(y_test[0:10])

[ 2.8437298  15.52825683  2.68626225  9.26102274  4.61563325 33.96656564
  4.25642233  8.81544991 17.58307389  7.9229814 ]
1532048     3
4050172    25
3266929     1
836449      9
367902      4
664174     32
2913045     8
2975247     8
1768932    20
2950620     5
Name: sa_quantity, dtype: int64


## XGBoost

In [121]:
import xgboost as xgb

Xgb_train = xgb.DMatrix(X_train, y_train)
Xgb_test = xgb.DMatrix(X_test)

param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 2

xgb_reg = xgb.train(param, Xgb_train, num_round)
type(xgb_reg)

y_predict xgb_reg.predict(Xgb_test)
xgb_mse = mean_squared_error(y_test, y_predict)
print(f'XGBoost Regressor error: {xgb_mse}')

SyntaxError: invalid syntax (<ipython-input-121-c35947375e50>, line 11)