Problem: Supervised learning (regression). The model attempts to find the relationship between the dependent and independent variables in order to predict a continuous value (sales).

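Solution: regression with a Keras model. Although the model is named `linear_model` in the code, it contains two 64-unit ReLU hidden layers, so it is a small neural network rather than a plain linear regression.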


In [None]:
#pip installs

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from google.colab import drive
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
import datetime
import seaborn as sns
# Mount Google Drive so the project folder under '/content/drive/' is reachable from this runtime.
drive.mount('/content/drive/')

Mounted at /content/drive/


## 1. Load Data

In [None]:
# Work from the project folder on the mounted Drive so the relative
# 'data/...' and 'submission/...' paths used in later cells resolve.
os.chdir('/content/drive/My Drive/4041MLProject')

In [None]:
# Load the training set and swap the parsed 'date' column for numeric
# year / month / day features in a single, re-runnable chain.
df = (
    pd.read_csv('data/train3.csv', parse_dates=['date'])
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        day=lambda frame: frame['date'].dt.day,
    )
    .drop(columns='date')
)
df.tail()

## 2. Preprocess data


In [None]:
#Split dataset into train and test
train_dataset = df.sample(frac=0.8, random_state=0)
test_dataset = df.drop(train_dataset.index)

#Split features from labels
train_features = train_dataset.copy()
test_features = test_dataset.copy()
train_labels = train_features.pop('sales')
test_labels = test_features.pop('sales')

train_dataset.describe().transpose()[['mean', 'std']]

## 3. Define Keras Model

In [None]:
#Using normalizing layer to build preprocessing into model
#Create the layer
# Keep the feature scaling inside the model by expressing it as a layer.
normalizer = preprocessing.Normalization(axis=-1)
# Fit the layer's statistics on the training inputs only.
normalizer.adapt(train_features.to_numpy())
print(normalizer.mean.numpy())

In [None]:
# Print one raw training row next to its normalized form as a quick check of the layer.
first = train_features.iloc[:1].to_numpy()
normalized_first = normalizer(first).numpy()
with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalized_first)

In [None]:
# Layer stack: the adapted normalizer, two 64-unit ReLU hidden layers, and a
# single-unit output that yields one prediction per input row.
# Note: despite its name, `linear_model` is therefore not a purely linear model.
model_layers = [
    normalizer,
    layers.Dense(units=64, activation='relu'),
    layers.Dense(units=64, activation='relu'),
    layers.Dense(units=1),
]
linear_model = tf.keras.Sequential(model_layers)
linear_model.summary()

In [None]:
# Run the still-untrained model on the first ten training rows to confirm it
# accepts the feature frame and returns one value per row.
linear_model.predict(train_features.head(10))

In [None]:
# layers[0] is the normalizer, so layers[1] is the first hidden Dense layer;
# display its weight matrix.
first_hidden_layer = linear_model.layers[1]
first_hidden_layer.kernel

## 4. Compile Keras Model



In [None]:
# Optimise mean absolute error with Adam at a learning rate of 0.001.
adam = tf.optimizers.Adam(learning_rate=0.001)
linear_model.compile(loss='mean_absolute_error', optimizer=adam)

## 5. Fit Keras Model

In [None]:
%%time
# Train for 200 epochs. verbose=1 prints progress for every epoch (set it to 0
# to silence the log). 20% of the training data is held out for validation.
history = linear_model.fit(
    x=train_features,
    y=train_labels,
    validation_split=0.2,
    epochs=200,
    verbose=1,
)

In [None]:
# Tabulate the per-epoch metrics recorded by fit() and show the last few epochs.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
def plot_loss(history, ylim=(10, 30)):
  """Plot training and validation loss against the epoch number.

  Args:
    history: object whose `.history` dict holds the per-epoch 'loss' and
      'val_loss' sequences (as returned by the model's `fit` call).
    ylim: (low, high) limits for the y-axis. The default keeps the original
      fixed window; pass None to let matplotlib autoscale instead.
  """
  plt.plot(history.history['loss'], label='loss')
  plt.plot(history.history['val_loss'], label='val_loss')
  if ylim is not None:
    plt.ylim(ylim)
  plt.xlabel('Epoch')
  # The target of this notebook is 'sales'; the earlier '[MPG]' unit did not
  # match the data being modelled.
  plt.ylabel('Error [sales]')
  plt.legend()
  plt.grid(True)
plot_loss(history)

## 6. Evaluate

In [None]:
# Score the trained model on the held-out rows; the compiled loss is mean
# absolute error, so that is the value stored here.
held_out_loss = linear_model.evaluate(test_features, test_labels, verbose=0)
test_results = {'linear_model': held_out_loss}

In [None]:
# One row per evaluated model, one column for the metric.
results_table = pd.DataFrame(test_results, index=['Mean absolute error [sales]'])
results_table.transpose()

In [None]:
# Predicted vs. true sales on the held-out rows. Points on the green diagonal
# are perfect predictions; both axes are clipped to the same [0, 50] window.
test_predictions = linear_model.predict(test_features).flatten()
lims = [0, 50]
ax = plt.axes(aspect='equal')
ax.scatter(test_labels, test_predictions)
ax.set_xlabel('True Values [sales]')
ax.set_ylabel('Predictions [sales]')
ax.set_xlim(lims)
ax.set_ylim(lims)
_ = ax.plot(lims, lims, color = 'green')

In [None]:
# Histogram of the signed error (prediction minus true value) on the held-out rows.
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [sales]')
_ = plt.ylabel('Count')

In [2]:
# Save the trained model under 'linear_model' in the current working directory.
# `linear_model` must already exist in this kernel session, so the cells above
# have to be run first.
linear_model.save('linear_model')

NameError: ignored

Predicting values from test.csv

In [3]:
# Load the model back from the 'linear_model' path written by the save cell above.
reloaded = tf.keras.models.load_model('linear_model')
# Optional check, left disabled: evaluate the reloaded model on the held-out
# split and record the result next to the original model's score.
#test_results['reloaded'] = reloaded.evaluate(
#    test_features, test_labels, verbose=0)

OSError: ignored

In [None]:
# Load the submission inputs and apply the same date expansion used for the
# training data; 'id' is dropped as well because it is not a model input.
# NOTE: this rebinds `df` and `test_features`, replacing the training frame
# and the held-out split created earlier in the notebook.
df = (
    pd.read_csv('data/test.csv', parse_dates=['date'])
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        day=lambda frame: frame['date'].dt.day,
    )
    .drop(columns=['date', 'id'])
)
# There are no labels in this file, so every remaining column is a feature.
test_features = df.copy()
test_features.head()

In [None]:
# NOTE: this creates a new normalization layer fitted on the submission
# features, used only for the inspection printout in the next cell.
# `reloaded.predict` is called on the raw features, so this layer is not the
# one the saved model applies.
normalizer = preprocessing.Normalization(axis=-1)
normalizer.adapt(test_features.to_numpy())
print(normalizer.mean.numpy())

In [None]:
# Print the first submission row next to its normalized form.
first = test_features.iloc[:1].to_numpy()
normalized_first = normalizer(first).numpy()
with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalized_first)

In [None]:
# Predict sales for every submission row with the reloaded model, passing the
# raw (un-normalized) feature frame.
predictions = reloaded.predict(test_features)

In [None]:
# Debug aid: show which container type predict() returned before reshaping it.
print(type(predictions))

In [None]:
# Flatten the model output to one dimension, wrap it in a Series, and preview
# the last few predicted values.
oneD_arr = predictions.flatten()
oneD_series = pd.Series(oneD_arr)
oneD_series.tail()

In [None]:
# Re-read the submission file, discard the feature columns, and attach the
# predictions as the 'sales' column (matched to rows by the default index).
submit = (
    pd.read_csv('data/test.csv', parse_dates=['date'])
    .drop(columns=['store', 'item', 'date'])
)
submit['sales'] = pd.Series(oneD_arr)
submit.tail()

In [None]:
# List the working directory (IPython automagic) before writing the submission file.
ls

In [None]:
# Write the submission without the index column.
# NOTE(review): presumably the 'submission/' folder must already exist — confirm.
submit.to_csv(r'submission/deeplearning.csv', index=False)