Import PyTorch libraries

In [99]:
import os
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score,roc_curve
torch.__version__
import matplotlib.pyplot as plt


### Check Our Processing Capability (CPU vs. GPU)<br>
When developing A.I. projects, it will help to have a powerful GPU.  While this project does not require one, the code below will detect if one is present in your environment and use it during the training process.<br>


Check to see if we have a GPU to use for training

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('A {} device was detected.'.format(device))
#Print the name of the cuda device, if detected
if device=='cuda':
    print (torch.torch.cuda.get_device_name(device=device))

A cpu device was detected.



## Step 2 - Download and Prepare our Dataset<br>
When training a neural network from scratch, you will usually need a lot of data.  We will start by loading all the lemonade stand data for one year (365 items) which is a rather small, simply synthetic dataset.  It includes information about the day the lemonade was sold including whether or not it was a weekend, sunny, warm, a big sign was present to advertise and the price.  Finally, there is the number of lemonade's sold.  Our neural network will be trained to predict the number of lemonade's sold (output) based on the other attributes (inputs).<br>


Use Pandas to do our dataprocessing on the dataset<br>
Download the dataset

In [73]:
import pandas as pd
url = 'https://raw.githubusercontent.com/federicoq1997/NeuralNetwork/main/dataset.csv'
df = pd.read_csv(url)
df.head(10)

Unnamed: 0,item,qty,price,date
0,1106,1,1.0,2021-05-31
1,1106,3,1.0,2021-06-02
2,1106,1,1.0,2021-06-03
3,1106,3,1.0,2021-06-18
4,1106,1,1.0,2021-06-25
5,1106,2,1.0,2021-07-12
6,1106,27,0.954546,2021-07-13
7,1106,106,1.0,2021-07-14
8,1106,142,1.0,2021-07-15
9,1106,133,1.0,2021-07-16


Check the size/shape of our dataset

In [56]:
df.shape

(29641, 8)


### Create our Inputs and Outputs for Training our Neural Network<br>
The data has been collected in a table with the following columns:  <br>
<pre> Weekend Sunny Warm BigSign price qty</pre><br>
While the dataset is more or less ready to be used, we have two fields (price and qty) that contain real values.  Usually, it's easier to train neural networks if the values used are in the range rough range of -1..1.  We will first reduce the range of price and qty down using standardization. <br>


Calculate the mean and standard deviation of price<br>
Standardize numSold

In [57]:
def normalizeC(str):
	priceMean = df[str].mean()
	priceStd = df[str].std()
	df[str] = (df[str]-priceMean)/priceStd

In [58]:
# normalizeC('price')
# normalizeC('qty')

Calculate the mean and standard deviation of numSold<br>
Standardize numSold

In [74]:
df.head()

Unnamed: 0,item,qty,price,date
0,1106,1,1.0,2021-05-31
1,1106,3,1.0,2021-06-02
2,1106,1,1.0,2021-06-03
3,1106,3,1.0,2021-06-18
4,1106,1,1.0,2021-06-25


In [75]:
df['item'].value_counts()
#df['date'].head()#dtype:is object

1106    252
1248    248
1309    242
1266    241
1286    240
       ... 
1260      1
1499      1
1498      1
1496      1
1293      1
Name: item, Length: 376, dtype: int64

In [76]:
df['date'] = pd.to_datetime(df['date'], 
 format = '%Y-%m-%d', 
 errors = 'coerce')
df['date'].head()#dtype:is object

0   2021-05-31
1   2021-06-02
2   2021-06-03
3   2021-06-18
4   2021-06-25
Name: date, dtype: datetime64[ns]

In [77]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['week'] = df['date'].dt.isocalendar().week
df['day'] = df['date'].dt.day

In [78]:
# normalizeC('year')
# normalizeC('month')
# normalizeC('week')
# normalizeC('day')
df.head()

Unnamed: 0,item,qty,price,date,year,month,week,day
0,1106,1,1.0,2021-05-31,2021,5,22,31
1,1106,3,1.0,2021-06-02,2021,6,22,2
2,1106,1,1.0,2021-06-03,2021,6,22,3
3,1106,3,1.0,2021-06-18,2021,6,24,18
4,1106,1,1.0,2021-06-25,2021,6,25,25


In [79]:
# shuffle the samples
df = df.sample(n = len(df), random_state = 42)
df = df.reset_index(drop = True)
df_valid = df.sample(frac = 0.3, random_state = 42)
df_train = df.drop(df_valid.index)

In [80]:
df_train.head()

Unnamed: 0,item,qty,price,date,year,month,week,day
0,1671,4,6.0,2022-08-22,2022,8,34,22
1,1535,2,4.5,2021-10-02,2021,10,39,2
2,1261,2,1.5,2022-05-19,2022,5,20,19
4,1671,1,6.0,2022-09-02,2022,9,35,2
5,1264,1,3.0,2021-08-16,2021,8,33,16


In [81]:
df_valid.head()

Unnamed: 0,item,qty,price,date,year,month,week,day
24076,1391,3,3.0,2022-09-28,2022,9,39,28
19083,1693,1,6.0,2022-07-26,2022,7,30,26
1981,1696,6,5.0,2021-08-16,2021,8,33,16
5545,1280,1,3.0,2022-08-19,2022,8,33,19
24084,1743,11,5.0,2022-08-20,2022,8,33,20



### Create our Input (x) and Ouput (y) to Train our Neural Network<br>
 <br>
Here you will create the input (x) and output (y) variables needed to train our network.  The number we want our neural network to predict is the field called 'qty'.  This will be the output (y).  We will need to seperate out our input (Weekend, Sunny, Warm, BigSign, price) from the ouput (qty).<br>


Create our PyTorch tensors and move to CPU or GPU if available<br>
Extract the inputs and create a PyTorch tensor x (inputs)

In [92]:
inputs = ['year','month','week','day','price']
outputs = ['qty']

df_train.dropna(inplace=True)
df_valid.dropna(inplace=True)

X_train = df_train[inputs].values
X_valid = df_valid[inputs].values
y_train = df_train[outputs].values
y_valid = df_valid[outputs].values
print('Training shapes:',X_train.shape, y_train.shape)
print('Validation shapes:',X_valid.shape, y_valid.shape)

Training shapes: (20749, 5) (20749, 1)
Validation shapes: (8892, 5) (8892, 1)


Explore the first 5 inputs

In [93]:
rf=RandomForestClassifier(max_depth = 5, n_estimators=100, random_state = 42)
rf.fit(X_train, y_train.ravel())

We can then get our predictions with

In [113]:
y_train_preds = rf.predict_proba(X_train)[:,1]
y_valid_preds = rf.predict_proba(X_valid)[:,1]
print(y_train_preds)
print(y_valid_preds)

[0.14304822 0.24172516 0.16670602 ... 0.15612946 0.15815024 0.16016034]
[0.20205936 0.1549951  0.13892682 ... 0.16877111 0.1493587  0.14228815]


Evaluate performance

In [112]:
def calc_specificity(y_actual, y_pred, thresh):
 # calculates specificity
 print(y_pred)
 print(y_actual)
 print(sum((y_pred < thresh) & (y_actual == 0)))
 print(sum(y_actual ==0))
 return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)
def print_report(y_actual, y_pred, thresh):
	auc = roc_auc_score(y_actual, y_pred,multi_class='ovo')
	accuracy = accuracy_score(y_actual, (y_pred > thresh))
	recall = recall_score(y_actual, (y_pred > thresh))
	precision = precision_score(y_actual, (y_pred > thresh))
	specificity = calc_specificity(y_actual, y_pred, thresh)
	print('AUC:%.3f'%auc)
	print('accuracy:%.3f'%accuracy)
	print('recall:%.3f'%recall)
	print('precision:%.3f'%precision)
	print('specificity:%.3f'%specificity)
	print(' ')
	return auc, accuracy, recall, precision, specificity
# Using this print_report function we can evaluate the performance for training and validation. Here I set the threshold at the prevalence of 0.201

thresh = 0.201
print('Random Forecast')
print('Training:')
print_report(y_train.ravel(), y_train_preds.ravel(), thresh)
print('Validation:')
print_report(y_valid.ravel(), y_valid_preds.ravel(), thresh)

Random Forecast
Training:


AxisError: axis 1 is out of bounds for array of dimension 1

In [106]:
fpr_train, tpr_train, thresholds_train = roc_curve(y_train.ravel(), y_train_preds.ravel(),pos_label=1)
auc_train = roc_auc_score(y_train.ravel(), y_train_preds.ravel(),multi_class='ovo')
fpr_valid, tpr_valid, thresholds_valid = roc_curve(y_valid, y_valid_preds,pos_label=1)
auc_valid = roc_auc_score(y_valid.ravel(), y_valid_preds.ravel(),multi_class='ovo')
plt.plot(fpr_train, tpr_train, 'r-',label ='Train AUC:%.3f'%auc_train)
plt.plot(fpr_valid, tpr_valid, 'b-',label ='Valid AUC:%.3f'%auc_valid)
plt.plot([0,1],[0,1],'k — ')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

AxisError: axis 1 is out of bounds for array of dimension 1