In [0]:
import torch
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
class MushroomDataset:
  """First-pass wrapper around the mushrooms CSV.

  Indexing yields only the first column of the requested row, as a
  one-element Series.
  """

  def __init__(self):
    # The whole table is read into memory once, at construction.
    self.data = pd.read_csv('./mushrooms.csv')

  def __len__(self):
    """Number of rows in the loaded table."""
    return self.data.shape[0]

  def __getitem__(self, idx):
    """Return the first column of row ``idx`` as a length-1 Series."""
    row = self.data.iloc[idx]
    return row.iloc[:1]
  

In [9]:
# Load the CSV and show the row count alongside the first item.
shrooms = MushroomDataset()
len(shrooms), shrooms[0]

(8124, class    p
 Name: 0, dtype: object)

In [0]:
class MushroomDataset():
  """Row-wise access to the mushrooms CSV as ``(features, label)`` pairs.

  The first column of the CSV is treated as the label and every remaining
  column as a feature. Items are returned as pandas Series, not tensors.
  """

  def __init__(self):
    # The whole table is read into memory once, at construction.
    self.data = pd.read_csv('./mushrooms.csv')

  def __len__(self):
    """Number of rows in the loaded table."""
    return len(self.data)

  def __getitem__(self,idx):
    """Return ``(features, label)`` for row ``idx``.

    Args:
      idx: integer position of the row, or a single-element tensor holding it.

    Returns:
      A tuple of two Series: all columns after the first, and the first
      column on its own (as a length-1 Series).
    """
    # isinstance (rather than an exact type comparison) also unwraps
    # Tensor subclasses into a plain Python number.
    if isinstance(idx, torch.Tensor):
      idx = idx.item()
    # Fetch the row once and slice it by position explicitly.
    row = self.data.iloc[idx]
    return row.iloc[1:], row.iloc[0:1]

  
  

In [12]:
# Re-instantiate with the redefined class; an item is now a (features, label) pair.
shrooms = MushroomDataset()
shrooms[0]

(cap-shape                   x
 cap-surface                 s
 cap-color                   n
 bruises                     t
 odor                        p
 gill-attachment             f
 gill-spacing                c
 gill-size                   n
 gill-color                  k
 stalk-shape                 e
 stalk-root                  e
 stalk-surface-above-ring    s
 stalk-surface-below-ring    s
 stalk-color-above-ring      w
 stalk-color-below-ring      w
 veil-type                   p
 veil-color                  w
 ring-number                 o
 ring-type                   p
 spore-print-color           k
 population                  s
 habitat                     u
 Name: 0, dtype: object, class    p
 Name: 0, dtype: object)

In [46]:
from torch.utils import data
# Hold out 5% of the rows (rounded down) for testing; the remainder is for training.
number_for_testing  = int(len(shrooms)*0.05)
number_for_training = len(shrooms) - number_for_testing
# NOTE(review): no random seed is set in the visible cells, so the split
# membership presumably differs from run to run.
train, test = data.random_split(shrooms,[number_for_training,number_for_testing])
len(train), len(test)

(7718, 406)

In [16]:
# Peek at one held-out sample; the Series name in the output is the row's index in the CSV.
test[0]

(cap-shape                   x
 cap-surface                 y
 cap-color                   g
 bruises                     t
 odor                        n
 gill-attachment             f
 gill-spacing                c
 gill-size                   b
 gill-color                  p
 stalk-shape                 t
 stalk-root                  b
 stalk-surface-above-ring    s
 stalk-surface-below-ring    s
 stalk-color-above-ring      w
 stalk-color-below-ring      w
 veil-type                   p
 veil-color                  w
 ring-number                 o
 ring-type                   p
 spore-print-color           n
 population                  v
 habitat                     d
 Name: 2812, dtype: object, class    e
 Name: 2812, dtype: object)

In [18]:
from torch.utils.data import Dataset
# 3x3 identity matrix: each row can serve as the one-hot vector of one of three categories.
one_hots = torch.eye(3,3)
one_hots

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [19]:
# Map each category label to its position in the list (used below as a row index into one_hots).
ordinals = {c:i for i,c in enumerate(['A','B','C'])}
ordinals

{'A': 0, 'B': 1, 'C': 2}

In [20]:
# Label -> ordinal -> identity-matrix row: the one-hot encoding of 'A'.
one_hots[ordinals['A']]

tensor([1., 0., 0.])

In [0]:
class OneHotEncoder:
  """Maps each distinct value of a pandas Series to a one-hot row vector."""

  def __init__(self, series):
    categories = series.unique()
    width = len(categories)
    # Each category is numbered by its position in ``series.unique()``.
    self.ordinals = dict(zip(categories, range(width)))
    # Row i of the identity matrix is the one-hot code for ordinal i.
    self.encoder = torch.eye(width, width)

  def __getitem__(self, value):
    """Return the one-hot tensor for ``value``; unseen values raise KeyError."""
    position = self.ordinals[value]
    return self.encoder[position]
    

In [0]:
class CategoricalCSV():
  """Dataset that one-hot encodes every column of a CSV.

  Each item is an ``(input, output)`` pair of tensors: ``output`` is the
  one-hot code of the ``output_series_name`` column, and ``input`` is the
  concatenation, in column order, of the one-hot codes of all other columns.

  Args:
    datafile: CSV location, passed straight to ``pd.read_csv``.
    output_series_name: name of the column to use as the target.

  Raises:
    KeyError: if ``output_series_name`` is not a column of the CSV.
  """

  def __init__(self,datafile,output_series_name):
    self.dataset = pd.read_csv(datafile)
    # Fail at construction instead of on the first item lookup.
    if output_series_name not in self.dataset.columns:
      raise KeyError(output_series_name)
    self.output_series_name = output_series_name
    # One encoder per column, the target column included.
    self.encoders = {
      series_name: OneHotEncoder(series)
      for series_name, series in self.dataset.items()
    }

  def __len__(self):
    """Number of rows in the CSV."""
    return len(self.dataset)

  def __getitem__(self,idx):
    """Return the encoded ``(input, output)`` tensors for row ``idx``.

    Args:
      idx: integer position of the row, or a single-element tensor holding it.
    """
    # isinstance (rather than an exact type comparison) also unwraps
    # Tensor subclasses into a plain Python number.
    if isinstance(idx, torch.Tensor):
      idx = idx.item()
    sample = self.dataset.iloc[idx]
    target_name = self.output_series_name
    output = self.encoders[target_name][sample[target_name]]
    # Encode every non-target column; concatenation follows column order.
    input_components = [
      self.encoders[name][value]
      for name, value in sample.items()
      if name != target_name
    ]
    return torch.cat(input_components), output
    

In [39]:
# Rebind shrooms to the one-hot encoded dataset, with the 'class' column as the target.
shrooms = CategoricalCSV('./mushrooms.csv','class')
shrooms[0]

(tensor([1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
         0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
         1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 0.]), tensor([1., 0.]))

In [0]:
class Model(torch.nn.Module):
  """Fully connected classifier: two ReLU hidden layers and a softmax output.

  Args:
    input_dimensions: length of each input vector.
    output_dimensions: number of output classes.
    size: width of both hidden layers.
  """

  def __init__(self,input_dimensions,output_dimensions,size = 128):
    super().__init__()
    # The first layer must emit ``size`` features because layer_two consumes
    # ``size`` features; a literal 128 here breaks every non-default ``size``.
    self.layer_one = torch.nn.Linear(input_dimensions, size)
    self.activation_one = torch.nn.ReLU()
    self.layer_two = torch.nn.Linear(size,size)
    self.activation_two = torch.nn.ReLU()
    self.shape_outputs = torch.nn.Linear(size,output_dimensions)

  def forward(self,inputs):
    """Return class probabilities; softmax is applied over the last dimension."""
    x = self.layer_one(inputs)
    x = self.activation_one(x)
    x = self.layer_two(x)
    x = self.activation_two(x)
    x = self.shape_outputs(x)
    return torch.nn.functional.softmax(x,dim = -1)

In [0]:
# Size the network from one sample: length of the input vector and of the one-hot target.
model = Model(shrooms[0][0].shape[0],shrooms[0][1].shape[0])
optimizer = torch.optim.Adam(model.parameters())
# Model.forward ends in softmax, so this loss receives probabilities and compares
# them element-wise with the one-hot targets.
loss_function = torch.nn.BCELoss()

In [74]:
from torch.utils import data
# Same 95/5 split as the earlier split cell; in notebook order `shrooms` is now
# the one-hot encoded CategoricalCSV dataset.
number_for_testing  = int(len(shrooms)*0.05)
number_for_training = len(shrooms) - number_for_testing
# NOTE(review): no random seed is set in the visible cells, so the split
# membership presumably differs from run to run.
train, test = data.random_split(shrooms,[number_for_training,number_for_testing])
len(train), len(test)

(7718, 406)

In [77]:
# Mini-batches of 16 training samples, reshuffled on every pass over the loader.
training = torch.utils.data.DataLoader(train,batch_size = 16,shuffle = True)
for epoch in range(4):
  for inputs,outputs in training:
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    results = model(inputs)
    loss = loss_function(results,outputs)
    loss.backward()
    optimizer.step()
  # `loss` here is the last mini-batch of the epoch only, not an epoch average.
  print('Loss: {}'.format(loss))

Loss: 1.4404461978756444e-07
Loss: 2.4835271617007493e-08
Loss: 3.476938204016733e-08
Loss: 1.2914377975903335e-06


In [0]:
import sklearn.metrics

In [82]:
# A single batch holding the entire test split, in fixed order.
testing = torch.utils.data.DataLoader(test,batch_size = len(test), shuffle = False)
# Inference only: disable autograd so the forward pass builds no gradient graph.
with torch.no_grad():
  for inputs,outputs in testing:
    # argmax turns the probability rows and the one-hot targets into class indices.
    results = model(inputs).argmax(dim = 1).numpy()
    actual = outputs.argmax(dim = 1).numpy()
    accuracy = sklearn.metrics.accuracy_score(actual,results)
    print(accuracy)

1.0


In [83]:
# `actual` and `results` are the arrays left behind by the evaluation loop.
# sklearn's convention: rows are true classes, columns are predicted classes.
sklearn.metrics.confusion_matrix(actual,results)

array([[186,   0],
       [  0, 220]])

In [84]:
# The report is a preformatted multi-line string, so print() is used to render its line breaks.
print(sklearn.metrics.classification_report(actual,results))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       186
           1       1.00      1.00      1.00       220

    accuracy                           1.00       406
   macro avg       1.00      1.00      1.00       406
weighted avg       1.00      1.00      1.00       406

