# Data Preprocessing

In [1]:
# Read the data: load the mushroom table from a CSV file (relative path, so it
# is resolved against the kernel's working directory) and display the
# per-column count / unique / top / freq summary.
from pandas import read_csv

csv_path = "mushrooms.csv"
df = read_csv(csv_path)
df.describe()



Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [2]:
# Data preprocessing: turn every categorical column of `df` (the class label
# as well as the features) into numeric codes, one code per category.
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder().fit(df)   # learn the categories of each column
data = oe.transform(df)         # float array, same row/column layout as df
data

array([[1., 5., 2., ..., 2., 3., 5.],
       [0., 5., 2., ..., 3., 2., 1.],
       [0., 0., 2., ..., 3., 2., 3.],
       ...,
       [0., 2., 2., ..., 0., 1., 2.],
       [1., 3., 3., ..., 7., 4., 2.],
       [0., 5., 2., ..., 4., 1., 2.]])

In [4]:
# Column 0 of the encoded array is the class label; every other column is a feature.
y, x = data[:, 0], data[:, 1:]

# Normalization: standardise each feature column.
# NOTE(review): the scaler is fitted on ALL rows, i.e. before the
# train/validation split performed in a later cell, so statistics of the
# validation rows influence the scaling. Fitting on the training rows only
# would avoid this leakage -- confirm whether that matters for this exercise.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x)
x = scaler.transform(x)
x

array([[ 1.02971224,  0.14012794, -0.19824983, ..., -0.67019486,
        -0.5143892 ,  2.03002809],
       [ 1.02971224,  0.14012794,  1.76587407, ..., -0.2504706 ,
        -1.31310821, -0.29572966],
       [-2.08704716,  0.14012794,  1.37304929, ..., -0.2504706 ,
        -1.31310821,  0.86714922],
       ...,
       [-0.8403434 ,  0.14012794, -0.19824983, ..., -1.50964337,
        -2.11182722,  0.28570978],
       [-0.21699152,  0.95327039, -0.19824983, ...,  1.42842641,
         0.28432981,  0.28570978],
       [ 1.02971224,  0.14012794, -0.19824983, ...,  0.16925365,
        -2.11182722,  0.28570978]])

In [None]:
# We use 70% as training data and 30% as validation.
# The rows are shuffled with a fixed seed before cutting. A plain head/tail
# slice makes the split depend on the row order of the CSV file, so the
# validation set would be "the last 30% of the file" rather than a random
# sample of the data.
import numpy as np

SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split
cut = int(len(x) * 0.7)

# One shared permutation keeps every feature row aligned with its label.
shuffled = np.random.default_rng(SPLIT_SEED).permutation(len(x))
train_idx, val_idx = shuffled[:cut], shuffled[cut:]

tx, vx = x[train_idx], x[val_idx]
ty, vy = y[train_idx], y[val_idx]

tx.shape, vx.shape, ty.shape, vy.shape

((5686, 22), (2438, 22), (5686,), (2438,))

# Exercise 1: TensorFlow neural network: fully connected layers with ReLU activation

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Build the model according to the diagram: 22 inputs -> 20 -> 15 -> 20 -> 1.
# The input shape is declared with an explicit Input object instead of the
# `input_shape=` keyword on the first Dense layer; newer Keras releases warn
# about that keyword in Sequential models. The layers and their parameter
# counts are unchanged.
model = Sequential([
    tf.keras.Input(shape=(22,)),     # One value per encoded feature column
    Dense(20, activation='relu'),    # Hidden layer 1
    Dense(15, activation='relu'),    # Hidden layer 2
    Dense(20, activation='relu'),    # Hidden layer 3
    Dense(1, activation='sigmoid'),  # Output layer (binary classification)
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',   # use 'mse' if regression
              metrics=['accuracy'])

# Print summary to check parameters
model.summary()


**Question 1: How many parameters are there in the network?**
For the modified architecture with 22 input features, three hidden layers, and one output node, the parameter counts are as follows: 
the first hidden layer has (22 × 20) + 20 = 460 parameters, 
the second hidden layer has (20 × 15) + 15 = 315 parameters,
the third hidden layer has (15 × 20) + 20 = 320 parameters, 
and the output layer has (20 × 1) + 1 = 21 parameters.
Adding these together gives a total of 1,116 parameters in the network.

**Question 2: Is this larger network better than smaller networks?**  
Although this network is larger than a very simple one, having more parameters does not automatically make it better. Larger networks can capture more complex relationships, but they also risk overfitting if the dataset is small or noisy. Smaller networks may generalize better in such cases. The choice of network size should balance the complexity of the task with the amount of available data.

**Question 3: Any ideas on how to improve the accuracy of the network?**  
Accuracy can be improved through several strategies. Regularization methods such as dropout or L2 weight decay help reduce overfitting. Batch normalization can stabilize training and improve convergence. Adjusting the learning rate or using learning rate schedules can optimize training efficiency. Expanding the dataset through augmentation, if possible, can improve generalization. Finally, early stopping can prevent the model from training too long and overfitting. These techniques help the network achieve better accuracy while maintaining generalization.

# Exercise 2: PyTorch

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the network
class Net(nn.Module):
    """Fully connected binary classifier with layer widths 22 -> 20 -> 15 -> 20 -> 1.

    The three hidden layers are followed by ReLU; the single output unit is
    passed through a sigmoid, so ``forward`` yields one value between 0 and 1
    per input row.
    """

    def __init__(self):
        super().__init__()
        # Linear layers, registered in the order they are applied.
        self.fc1 = nn.Linear(22, 20)   # input features -> hidden layer 1
        self.fc2 = nn.Linear(20, 15)   # hidden layer 1 -> hidden layer 2
        self.fc3 = nn.Linear(15, 20)   # hidden layer 2 -> hidden layer 3
        self.fc4 = nn.Linear(20, 1)    # hidden layer 3 -> output unit
        # Parameter-free activation modules.
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()    # squashes the output for binary classification

    def forward(self, x):
        """Run ``x`` (last dimension 22) through the network and return the sigmoid output."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        # Swap the sigmoid for an identity if the network is used for regression.
        return self.sigmoid(self.fc4(hidden))

# Instantiate the model.
# NOTE: this rebinds `model`, which until now referred to the Keras network
# built in Exercise 1.
model = Net()

# Define loss and optimizer
criterion = nn.BCELoss()              # binary cross-entropy on the sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Print model summary-like info: the module tree, then the number of scalar
# parameters summed over all weight and bias tensors.
total_params = sum(param.numel() for param in model.parameters())
print(model)
print("Total parameters:", total_params)


Net(
  (fc1): Linear(in_features=22, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=15, bias=True)
  (fc3): Linear(in_features=15, out_features=20, bias=True)
  (fc4): Linear(in_features=20, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)
Total parameters: 1116


# Exercise 3: Better accuracy

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

class ImprovedNet(nn.Module):
    """Regularised variant of the 22 -> 20 -> 15 -> 20 -> 1 classifier.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.3); the
    output unit is passed through a sigmoid, so ``forward`` yields one value
    between 0 and 1 per input row.
    """

    def __init__(self):
        super().__init__()
        # Linear / batch-norm pairs, registered in the order they are applied.
        self.fc1 = nn.Linear(22, 20)
        self.bn1 = nn.BatchNorm1d(20)
        self.fc2 = nn.Linear(20, 15)
        self.bn2 = nn.BatchNorm1d(15)
        self.fc3 = nn.Linear(15, 20)
        self.bn3 = nn.BatchNorm1d(20)
        self.fc4 = nn.Linear(20, 1)

        # Parameter-free modules; the single dropout module is reused after
        # every hidden stage.
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.3)  # drops 30% of activations while training

    def forward(self, x):
        """Run ``x`` (last dimension 22) through the network and return the sigmoid output."""
        stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        out = x
        for linear, norm in stages:
            out = self.dropout(self.relu(norm(linear(out))))
        return self.sigmoid(self.fc4(out))  # binary classification

# Instantiate model.
# NOTE: `model`, `criterion` and `optimizer` are rebound here, replacing the
# objects created for the plain `Net` in Exercise 2.
model = ImprovedNet()

# Loss and optimizer
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Show the module tree, then the number of scalar parameters summed over all
# parameter tensors (linear weights/biases plus batch-norm scale/shift).
total_params = sum(param.numel() for param in model.parameters())
print(model)
print("Total parameters:", total_params)


ImprovedNet(
  (fc1): Linear(in_features=22, out_features=20, bias=True)
  (bn1): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=20, out_features=15, bias=True)
  (bn2): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=15, out_features=20, bias=True)
  (bn3): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=20, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
  (dropout): Dropout(p=0.3, inplace=False)
)
Total parameters: 1226
