In [868]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from torch import nn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [909]:
df = pd.read_csv("data/covtype.csv")
random_seed = 1
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# split original dataset into train and test set, 80/20 ratio
train, test = train_test_split(df, test_size=0.2, train_size=0.8)

train.reset_index(drop=True, inplace=True); test.reset_index(drop=True, inplace=True);

# separate features from labels for train and test sets
# not explicitly making a validation set, using k-fold validation via sklearn's GridSearchCV function
X_train = train.iloc[:, :-1].values; y_train = train.iloc[:, -1].values;
X_test = test.iloc[:, :-1].values; y_test = test.iloc[:, -1].values;

X_train = torch.Tensor(X_train).type(torch.float); y_train = torch.Tensor(y_train).type(torch.LongTensor)
X_test = torch.Tensor(X_test).type(torch.float); y_test = torch.Tensor(y_test).type(torch.LongTensor)

## LOGISTIC REGRESSION

In [912]:
lr = LogisticRegression(fit_intercept=True, max_iter=1000)
lr.fit(X_train, y_train);

In [911]:
log_reg_train_score = log_reg.score(X_train, y_train)
log_reg_valid_score = log_reg.score(X_valid, y_valid)
log_reg_test_score = log_reg.score(X_test, y_test)


ValueError: X has 54 features, but LogisticRegression is expecting 10 features as input.

In [906]:
print(f"Accuracy on train set: {log_reg_train_score}\nAccuracy on valid set: {log_reg_valid_score}\nAccuracy on test set: {log_reg_test_score}")

NameError: name 'log_reg_train_score' is not defined

## SUPPORT VECTOR MACHINE

In [903]:
svm_rbf = SVC(kernel='rbf')
svm_linear = SVC(kernel='linear')

svm_rbf.fit(X_train, y_train)
svm_linear.fit(X_train, y_train);

In [902]:
svm_rbf_train_score = svm_rbf.score(X_train, y_train)
svm_rbf_valid_score = svm_rbf.score(X_valid, y_valid)
svm_rbf_test_score = svm_rbf.score(X_test, y_test)

svm_linear_train_score = svm_linear.score(X_train, y_train)
svm_linear_valid_score = svm_linear.score(X_valid, y_valid)
svm_linear_test_score = svm_linear.score(X_test, y_test)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [758]:
print(f"Accuracy on train set: {svm_train_score}\nAccuracy on test set: {svm_test_score}")

Accuracy on train set: 0.916375
Accuracy on test set: 0.792


## DECISION TREE

In [759]:
dt_params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 7, 8, 9],
}

dt = DecisionTreeClassifier()
dt_grid_search = GridSearchCV(
    estimator=dt,
    param_grid=dt_params,
    cv=5,
    n_jobs=-1,
    verbose=4
)

dt_grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END criterion=gini, max_depth=None, splitter=random;, score=0.739 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=None, splitter=best;, score=0.771 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=None, splitter=best;, score=0.737 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=None, splitter=best;, score=0.761 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=None, splitter=random;, score=0.721 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=None, splitter=best;, score=0.738 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=None, splitter=random;, score=0.746 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=None, splitter=random;, score=0.738 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=None, splitter=random;, score=0.736 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=None, splitter=best;, score=0.752 total time=   0.1s
[CV 1/5]

In [760]:
dt = dt_grid_search.best_estimator_
dt_train_score = dt.score(X_train, y_train)
dt_test_score = dt.score(X_test, y_test)

dt_grid_search.best_params_

{'criterion': 'log_loss', 'max_depth': 9, 'splitter': 'best'}

In [761]:
print(f"Accuracy on train set: {dt_train_score}\nAccuracy on test set: {dt_test_score}")

Accuracy on train set: 0.839375
Accuracy on test set: 0.764


## RANDOM FOREST

In [762]:
rf = RandomForestClassifier()
rf_params = {
    'n_estimators': [50, 75, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 7, 9]
}

rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=5,
    n_jobs=-1,
    verbose=4
)

rf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 2/5] END criterion=gini, max_depth=None, n_estimators=50;, score=0.790 total time=   1.1s
[CV 3/5] END criterion=gini, max_depth=None, n_estimators=50;, score=0.804 total time=   1.1s
[CV 1/5] END criterion=gini, max_depth=None, n_estimators=50;, score=0.832 total time=   1.1s
[CV 4/5] END criterion=gini, max_depth=None, n_estimators=50;, score=0.811 total time=   1.1s
[CV 5/5] END criterion=gini, max_depth=None, n_estimators=50;, score=0.821 total time=   1.1s
[CV 2/5] END criterion=gini, max_depth=None, n_estimators=75;, score=0.794 total time=   1.7s
[CV 1/5] END criterion=gini, max_depth=None, n_estimators=75;, score=0.842 total time=   1.7s
[CV 3/5] END criterion=gini, max_depth=None, n_estimators=75;, score=0.798 total time=   1.7s
[CV 1/5] END criterion=gini, max_depth=7, n_estimators=50;, score=0.801 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=None, n_estimators=75;, score=0.809 total time=   1.6s
[C

In [763]:
rf = rf_grid_search.best_estimator_
rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)

rf_grid_search.best_params_

{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 100}

In [764]:
print(f"Accuracy on train set: {rf_train_score}\nAccuracy on test set: {rf_test_score}")

Accuracy on train set: 1.0
Accuracy on test set: 0.801


## NEURAL NETWORK

In [791]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, output_size, lin_size):
        super(NeuralNet, self).__init__()
        self.lin1 = nn.Linear(input_size, lin_size)
        self.lin2 = nn.Linear(lin_size, output_size)
        
        self.activation = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        l1 = self.lin1(x)
        l2 = self.lin2(self.activation(l1))
        return self.sigmoid(l2.squeeze())

    def score(self, predictions, y):
        accuracy = (torch.eq(predictions, y).sum().item() / len(y)) * 100
        return accuracy

In [829]:
# using manual hyper-parameter tuning for the neural network, so splitting train set into train and valid
X_train_nn, X_valid_nn, y_train_nn, y_valid_nn = train_test_split(X_train, y_train, test_size=0.2)

models = np.empty(11, dtype=NeuralNet)
valid_accuracies = np.zeros(11)
valid_losses = np.zeros(11)
j = 0
for i in np.array([64, 72, 84, 96, 100, 128, 180, 200, 240, 320, 500]):
    train_loss = np.empty(0)
    valid_loss = np.empty(0)
    valid_accuracy = np.empty(0)
    train_accuracy = np.empty(0)
    models[j], train_loss, train_accuracy, valid_loss, valid_accuracy\
    = train_simple_model(X_train_nn, y_train_nn, X_valid_nn, y_valid_nn, i, random_seed)
    valid_accuracies[j] = valid_accuracy[-1]
    valid_losses[j] = valid_loss[-1]
    j += 1


hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello


In [830]:
valid_accuracies

array([65.625 , 66.0625, 68.1875, 68.0625, 65.    , 74.    , 74.1875,
       74.6875, 75.125 , 75.75  , 76.1875])

In [787]:
model, train_losses, train_accuracies, test_losses, test_accuracies\
    = train_simple_model(X_train, y_train, X_test, y_test, random_seed)

hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello


In [576]:
def train_step(model, X_train, y_train, loss_fn, optimizer, device):
    # TODO: implement one epoch of training
    model.train()
    X_train = X_train.to(device); y_train = y_train.to(device)
    scores = model(X_train)
    predictions = torch.argmax(scores, dim=1)

    loss = loss_fn(scores, y_train)
    accuracy = (torch.eq(predictions, y_train).sum().item() / len(y_train)) * 100

    model.zero_grad()
    loss.backward()
    optimizer.step()

    return (model, loss, accuracy)

In [577]:
def evaluation_step(model, X_test, y_test, loss_fn, device):
    # TODO: implement evaluation on test set
    model.eval()
    X_test = X_test.to(device); y_test = y_test.to(device)
    
    with torch.inference_mode():
        scores = model(X_test)
        predictions = torch.argmax(scores, dim=1)

        loss = loss_fn(scores, y_test)
        accuracy = (torch.eq(predictions, y_test).sum().item() / len(y_test)) * 100
    
    return (loss, accuracy)

In [826]:
def train_simple_model(X_train, y_train, X_test, y_test, lin_size, random_seed):
    # Make device
    device = "cpu" 
    if torch.cuda.is_available(): 
        device = "cuda"
    else:
        if torch.backends.mps.is_available():
            device = "mps" 
    
    torch.manual_seed(random_seed) # do not change this

    # TODO: write codes to train your model here
    model = NeuralNet(input_size=10, output_size=2, lin_size=lin_size)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    n_epochs = 1000

    train_loss = np.zeros(n_epochs)
    test_loss = np.zeros(n_epochs)
    train_accuracy = np.zeros(n_epochs)
    test_accuracy = np.zeros(n_epochs)

    for i in np.arange(n_epochs):
        model, train_loss[i], train_accuracy[i] = train_step(model, X_train, y_train, loss_fn, optimizer, device)
        test_loss[i], test_accuracy[i] = evaluation_step(model, X_test, y_test, loss_fn, device)
    return model, train_loss, train_accuracy, test_loss, test_accuracy