In [868]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from torch import nn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [966]:
df = pd.read_csv("data/covtype.csv")
random_seed = 1
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# split original dataset into train and test set, 80/20 ratio
train, test = train_test_split(df, test_size=0.2, train_size=0.8)

train.reset_index(drop=True, inplace=True); test.reset_index(drop=True, inplace=True);

# separate features from labels for train, valid, and test sets
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2)
X_train = train.iloc[:, :-1].values; y_train = train.iloc[:, -1].values;
X_test = test.iloc[:, :-1].values; y_test = test.iloc[:, -1].values;

X_train = torch.Tensor(X_train).type(torch.float); y_train = torch.Tensor(y_train).type(torch.LongTensor)
X_valid = torch.Tensor(X_valid).type(torch.float); y_valid = torch.Tensor(y_valid).type(torch.LongTensor)
X_test = torch.Tensor(X_test).type(torch.float); y_test = torch.Tensor(y_test).type(torch.LongTensor)

## LOGISTIC REGRESSION

In [939]:
lr = LogisticRegression(fit_intercept=True, max_iter=1000)
lr.fit(X_train, y_train);


In [940]:
log_reg_train_score = lr.score(X_train, y_train)
log_reg_valid_score = lr.score(X_valid, y_valid)
log_reg_test_score = lr.score(X_test, y_test)


In [941]:
print(f"Accuracy on train set: {log_reg_train_score}\nAccuracy on valid set: {log_reg_valid_score}\nAccuracy on test set: {log_reg_test_score}")

Accuracy on train set: 0.719
Accuracy on valid set: 0.715625
Accuracy on test set: 0.7195


## SUPPORT VECTOR MACHINE

In [942]:
svm_rbf = SVC(kernel='rbf')
svm_linear = SVC(kernel='linear')

svm_rbf.fit(X_train, y_train)
svm_linear.fit(X_train, y_train);

In [943]:
svm_rbf_train_score = svm_rbf.score(X_train, y_train)
svm_rbf_valid_score = svm_rbf.score(X_valid, y_valid)
svm_rbf_test_score = svm_rbf.score(X_test, y_test)

svm_linear_train_score = svm_linear.score(X_train, y_train)
svm_linear_valid_score = svm_linear.score(X_valid, y_valid)
svm_linear_test_score = svm_linear.score(X_test, y_test)

In [946]:
print(f"Accuracy on train set (rbf kernel): {svm_rbf_train_score}\nAccuracy on valid set (rbf kernel): {svm_rbf_valid_score}\nAccuracy on test set (rbf kernel): {svm_rbf_test_score}\n\nAccuracy on train set (linear kernel): {svm_linear_train_score}\nAccuracy on valid set (linear kernel): {svm_linear_valid_score}\nAccuracy on test set (linear kernel): {svm_linear_test_score}")




Accuracy on train set (rbf kernel): 0.77875
Accuracy on valid set (rbf kernel): 0.780625
Accuracy on test set (rbf kernel): 0.7525

Accuracy on train set (linear kernel): 0.723
Accuracy on valid set (linear kernel): 0.716875
Accuracy on test set (linear kernel): 0.7185


## DECISION TREE

In [948]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [949]:
dt_train_score = dt.score(X_train, y_train)
dt_valid_score = dt.score(X_valid, y_valid)
dt_test_score = dt.score(X_test, y_test)

In [951]:
print(f"Accuracy on train set: {dt_train_score}\nAccuracy on valid set: {dt_valid_score}\nAccuracy on test set: {dt_test_score}")

Accuracy on train set: 1.0
Accuracy on valid set: 1.0
Accuracy on test set: 0.711


## RANDOM FOREST

In [955]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [957]:
rf_train_score = rf.score(X_train, y_train)
rf_valid_score = rf.score(X_valid, y_valid)
rf_test_score = rf.score(X_test, y_test)

In [959]:
print(f"Accuracy on train set: {rf_train_score}\nAccurayc on valid set: {rf_valid_score}\nAccuracy on test set: {rf_test_score}")

Accuracy on train set: 1.0
Accurayc on valid set: 1.0
Accuracy on test set: 0.803


## NEURAL NETWORK

In [1051]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, output_size, lin_size):
        super(NeuralNet, self).__init__()
        self.lin1 = nn.Linear(input_size, lin_size)
        self.lin2 = nn.Linear(lin_size, lin_size//2)
        self.lin3 = nn.Linear(lin_size//2, lin_size//4)
        self.lin4 = nn.Linear(lin_size//4, output_size)
        
        self.activation = nn.ReLU()
        self.LogSoftmax = nn.LogSoftmax()
        
    def forward(self, x):
        l1 = self.lin1(x)
        l2 = self.lin2(self.activation(l1))
        l3 = self.lin3(self.activation(l2))
        l4 = self.lin4(self.activation(l3))
        return self.LogSoftmax(l4)

In [1075]:
def train_simple_model(X_train, y_train, X_test, y_test, X_valid, y_valid, lin_size, lr, random_seed):
    # Make device
    device = "cpu" 
    if torch.cuda.is_available(): 
        device = "cuda"
    else:
        if torch.backends.mps.is_available():
            device = "mps" 
    
    torch.manual_seed(random_seed) # do not change this

    # TODO: write codes to train your model here
    model = NeuralNet(input_size=54, output_size=7, lin_size=lin_size)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    n_epochs = 200

    train_loss = np.zeros(n_epochs)
    test_loss = np.zeros(n_epochs)
    valid_loss = np.zeros(n_epochs)
    valid_accuracy = np.zeros(n_epochs)
    train_accuracy = np.zeros(n_epochs)
    test_accuracy = np.zeros(n_epochs)

    for i in np.arange(n_epochs):
        model, train_loss[i], train_accuracy[i] = train_step(model, X_train, y_train, loss_fn, optimizer, device)
        valid_loss[i], valid_accuracy[i] = evaluation_step(model, X_valid, y_valid, loss_fn, device)
        test_loss[i], test_accuracy[i] = evaluation_step(model, X_test, y_test, loss_fn, device)
    return model, train_loss, train_accuracy, test_loss, test_accuracy, valid_loss, valid_accuracy

In [576]:
def train_step(model, X_train, y_train, loss_fn, optimizer, device):
    # TODO: implement one epoch of training
    model.train()
    X_train = X_train.to(device); y_train = y_train.to(device)
    scores = model(X_train)
    predictions = torch.argmax(scores, dim=1)

    loss = loss_fn(scores, y_train)
    accuracy = (torch.eq(predictions, y_train).sum().item() / len(y_train)) * 100

    model.zero_grad()
    loss.backward()
    optimizer.step()

    return (model, loss, accuracy)

In [577]:
def evaluation_step(model, X_test, y_test, loss_fn, device):
    # TODO: implement evaluation on test set
    model.eval()
    X_test = X_test.to(device); y_test = y_test.to(device)
    
    with torch.inference_mode():
        scores = model(X_test)
        predictions = torch.argmax(scores, dim=1)

        loss = loss_fn(scores, y_test)
        accuracy = (torch.eq(predictions, y_test).sum().item() / len(y_test)) * 100
    
    return (loss, accuracy)

In [1061]:
# TUNING HYPERPARAMETERS
# CHECK VALIDATION ACCURACY FOR EACH COMBO OF LINEAR SIZE AND LEARNING RATE

# models = np.empty(48, dtype=NeuralNet)
# valid_accuracies = np.zeros(48)
# valid_losses = np.zeros(40)
# i = 0
# for lin_size in np.array([64, 80, 96, 128, 160, 200]):
#     for lr in np.array([0.15, 0.2, 0.225, 0.25, 0.275, 0.3, 0.5, 0.75]):
#         models[i], train_loss, train_accuracy, test_loss, test_accuracy, valid_loss, valid_accuracy\
#         = train_simple_model(X_train, y_train, X_test, y_test, X_valid, y_valid, lin_size, lr, random_seed)
#         valid_accuracies[i] = valid_accuracy[-1]
#         valid_losses[i] = valid_loss[-1]
#         i += 1
#         print(f"Lin size: {lin_size}\tLearning Rate: {lr}\tValid Accuracy: {max(valid_accuracy)}")

lin_size = 128
lr = 0.3

In [1076]:
train_accuracy = np.zeros(10)
valid_accuracy = np.zeros(10)
test_accuracy = np.zeros(10)

i = 0
for random_seed in np.random.randint(100, size=10):
    model, train_losses, train_accuracies, test_losses, test_accuracies, valid_losses, valid_accuracies\
        = train_simple_model(X_train, y_train, X_test, y_test, X_valid, y_valid, lin_size, lr, random_seed)
    train_accuracy[i] = train_accuracies[-1]
    valid_accuracy[i] = valid_accuracies[-1]
    test_accuracy[i] = test_accuracies[-1]
    i += 1

In [1077]:
print(f"Average Train Accuracy: {np.mean(train_accuracy)}\nAverage Test Accuracy: {np.mean(test_accuracy)}\nAverage Valid Accuracy: {np.mean(valid_accuracy)}")

Average Train Accuracy: 71.13374999999999
Average Test Accuracy: 70.065
Average Valid Accuracy: 71.65
