In [299]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from torch import nn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [540]:
df = pd.read_csv("data/covtype.csv")

random_seed = 1
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# split original dataset into train and test set, 80/20 ratio
train, test = train_test_split(df, test_size=0.2, train_size=0.8)

# split train set into a final train set and a validation set, 80/20 ratio
train, valid = train_test_split(train, test_size=0.2, train_size=0.8)
train.reset_index(drop=True, inplace=True); valid.reset_index(drop=True, inplace=True); test.reset_index(drop=True, inplace=True);

# separate features from labels for train, valid, and test sets
X_train = train.iloc[:, :-1].values; y_train = train.iloc[:, -1].values;
X_valid = valid.iloc[:, :-1].values; y_valid = valid.iloc[:, -1].values;
X_test = test.iloc[:, :-1].values; y_test = test.iloc[:, -1].values;

X_train = torch.from_numpy(X_train).type(torch.float); y_train = torch.from_numpy(y_train).type(torch.LongTensor)
X_valid = torch.from_numpy(X_valid).type(torch.float); y_valid = torch.from_numpy(y_valid).type(torch.LongTensor)
X_test = torch.from_numpy(X_test).type(torch.float); y_test = torch.from_numpy(y_test).type(torch.LongTensor)

## LOGISTIC REGRESSION

In [528]:
log_reg = LogisticRegression(fit_intercept=True, max_iter=500).fit(X_train, y_train)

In [529]:
log_reg_train_predict = log_reg.predict(X_train)
log_reg_test_predict = log_reg.predict(X_test)

In [530]:
print(f"Accuracy on train set: {accuracy_score(log_reg_train_predict, y_train)}\nAccuracy on test set: {accuracy_score(log_reg_test_predict, y_test)}")

Accuracy on train set: 0.7222916666666667
Accuracy on test set: 0.721


## SUPPORT VECTOR MACHINE

In [273]:
svm = SVC().fit(X_train, y_train)

In [274]:
svm_train_predict = svm.predict(X_train)
svm_test_predict = svm.predict(X_test)

In [275]:
print(f"Accuracy on train set: {accuracy_score(svm_train_predict, y_train)}\nAccuracy on test set: {accuracy_score(svm_test_predict, y_test)}")

Accuracy on train set: 0.7794791666666666
Accuracy on test set: 0.7566666666666667


## DECISION TREE

In [276]:
dt = DecisionTreeClassifier().fit(X_train, y_train)

In [277]:
dt_train_predict = svm.predict(X_train)
dt_test_predict = svm.predict(X_test)

In [279]:
print(f"Accuracy on train set: {accuracy_score(dt_train_predict, y_train)}\nAccuracy on test set: {accuracy_score(dt_test_predict, y_test)}")

Accuracy on train set: 0.7794791666666666
Accuracy on test set: 0.7566666666666667


## RANDOM FOREST

In [280]:
rf = RandomForestClassifier().fit(X_train, y_train)

In [281]:
rf_train_predict = svm.predict(X_train)
rf_test_predict = svm.predict(X_test)

In [298]:
print(f"Accuracy on train set: {accuracy_score(rf_train_predict, y_train)}\nAccuracy on test set: {accuracy_score(rf_test_predict, y_test)}")

Accuracy on train set: 0.7794791666666666
Accuracy on test set: 0.7566666666666667


## NEURAL NETWORK

In [614]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, output_size):
        super(NeuralNet, self).__init__()
        self.lin1 = nn.Linear(input_size, 24)
        self.lin2 = nn.Linear(24, output_size)
        
        self.activation = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        l1 = self.lin1(x)
        l2 = self.lin2(self.activation(l1))
        return self.sigmoid(l2.squeeze())

In [615]:
model, train_losses, train_accuracies, test_losses, test_accuracies\
    = train_simple_model(X_train, y_train, X_test, y_test, random_seed)

hello
hello
hello
hello
hello


In [616]:
train_accuracies

array([21.90625   , 22.26041667, 22.63541667, 22.9375    , 23.25      ,
       23.64583333, 23.94791667, 24.27083333, 24.58333333, 24.84375   ,
       25.21875   , 25.44791667, 25.73958333, 26.0625    , 26.33333333,
       26.64583333, 27.01041667, 27.30208333, 27.65625   , 27.9375    ,
       28.27083333, 28.48958333, 28.85416667, 29.11458333, 29.48958333,
       29.75      , 30.03125   , 30.27083333, 30.65625   , 30.89583333,
       31.15625   , 31.375     , 31.65625   , 31.85416667, 32.13541667,
       32.375     , 32.66666667, 33.02083333, 33.14583333, 33.42708333,
       33.71875   , 33.92708333, 34.20833333, 34.36458333, 34.63541667,
       34.96875   , 35.26041667, 35.53125   , 35.83333333, 36.11458333,
       36.32291667, 36.65625   , 36.92708333, 37.20833333, 37.4375    ,
       37.67708333, 37.96875   , 38.22916667, 38.42708333, 38.59375   ,
       38.73958333, 38.92708333, 39.16666667, 39.38541667, 39.63541667,
       39.78125   , 39.98958333, 40.13541667, 40.35416667, 40.48

In [576]:
def train_step(model, X_train, y_train, loss_fn, optimizer, device):
    # TODO: implement one epoch of training
    model.train()
    X_train = X_train.to(device); y_train = y_train.to(device)
    scores = model(X_train)
    predictions = torch.argmax(scores, dim=1)

    loss = loss_fn(scores, y_train)
    accuracy = (torch.eq(predictions, y_train).sum().item() / len(y_train)) * 100

    model.zero_grad()
    loss.backward()
    optimizer.step()

    return (model, loss, accuracy)

In [577]:
def evaluation_step(model, X_test, y_test, loss_fn, device):
    # TODO: implement evaluation on test set
    model.eval()
    X_test = X_test.to(device); y_test = y_test.to(device)
    
    with torch.inference_mode():
        scores = model(X_test)
        predictions = torch.argmax(scores, dim=1)

        loss = loss_fn(scores, y_test)
        accuracy = (torch.eq(predictions, y_test).sum().item() / len(y_test)) * 100
    
    return (loss, accuracy)

In [610]:
def train_simple_model(X_train, y_train, X_test, y_test, random_seed):
    # Make device
    device = "cpu" 
    if torch.cuda.is_available(): 
        device = "cuda"
    else:
        if torch.backends.mps.is_available():
            device = "mps" 
    
    torch.manual_seed(random_seed) # do not change this

    # TODO: write codes to train your model here
    model = NeuralNet(input_size=54, output_size=7)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    n_epochs = 250

    train_loss = np.zeros(n_epochs)
    test_loss = np.zeros(n_epochs)
    train_accuracy = np.zeros(n_epochs)
    test_accuracy = np.zeros(n_epochs)

    for i in np.arange(n_epochs):
        if i % 50 == 0:
            print("hello")
        model, train_loss[i], train_accuracy[i] = train_step(model, X_train, y_train, loss_fn, optimizer, device)
        test_loss[i], test_accuracy[i] = evaluation_step(model, X_test, y_test, loss_fn, device)
    return model, train_loss, train_accuracy, test_loss, test_accuracy