In [1]:
import pandas as pd
import numpy as np
from pandas import errors
import describe
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import math

In [54]:
def sigmoid(x):
    ret = 1 / (1 + np.exp(-x))
    return(ret)

In [67]:
class LogisticRegression:
    
    def __init__(self, datafile = None):
        if datafile:
            self.read_csv(datafile)
        self.activation = sigmoid

    def read_csv(self, datafile):
        try:
            df = pd.read_csv(datafile)
            df.fillna(df.median(), inplace = True)
            features = list(df.columns[6:8])
            self.X = df[features].to_numpy()
            one_hot_encoding = pd.get_dummies(df["Hogwarts House"], drop_first = False)
            self.houses = list(one_hot_encoding.columns)
            self.y = one_hot_encoding.to_numpy()
        except (FileNotFoundError, errors.EmptyDataError) as e:
            print(e)
        except KeyError as e:
            print(f"The csv file does not contain the expected column '{e}'")
    
    def feature_scale_normalise(self):
        self.scaler = StandardScaler()
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)
    
    def split_data(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=42)
    
    def init_weights(self):
        self.thetas = np.zeros((self.X_train.shape[1], self.y_train.shape[1]))

    def add_bias_units(self):
        bias_units = np.ones((model.X.shape[0], 1))
        self.X = np.concatenate((bias_units, self.X), axis = 1)
    
    def hypothesis(self, X):
        z = np.matmul(X, self.thetas)
        self.H = self.activation(z)
    
    def compute_loss_gradient(self, X, y):
        error = self.H - y
        self.loss_gradient = np.matmul(X.T, error) / len(X)

    def gradient_descent(self):
        self.thetas = self.thetas - (0.1 * self.loss_gradient)
    
    def fit(self):
        for i in range(20):
            self.hypothesis(self.X_train)
            self.compute_loss_gradient(self.X_train, self.y_train)
            self.gradient_descent()
    
    

    
    def __str__(self):
        return(f"{self.X.shape = }\t\t{self.y.shape = }\n{self.X_train.shape = }\t\t{self.y_train.shape = }\n{self.thetas.shape = }")

In [68]:
datafile = "datasets/dataset_train.csv"
model = LogisticRegression(datafile)
model.feature_scale_normalise()
model.add_bias_units()
model.split_data()
model.init_weights()
####
model.fit()

In [81]:
a = 0
for i in range(len(model.H)):
    x = model.H[i,:]
    y = model.y_train[i, :]
    if (np.argmax(x) != np.argmax(y)):
        a += 1
print(a / len(model.H))

0.4160714285714286


In [77]:
model.thetas

array([[-1.89078142, -1.36447216, -1.68910461, -2.11913588],
       [ 0.14158761,  0.23159357,  0.06794665,  0.01938257],
       [ 1.54198864,  1.86476071, -1.83321016, -1.62623449]])

In [78]:
X, y, features = model.read_csv("datasets/dataset_train.csv")
X_clean, y_clean = model.clean_data(X, y)
X_t = X_clean.T
X_norm = []
for i in range(X_t.shape[0]):
    X_norm.append(model.feature_scale_normalise(X_t[i]))
X_norm = np.array(X_norm).T
X_train, X_test, y_train, y_test = model.split_data(X_norm, y_clean)
model.thetas = np.zeros((X_train.shape[1] + 1, model.houses.size))

TypeError: cannot unpack non-iterable NoneType object

In [89]:
y_train[2]

array(['Gryffindor'], dtype=object)

In [78]:
model.houses.size

4