# WORK IN PROGRESS

### Implementations

- kNN
- k-means
- Decision Trees
    - Gini 
    - Ent
- Naive Bayse
- Regression
    - Simple Linear
    - Multiple
    - Logigistic
- Perceptrons
- FeedForawd Neural Nets

### Imports we are going to use

In [1]:
# Numpy for Array operations
import numpy

#Pandas for nice data layout also will make it easer to implement the stats at the end
import pandas

from collections import Counter #for counting :D 
import math # maths \(' ')\
import random

#This is literally just to ensure our sublasses will be required to implement abstract methods
from abc import ABCMeta, abstractmethod

# Main Class

So I have gone with a fairly boring name for our base class. its just ML because why not. 
This base class will wrap up some functionality that each of our methods should have.
- Ensuring Similarity [Done]
- Train test splitting [Done]
- testing [Done]
- Evaluation []

In [2]:
class MachineLearning(metaclass=ABCMeta):
    
    def __init__(self, x, y, tts, thresh):
        x_ = x.copy()
        y_ = y
        if type(x_) == pandas.DataFrame:
            if 'labels' in x_.columns:
                y_ = x_.pop('labels')
            else:
                if y_ is not None:
                    raise RuntimeWarning("No 'labels' provided in Dataframe. Using the passed y")
            x_ = x.values
        if type(x_) != numpy.ndarray:
            x_ = numpy.array(x_)
        if y_ is not None:
            if type(y_) != numpy.ndarray:
                y_ = numpy.array(y_)
            if (len(x_) != len(y_)):
                raise ValueError("X and Y should have the same amount of samples")
            if len(y_.shape) > 1:
                raise ValueError("Y values should be singular")
        self.xtrain, self.ytrain, self.xtest, self.ytest = self.tts(x_, y_, tts, thresh)
        self.num_samples = len(self.xtrain) - 1
        self.num_features = len(self.xtrain[0]) - 1
            
        
    def get_train(self):
        #Get the Training Data in a nice way
        zipped_ = zip([i for i in range(len(self.xtrain))], self.xtrain.transpose())
        df_data = {}
        for n, sample in zipped_:
            df_data["train_feature"+str(n)] = sample
        if self.ytrain is not None:
            df_data["train_labels"] = self.ytrain
        return pandas.DataFrame(df_data)
            
    def get_test(self):
        testd = {}
        if self.xtest is None:
            raise RuntimeWarning("No Test Data")
        else:
            zipped_ = zip([i for i in range(len(self.xtest))], self.xtest.transpose())
            df_data = {}
            for n, sample in zipped_:
                testd["test_feature_"+str(n)] = sample
            if self.ytest is not None:
                testd["test_labels"] = self.ytest
        return pandas.DataFrame(testd)
    
    def tts(self, x, y, tts = 0.8, thresh = 10):
        '''
        This function just splits the data
        the threshold just checks whether or not to split the data
        we don't need this but sometimes if you're just experimenting
        you might have small datasets and not want to split it and you will want to Test
        on the Training (Even if it screws the evaluation)
        '''
        xtrain, ytrain, xtest, ytest = None, None, None, None
        xlen = len(x)
        train_len = round(xlen * tts)
        test_len = xlen - train_len
        if y is not None:
            if thresh > test_len:
                xtrain, xtest = x.copy(), x.copy()
                ytrain, ytest = y.copy(), y.copy()
            else:
                xtrain, xtest = x.copy()[0:train_len], x.copy()[train_len:]
                ytrain, ytest = y.copy()[0:train_len], y.copy()[train_len:]
        else:
            xtrain, xtest = x.copy(), x.copy()
        return xtrain, ytrain, xtest, ytest
        
    @abstractmethod
    def fit(self):
        pass
    
    @abstractmethod
    def predict(self, x):
        pass
        
    def test(self):
        predictions = self.predict(self.xtest)
        test_res = {"predictions": predictions}
        if self.ytest is not None:
            test_res["Actual"] = self.ytest
        return pandas.DataFrame(test_res)
    
    def results(self):
        res = self.test()
        tp = len(res[(res["Actual"] == 1) & (res["predictions"] == 1)])
        fn = len(res[(res["Actual"] == 1) & (res["predictions"] == 0)])
        fp = len(res[(res["Actual"] == 0) & (res["predictions"] == 1)])
        tn = len(res[(res["Actual"] == 0) & (res["predictions"] == 0)])
        return tp, fn, fp, tn
    
    def stats(self):
        tp, fn, fp, tn =  self.results()
        tp, fn, fp, tn = tp+1, fn+1, fp+1, tn+1 
        acc = (tp + tn) / (tp + fn + fp + tn)
        prec = tp / (tp + fp)
        recall = tp / (tp + fn)
        speci = tn / (tn + fp)
        # 𝑭𝟏 = 𝟐 ∗ (𝒑𝒓𝒆𝒄𝒊𝒔𝒊𝒐𝒏 ∗ 𝒓𝒆𝒄𝒂𝒍𝒍) / (𝒑𝒓𝒆𝒄𝒊𝒔𝒊𝒐𝒏 + 𝒓𝒆𝒄𝒂𝒍𝒍)
        f1 = 2 * (prec * recall) / (prec + recall)
        return {
            "Acuuracy": round(acc,3),
            "Precision": round(prec,3),
            "Recall": round(recall,3),
            "Specificity": round(speci,3),
            "F1": round(f1,3)
        }
    
    def confusion_matrix(self):
        tp, fn, fp, tn = self.results()
        d = {
            "pos": [tp, fp],
            "neg": [fn, tn]
        }
        ret = pandas.DataFrame(d)
        ret.rename(index={0: "pos", 1: "neg"})
        return ret

# K-Nearest Neighbours

In [3]:
class kNN(MachineLearning):
    
    def __init__(self, x, y=None, k=3, weighted=False, dist_type="manhattan", tts=0.8, thresh=10):
        super(kNN, self).__init__(x, y, tts=0.8, thresh=10)
        self.k = k
        self.weighted = weighted
        self.dist_type = dist_type
        self.distance_types = ["manhattan", "euclid"]
        if self.dist_type not in self.distance_types:
            raise RuntimeError("You need to use an actual distance type (see kNN.distance_types)")
        if self.k > self.num_samples:
            raise RuntimeError("k can't be greater than the number of samples")
        if self.ytrain is None:
            raise RuntimeError("K-Nearest Neighbours needs labels")
        
    def fit(self, x, y=None):
        #For K-neighbours we don't need to fit since we predict on the samples.
        pass
    
    def distance(self, a, b):
        tot = 0
        for i in range(self.num_features):
            if self.dist_type == "manhattan":
                tot += abs(a[i] - b[i])
            elif self.dist_type == "euclid":
                tot += ((a[i] - b[i]) ** 2)
        if self.dist_type == "euclid":
            tot = math.sqrt(tot)
        return tot
    
    def calc_weighted_label(self, labels, weights):
        tot_pos, tot_neg = 0, 0
        for i in range(self.k):
            if labels[i] == 0:
                tot_neg += weights[i]
            else:
                tot_pos += weights[i]
        ret = 0
        if tot_pos > tot_neg:
            ret = 1
        if tot_pos == tot_neg:
            ret = 1 if (random.random() > 0.5) else 0
        return ret
    
    def calc_knn(self, dists):
        ret = 0
        s = sorted(dists)
        labs, weights = [], []
        for i in range(self.k):
            l = self.ytrain[dists.index(s[i])]
            labs.append(l)
            weights.append(s[i])
        if self.weighted:
            ret = self.calc_weighted_label(labs, weights)
        else:
            c = Counter(labs)
            ret = c.most_common()[0][0]
        return ret
    
    def knn_sample(self, n):
        distances = []
        for i in range(self.num_samples):
            dist = self.distance(self.xtrain[i], n)
            distances.append(dist)
        return self.calc_knn(distances)
    
    def predict(self, x):
        ret = []
        if type(x) != numpy.ndarray:
            if type(x) != list:
                raise RuntimeError("You need to provide the values as a list")
        for samp in x:
            ret.append(self.knn_sample(samp))
        return numpy.array(ret)
        

## KNN EVAL

In [4]:
df = pandas.read_csv("./diabetes.csv")
df = df.rename(columns={"Outcome": "labels"})
labs = df.pop("labels") 
df = (df - df.mean())/df.std(ddof=0)
df["labels"] = labs

In [5]:
knn = kNN(df, k=8)
print(knn.stats())
knn = kNN(df, k=8, weighted=True)
print(knn.stats())
knn = kNN(df, k=8, dist_type="euclid")
print(knn.stats())
knn = kNN(df, k=8, weighted=True, dist_type="euclid")
print(knn.stats())

{'Acuuracy': 0.703, 'Precision': 0.609, 'Recall': 0.491, 'Specificity': 0.822, 'F1': 0.544}
{'Acuuracy': 0.734, 'Precision': 0.674, 'Recall': 0.509, 'Specificity': 0.861, 'F1': 0.58}
{'Acuuracy': 0.741, 'Precision': 0.682, 'Recall': 0.526, 'Specificity': 0.861, 'F1': 0.594}
{'Acuuracy': 0.715, 'Precision': 0.658, 'Recall': 0.439, 'Specificity': 0.871, 'F1': 0.526}
