In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
def accuracy_score(y_pred,y_true):
    score = np.sum(np.asarray(y_true) == np.asarray(y_pred))
    return score/len(y_true)

def precision_score(y_pred, y_true):
    tpfp = y_pred>0
    tp = y_true[tpfp]>0
    return np.sum(tp)/np.sum(tpfp)

def recall_score(y_pred, y_true):
    tpfn = y_true>0
    tp = y_pred[tpfn]>0
    return np.sum(tp)/np.sum(tpfn)

In [30]:
class LogisticRegression():
    def __init__(self, lr=0.001, epochs=100):
        self.lr = lr
        self.epochs = epochs

    def fit(self, X, y):
        self.m, self.n = X.shape
        self.W = np.random.randn(self.n)
        self.b = 0
        self.X = X
        self.y = y
        for i in range(self.epochs):
            self.weights_update()
        return self

    def weights_update(self):
        A = 1 / (1 + np.exp(- (self.X.dot(self.W) + self.b)))

        temp = (A - self.y.T).reshape(self.m)
        dW = self.X.T.dot(temp) / self.m
        db = np.sum(temp) / self.m

        self.W = self.W - self.lr * dW
        self.b = self.b - self.lr * db

        return self

    def predict(self, X):
        prob = 1 / (1 + np.exp(- (X.dot(self.W) + self.b)))
        y_pred = np.where(prob > 0.5, 1, 0)
        return y_pred

In [31]:
df = pd.read_csv("data/emails.csv")
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [32]:
X, y = df.iloc[:,1:-1], df.iloc[:,-1]
print(X.shape, y.shape)

(5000, 3000) (5000,)


In [35]:
for i in range(0,5000,1000):
    X_test = X.iloc[i:i+1000].to_numpy()
    y_test = y.iloc[i:i+1000].to_numpy()
    X_train = X.drop(np.arange(i,i+1000)).to_numpy()
    y_train = y.drop(np.arange(i,i+1000)).to_numpy()
    
    model = LogisticRegression(lr=0.1, epochs=500)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    acc = np.round(accuracy_score(y_pred, y_test), decimals=3)
    pre = np.round(precision_score(y_pred, y_test), decimals=3)
    rec = np.round(recall_score(y_pred, y_test), decimals=3)

    print(f"For Fold {(i//1000)+1}\tAccuracy = {acc},\tPrecision = {pre},\tRecall = {rec}")
    

  app.launch_new_instance()


For Fold 1	Accuracy = 0.891,	Precision = 0.819,	Recall = 0.793
For Fold 2	Accuracy = 0.676,	Precision = 0.46,	Recall = 0.968
For Fold 3	Accuracy = 0.865,	Precision = 0.89,	Recall = 0.599
For Fold 4	Accuracy = 0.832,	Precision = 0.652,	Recall = 0.918
For Fold 5	Accuracy = 0.725,	Precision = 0.529,	Recall = 0.912
