<a href="https://colab.research.google.com/github/dtran421/machine-learning-engineering-demo/blob/main/Machine_Learning_Engineering_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#@title Import statements for ML-related modules {display-mode: "form"}
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
# Raw CSV of the demo dataset, hosted in the project's GitHub repository
DATA_URL = 'https://raw.githubusercontent.com/dtran421/machine-learning-engineering-demo/main/enron_data.csv'

# Download and parse it, using the first CSV column as the row index
df = pd.read_csv(filepath_or_buffer=DATA_URL, index_col=0)

In [4]:
# Number of examples in the training and test splits
numTrain, numTest = 8000, 2000
numTotal = numTest + numTrain

# Upper bound on the vocabulary size handed to the vectorizer
numFeatures = 3000

In [5]:
# Show summary statistics (count, mean, std, quartiles) for the loaded data
df.describe()

Unnamed: 0,Unnamed: 0.1,Label
count,10000.0,10000.0
mean,17041.4046,0.5
std,9735.989695,0.500025
min,8.0,0.0
25%,8562.0,0.0
50%,17159.0,0.5
75%,25509.25,1.0
max,33698.0,1.0


In [6]:
# Pull out the two columns used below (each is a pandas Series):
#   labels -- the label of each message
#   docs   -- the text body of each message
labels, docs = df['Label'], df['Body']

In [7]:
# This function will be called on each message to preprocess it
def preprocess(doc):
    """Normalize one raw message before it is tokenized.

    Currency signs and the ``://`` separator found in URLs are replaced by
    the placeholder tokens ``__currency__`` and ``__url__`` (each padded with
    a space on both sides), then the whole text is lower-cased.

    Args:
        doc: The message text as a ``str``.

    Returns:
        The transformed, lower-cased message text.
    """
    # Replace all currency signs and some url patterns by special
    # tokens. These are useful features.
    doc = re.sub(r'[£$]', ' __currency__ ', doc)
    # ':' needs no escaping in a regex. The previous non-raw '\://' literal
    # relied on an invalid string escape, which newer Python versions warn about.
    doc = re.sub(r'://', ' __url__ ', doc)
    return doc.lower()  # convert to lower


# Text-to-feature-vector converter: each message goes through preprocess()
# first, and the vocabulary is capped at numFeatures entries
vectorizer = CountVectorizer(
    preprocessor=preprocess,
    max_features=numFeatures,
)

In [8]:
# Learn the vocabulary from the messages and convert each message into its
#   feature vector in one step ("fit" the transform, then "apply" it).
# NOTE(review): `docs` holds every message, so the vocabulary is also learned
#   from messages that later end up in the test split -- confirm this is intended.
X = vectorizer.fit_transform(docs)

In [9]:
# fit_transform produced a sparse scipy matrix. Materialize it as a dense
#   numpy array: less memory-efficient, but simpler to index and stack
X = X.toarray()
m, n = X.shape
y = labels.array

# Prepend a constant feature (a column of ones) to every example
X = np.hstack([np.ones((m, 1)), X])

In [10]:
pos = np.nonzero(y == 1)[0]   # indices of positive examples
neg = np.nonzero(y == 0)[0]   # indices of negative examples

# Create a subset that has the same number of positive and negative examples
half = numTotal // 2
subset = np.concatenate([pos[:half], neg[:half]])

# Slicing silently returns fewer indices when a class has fewer than `half`
# examples; fail here instead of with an out-of-range index during the split.
assert len(subset) == numTotal, (
    f'need {half} examples per class, found {len(pos)} positive / {len(neg)} negative'
)

# Randomly shuffle order of examples. A fixed seed keeps the train/test split
# (and the accuracy computed from it) reproducible across kernel restarts.
SHUFFLE_SEED = 0
np.random.default_rng(SHUFFLE_SEED).shuffle(subset)

In [11]:
# Keep only the balanced subset, in its shuffled order
X = X[subset]
y = y[subset]

# The first numTrain rows form the training split, the next numTest rows the test split
train = np.arange(numTrain)
test  = np.arange(numTrain, numTrain + numTest)

X_train, y_train = X[train], y[train]
X_test,  y_test  = X[test],  y[test]

In [12]:
# Extract the list of tokens (words) in the dictionary.
# Note: a column of ones was prepended to X after vectorizing, so tokens[i]
#   corresponds to column i + 1 of X, not column i.
tokens = vectorizer.get_feature_names_out()

In [41]:
# Train a logistic-regression classifier on the training split
model = LogisticRegression(solver='liblinear', max_iter=500)
model.fit(X=X_train, y=y_train)

In [44]:
# Predict labels for the held-out test messages
pred_test = model.predict(X_test)
pred_train = pred_test   # old (misleading) name, kept so existing references keep working

# Count the misclassified test messages explicitly instead of relying on
# label arithmetic, and divide by the actual number of test labels
res = np.sum(pred_test != np.asarray(y_test))
acc = 100 - (res / len(y_test) * 100)
print(f'{acc}%')

98.35%
