In [4]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['Category'].value_counts() # this gives us the count of Category variables i.e. below

Category
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
# add new spam column by adding lamba func on category - if "spam" set column spam to 1, else 0
df['spam'] = df['Category'].apply(lambda x: 1 if x == "spam" else 0)

In [11]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [12]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [13]:
X_train.shape

(4457,)

In [14]:
X_test.shape

(1115,)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
# we'll use CountVectorizer to do our bag of words (word count)

v = CountVectorizer()
# convert to bag of words using CountVectorizer which counts how many times each words appears in each email and transform into sparse matrix
X_train_cv = v.fit_transform(X_train.values) 
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59249 stored elements and shape (4457, 7688)>

In [30]:
# convert to numpy array to see it - use dir(v) to see all methods you can call on X_train_cv
print(X_train_cv.toarray())
print(X_train_cv.shape)
print(v.get_feature_names_out()) # gets all the words our document has
print(v.vocabulary_) # this shows you the position of where each word is in sparse matrix

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(4457, 7688)
['00' '000' '000pes' ... 'èn' 'ú1' '〨ud']


In [39]:
# lets now build ML model
# A Multinomial Naive Bayes classifier is a variant of the Naive Bayes classifier which is used for discrete data. 
# The term "multinomial" refers to the distribution that this classifier assumes for the features, which is the Multinomial distribution. 
# This distribution is particularly suitable for features that represent counts or frequency counts of events, making the Multinomial Naive Bayes classifier widely used in text classification (e.g., counting the frequency of words in a document).
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [42]:
X_test_cv = v.transform(X_test)

from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       961
           1       0.97      0.94      0.95       154

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [44]:
# there is actually much easier way to do all of the above using sklearn pipeliens! :) 
from sklearn.pipeline import Pipeline

# this pipeline will allow us to go straight to training the model
# clf typically stands for "classifier". 
# it's a common abbreviation used in machine learning code to refer to a variable that is an instance of a classification model. 
# In this case, clf is a pipeline that combines a CountVectorizer and a MultinomialNB (Multinomial Naive Bayes) classifier for text classification tasks.
clf = Pipeline([
                ('vectorizer',CountVectorizer()), # vectorisatin is first step in the pipeline
                ('nb', MultinomialNB()) # applying multinomial naive bayes is second step
                ])

clf.fit(X_train, y_train)


In [47]:
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       961
           1       0.97      0.94      0.95       154

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [6]:
# our model can be very sparse with words like 'or' 'the' and so on. we don't really care about these words, so we can remove with STOP words, which reduces the sparseness of our matrix 
# be carefull for instance like below
# 1. this is a good movie 
# 2. thi is not a good movie 
# if you removed stop words, you could end up with 'GOOD MOVIE' twice.. but actually the sentences are very different.
import spacy
from spacy.lang.en.stop_words import STOP_WORDS # out of box stop words in english

print(len(STOP_WORDS))

326


In [8]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("We just opened our wings, the flying part is coming soon")

for token in doc:
    if token.is_stop:
        print(token)

We
just
our
the
part
is


In [11]:
def preprocess(text):
    doc = nlp(text)
    no_stop_words = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return no_stop_words

preprocess("Musk wants time to prepare for a trial that is special!")

['Musk', 'wants', 'time', 'prepare', 'trial', 'special']

In [13]:
import pandas as pd
df = pd.read_json("doj_press.json", lines=True) # this are case files from trials
df.shape


(13087, 6)

In [19]:
# lets filter the documents  
df = df[df["topics"].str.len()!=0] # filter only rows which do NOT have empty topic
print(df.shape)
print(df.head())

(4688, 6)
         id                                              title  \
4    18-898  $100 Million Settlement Will Speed Cleanup Wor...   
7   14-1412  14 Indicted in Connection with New England Com...   
19  17-1419  2017 Southeast Regional Animal Cruelty Prosecu...   
22  15-1562  21st Century Oncology to Pay $19.75 Million to...   
23  17-1404  21st Century Oncology to Pay $26 Million to Se...   

                                             contents  \
4   The U.S. Department of Justice, the U.S. Envir...   
7   A 131-count criminal indictment was unsealed t...   
19  The United States Attorney’s Office for the Mi...   
22  21st Century Oncology LLC, has agreed to pay $...   
23  21st Century Oncology Inc. and certain of its ...   

                         date                                 topics  \
4   2018-07-09T00:00:00-04:00                          [Environment]   
7   2014-12-17T00:00:00-05:00                  [Consumer Protection]   
19  2017-12-14T00:00:00-05:00     

In [None]:
# to keep things simple, lets only focus on pre-processing first 100 rows
len(df["contents"].iloc[4]) # ioc[4] just prints 4th 'row' of contetn column as an example to see the length
# this column is LONG (5k lengh above).. lets remove stop words from content so we can build an NLP model
df["contents_new"] = df["contents"].apply(preprocess)
len(df["contents"].iloc[4]) # now check that the 4th row is smaller (we remove stop words from ALL rows, but using iloc[4] as example