In [1]:
# Data processing
# ==============================================================================

import numpy as np
import pandas as pd

# Plotting
# ==============================================================================
import matplotlib.pyplot as plt

# Modelling and Forecasting
# ==============================================================================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read data in proper format 
# ==============================================================================

df = pd.read_csv("spam.csv",encoding='ISO-8859-1')

In [3]:
# Sanity Check
# ==============================================================================

df.shape

(5572, 5)

In [4]:
# Sanity Check
# ==============================================================================

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Drop unncessary columns 
# ==============================================================================

df = df.drop(columns=['Unnamed: 2',	'Unnamed: 3', 'Unnamed: 4'])

In [6]:
# Sanity Check 
# ==============================================================================

df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Rename columns 
# ==============================================================================

df = df.rename(columns={'v1':'OUTPUT','v2':'TEXT'})

In [8]:
# Sanity Check 
# ==============================================================================

df.head()

Unnamed: 0,OUTPUT,TEXT
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Sanity Check 
# ==============================================================================

df.isnull().sum()

OUTPUT    0
TEXT      0
dtype: int64

In [10]:
# Encode labelts 
# ==============================================================================

encoder=LabelEncoder()

df['OUTPUT']=encoder.fit_transform(df['OUTPUT'])
df['OUTPUT']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: OUTPUT, Length: 5572, dtype: int64

In [11]:
# Assign data to X,y to feed into the ML Algorithm 
# ==============================================================================

X = df['TEXT']
y = df['OUTPUT']

In [12]:
# Sanity Check 
# ==============================================================================

print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [13]:
# Split data into train and test sets 
# ==============================================================================

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=3)

In [14]:
# Transform the data to feed into Logistic Regression Algorithm 
# ==============================================================================

cv=CountVectorizer()
X_train_cv=cv.fit_transform(X_train)
X_test_cv=cv.fit_transform(X_test)

In [15]:
lr=LogisticRegression()
lr.fit(X_train_cv,y_train)
prediction_train=lr.predict(X_train_cv)
print(accuracy_score(y_train,prediction_train)*100)

99.75319721785955


In [16]:
lr.fit(X_test_cv,y_test)
prediction_test=lr.predict(X_test_cv)
print(accuracy_score(y_test,prediction_test)*100)

99.82062780269058


In [17]:
## example on how CountVectorizer works 

"""
    converting a collection of text documents into a matrix of token counts. 
    Each document is represented by a row in the matrix, and each feature (word or n-gram) by a column, 
    with the value in each cell indicating the frequency of the feature in the corresponding document.
"""

# Example documents
documents = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer to the documents and transform the documents into a document-term matrix
X = vectorizer.fit_transform(documents)

# Convert the document-term matrix to a DataFrame for better visualization
import pandas as pd
columns = vectorizer.get_feature_names_out()
df2 = pd.DataFrame(X.toarray(), columns=columns)
print(df2)

   and  document  first  is  one  second  the  third  this
0    0         1      1   1    0       0    1      0     1
1    0         2      0   1    0       1    1      0     1
2    1         0      0   1    1       0    1      1     1
3    0         1      1   1    0       0    1      0     1
