In [1]:
# Import libraries and tools for the machine learning model
import pandas as pd # This will be used for data management
import numpy as np
from sklearn.model_selection import train_test_split # This will be used for splitting data
from sklearn.feature_extraction.text import TfidfVectorizer # This is used to extract features from text
from sklearn.linear_model import LogisticRegression # We will use logistic regression to classify something as "spam" or not
from sklearn.metrics import accuracy_score 

In [2]:
# Load the labelled SMS messages from the CSV that sits next to this notebook.
# latin1 decoding is requested explicitly — presumably because the file is not valid UTF-8 (TODO confirm).
spamData = pd.read_csv("spam.csv", encoding = 'latin1')

In [3]:
spamData.shape # (rows, columns) of the loaded frame; the first entry is the number of messages

(5572, 2)

In [4]:
# Preview the last few rows to eyeball the label column (v1) and the message text column (v2)
spamData.tail()

Unnamed: 0,v1,v2
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [5]:
# Remove any row that contains a missing value.
# NOTE(review): this rebinds spamData, so the unfiltered frame is no longer available after this cell.
spamData = spamData.dropna()

In [6]:
spamData.info() # Every column's non-null count should equal the number of rows if no nulls remain

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Separate the explanatory variable from the target variable:
#   X = the message text (column v2), the input the classifier sees
#   Y = the "spam"/"ham" label (column v1), the value the classifier must predict
X = spamData["v2"]
Y = spamData["v1"]

In [8]:
# Hold out 25% of the messages for testing. Stratifying on Y keeps the label
# proportions the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    stratify=Y,
    random_state=55,
)

# Sanity check: features and labels must have matching lengths within each partition
for partition in (X_train, Y_train, X_test, Y_test):
    print(partition.shape)

(4179,)
(4179,)
(1393,)
(1393,)


In [9]:
# Turn the raw message text into TF-IDF feature vectors (this is feature extraction;
# the classifier itself is created in a later cell).
# min_df / max_df are given as integers, i.e. document counts: a term is kept only if it
# appears in at least 110 and at most 3000 training messages.
# NOTE(review): the printed training matrix only shows column indices 0-23, which suggests
# min_df=110 leaves a very small vocabulary — consider lowering it and re-checking accuracy.
feature_extraction = TfidfVectorizer(
    min_df=110,
    max_df=3000,
    stop_words="english",
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training messages only, then apply
# that same fitted mapping to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [10]:
# Inspect the extracted training features; printed as "(row, column)  value" entries
print(X_train_features)

  (0, 11)	0.5758891623774188
  (0, 6)	0.5788139320365375
  (0, 13)	0.577344009007314
  (3, 19)	0.4319747531259221
  (3, 22)	0.8022431912913773
  (3, 15)	0.4120724143744945
  (5, 8)	1.0
  (8, 12)	0.5788662144695679
  (8, 23)	0.5701985650076984
  (8, 14)	0.5829129456521219
  (10, 0)	0.9040180145733753
  (10, 15)	0.4274943617485647
  (11, 22)	1.0
  (13, 2)	0.39366233990697785
  (13, 21)	0.42440072091485836
  (13, 8)	0.35270432764004284
  (13, 22)	0.7351963326145586
  (14, 11)	1.0
  (18, 3)	0.729448870961509
  (18, 19)	0.6840353387457258
  (19, 4)	1.0
  (21, 23)	1.0
  (26, 10)	0.5071610088316674
  (26, 0)	0.5115378010204416
  (26, 6)	0.4910907032201974
  :	:
  (4153, 15)	1.0
  (4155, 1)	0.6987337424981027
  (4155, 23)	0.7153818260863182
  (4156, 16)	0.4862019196315351
  (4156, 22)	0.8738464930104201
  (4157, 21)	1.0
  (4158, 9)	0.6818884267770944
  (4158, 12)	0.7314562006213764
  (4160, 20)	0.4692345006004463
  (4160, 8)	0.7785328094668276
  (4160, 15)	0.4167800955299307
  (4161, 1)	0.9059

In [11]:
# Create the classifier with scikit-learn's default settings
model = LogisticRegression()

# Fit it on the TF-IDF features of the training messages and their spam/ham labels
model.fit(X_train_features, Y_train)

In [12]:
# Score the model on the same data it was trained on (an optimistic reference point)
trainingDataPreds = model.predict(X_train_features)
trainingDataAccuracy = accuracy_score(Y_train, trainingDataPreds)
# NOTE(review): if the labels are imbalanced, accuracy alone can look good for a weak model —
# consider also reporting precision/recall or a confusion matrix.
print("Accuracy on training data:", trainingDataAccuracy)

Accuracy on training data: 0.9097870303900455


In [13]:
# Score the model on the held-out test messages it did not see during fitting
testDataPreds = model.predict(X_test_features)
testDataAccuracy = accuracy_score(Y_test, testDataPreds)
print("Accuracy on testing data:", testDataAccuracy)

Accuracy on testing data: 0.908829863603733


In [14]:
# Put input here; potentially create a function call when implemented
new_input = ["test"]

# Vectorize with the already-fitted TF-IDF extractor (transform only, no re-fitting)
input_data_features = feature_extraction.transform(new_input)

# predict() returns an array with one label per input message
new_prediction = model.predict(input_data_features)
print(new_prediction)

# Compare the single message's label explicitly. Comparing the whole array to a string
# yields a boolean array, whose truth value is only defined when it has exactly one element.
if new_prediction[0] == "spam":
    print("This was flagged as spam!")
else:
    print("No spam detected")

['ham']
No spam detected


In [15]:
# Put input here; potentially create a function call when implemented
new_input = ["CONGRATS! YOU WON A FREE CRUISE SHIP WINNER --> SIGN UP HERE AT THIS LINK"]

# Vectorize with the already-fitted TF-IDF extractor (transform only, no re-fitting)
input_data_features = feature_extraction.transform(new_input)

# predict() returns an array with one label per input message
new_prediction = model.predict(input_data_features)
print(new_prediction)

# Compare the single message's label explicitly. Comparing the whole array to a string
# yields a boolean array, whose truth value is only defined when it has exactly one element.
if new_prediction[0] == "spam":
    print("This was flagged as spam!")
else:
    print("No spam detected")

['spam']
This was flagged as spam!


In [16]:
# Put input here; potentially create a function call when implemented
new_input = ["Luke is my buddy stop bugging him lol"]

# Vectorize with the already-fitted TF-IDF extractor (transform only, no re-fitting)
input_data_features = feature_extraction.transform(new_input)

# predict() returns an array with one label per input message
new_prediction = model.predict(input_data_features)
print(new_prediction)

# Compare the single message's label explicitly. Comparing the whole array to a string
# yields a boolean array, whose truth value is only defined when it has exactly one element.
if new_prediction[0] == "spam":
    print("This was flagged as spam!")
else:
    print("No spam detected")

['ham']
No spam detected


In [17]:
# Put input here; potentially create a function call when implemented
new_input = ["Free Minecoins at this server link -- check it out here with one click!"]

# Vectorize with the already-fitted TF-IDF extractor (transform only, no re-fitting)
input_data_features = feature_extraction.transform(new_input)

# predict() returns an array with one label per input message
new_prediction = model.predict(input_data_features)
print(new_prediction)

# Compare the single message's label explicitly. Comparing the whole array to a string
# yields a boolean array, whose truth value is only defined when it has exactly one element.
if new_prediction[0] == "spam":
    print("This was flagged as spam!")
else:
    print("No spam detected")

['spam']
This was flagged as spam!


In [18]:
# Put input here; potentially create a function call when implemented
new_input = ["I was merely stating how I felt about the state of the server haha"]

# Vectorize with the already-fitted TF-IDF extractor (transform only, no re-fitting)
input_data_features = feature_extraction.transform(new_input)

# predict() returns an array with one label per input message
new_prediction = model.predict(input_data_features)
print(new_prediction)

# Compare the single message's label explicitly. Comparing the whole array to a string
# yields a boolean array, whose truth value is only defined when it has exactly one element.
if new_prediction[0] == "spam":
    print("This was flagged as spam!")
else:
    print("No spam detected")

['ham']
No spam detected
