In [7]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix


def eliminateStop(words, stop_words):
    """Return the tokens of *words* that are not stop words.

    Args:
        words: Iterable of tokens to filter.
        stop_words: Container of tokens to drop; membership is tested with
            ``in``, so the comparison is exact (case-sensitive).

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    return [token for token in words if token not in stop_words]


def main():
    """Train a Naive Bayes spam classifier and report how well it does.

    Workflow:
      1. Load ``spam.csv`` and add a binary ``Result`` column
         (1 = spam, 0 = ham) plus a ``Message_Size`` column.
      2. Split the data 80/20 into shuffled training and test sets.
      3. Fit a ``MultinomialNB`` on bag-of-words counts of the raw
         training text.
      4. Print the share of spam in the test set and the accuracy on it.
      5. If ``userMessages.txt`` exists, classify each non-blank line in it
         and print the verdict.
    """
    # Read In Data and Format It Better
    df = pd.read_csv('spam.csv', sep=',', encoding='ISO-8859-1')
    df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

    # Add 2 New Columns: For binary spam/ham indicator and length of message
    df['Result'] = df['Classification'].map({'spam': 1, 'ham': 0})
    df['Message_Size'] = df['Text'].apply(len)

    # Split Set Into Test Set and Training Set
    dfTrain, dfTest = train_test_split(df, test_size=0.2, shuffle=True)
    # Work on an explicit copy so adding columns below is unambiguous,
    # instead of silencing pandas' chained-assignment warning globally.
    dfTrain = dfTrain.copy()

    # Tokenize And Remove StopWords From Training Set
    # NOTE(review): these columns are not fed to the classifier below, which
    # is trained on the raw 'Text' column -- confirm whether that is intended.
    dfTrain['Tokens'] = dfTrain['Text'].apply(word_tokenize)
    stop_words = set(stopwords.words("english"))
    dfTrain['Filtered_Tokens'] = dfTrain['Tokens'].apply(
        eliminateStop, args=(stop_words,)
    )

    # # #   Naive Bayes # # #
    classifier = MultinomialNB()
    targs = dfTrain['Result'].values

    # Vectorize Message
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(dfTrain['Text'].values)

    # Train data
    classifier.fit(counts, targs)

    # Test With Test Set
    predictions = classifier.predict(vectorizer.transform(dfTest['Text']))
    actual = dfTest['Result'].values
    size = actual.size

    # Element-wise comparison replaces the hand-rolled index counter.
    countRight = int((predictions == actual).sum())
    numSpams = int((actual == 1).sum())

    print(f'{numSpams / size * 100}% of Test Messages Were Spam')
    print(f'Success Rate: {countRight / size * 100}%\n\n')

    # Read User File For More!
    fileName = 'userMessages.txt'
    try:
        with open(fileName) as f:
            # Drop the trailing newline so each result prints on one line,
            # and skip blank lines, which carry no message to classify.
            messages = [line.rstrip('\n') for line in f if line.strip()]
    except FileNotFoundError:
        print(f'{fileName} not found; skipping user messages.')
        return

    if not messages:
        # Predicting on zero samples raises inside scikit-learn.
        return

    userPredictions = classifier.predict(vectorizer.transform(messages))
    for message, res in zip(messages, userPredictions):
        # The model only ever emits the two labels it was trained on (0/1).
        answer = 'Spam!' if res == 1 else 'Ham!'
        print(f'Message: {message}  --> {answer}')
    
    
# Run the training/evaluation pipeline only when executed as a script,
# not when this module is imported.
if __name__ =='__main__':
    main()

12.197309417040358% of Test Messages Were Spam
Success Rate: 98.83408071748879%


Message: Hello! Hope you are doing well today
  --> Ham!
Message: When are you heading over to my place 
  --> Ham!
Message: get it free now! win prizes and big savings! reply
  --> Spam!
Message: i mean i guess so thats what i was thinking
  --> Ham!
