In [64]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix


wordDict = []

def eliminateStop(words, stop_words):
    returnList = [w for w in words if not w in stop_words]
    return returnList

def createDictionary(wordList):
    for entry in wordList:
        for word in entry:
            if word not in wordDict:
                wordDict.append(word)

def getCounts(tokens):
    result = []
    for word in wordDict:
        count = 0
        for tok in tokens:
            if tok == word:
                count+=1
        result.append(count)
            
    return result

def main():
    pd.options.mode.chained_assignment = None
    # Read In Data and Format It Better
    df = pd.read_csv('spam.csv', sep=',', encoding='ISO-8859-1')
    df = df.drop(columns=[ "Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

    # Add 2 New Columns: For binary spam/ham indicator and length of message
    df['Result']= df['Classification'].map( {'spam' : int(1), 'ham' : int(0)})
    df['Message_Size'] = df['Text'].apply(len)

   
    # Split Set Into Test Set and Training Set
    total = df['Result'].count()
    half = math.floor(total / 2)
    dfTrain, dfTest = train_test_split(df, test_size=0.2, shuffle=True)
    
    # Tokenize And Remove StopWords From Training Set
    dfTrain['Tokens'] = dfTrain['Text'].apply(word_tokenize)
    stop_words = set(stopwords.words("english"))
    dfTrain['Filtered_Tokens'] = dfTrain['Tokens'].apply(eliminateStop, args=(stop_words,),)
    
    # Create the counts dictionary
    createDictionary(dfTrain['Filtered_Tokens'])
    print(f'Size of Word Dictionary: {len(wordDict)}')
    
    # Create the counts vector for each entry
    dfTrain['Counts'] = dfTrain['Filtered_Tokens'].apply(getCounts)
    
    
    # # #   Naive Bayes # # #
    classifier = MultinomialNB()
    classifierMan = MultinomialNB()
    
    
    targs = dfTrain['Result'].values
    # Vectorize Message
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(dfTrain['Text'].values)
    
    
    # Train data
    print(f'Shape of Auto Counts: {counts.shape}')
    print(type(counts))
    classifier.fit(counts, targs)
    
    npA = np.array(dfTrain['Counts'].values)
    
    classifierMan.fit(npA, targs)
    
    
    # Test With Test Set
    predictions = classifier.predict(vectorizer.transform(dfTest['Text']))
    #predictionsMan = classifierMan.predict(vectorizer.transform(dfTest['Text']))
    
    
    countRight = 0
    i = 0
    for entry in dfTest['Result']:
        if (entry == predictions[i]):
            countRight+=1
        i+=1
        
    '''
    countRightMan = 0
    i = 0
    for entry in dfTest['Result']:
        if (entry == predictionsMan[i]):
            countRight+=1
        i+=1    
    '''
    size = dfTest['Result'].size
    
    
    
    numSpams = dfTest[dfTest['Result']==1]['Result'].count()
    print(f'{numSpams / size * 100}% of Messages Were Spam')
    
    print(f'Success Rate Auto: {countRight / size * 100}%')
    #print(f'Success Rate Manual: {countRightMan / size * 100}%')
    
if __name__ =='__main__':
    main()

Size of Word Dictionary: 9986
Shape of Auto Counts: (4457, 7665)
<class 'scipy.sparse.csr.csr_matrix'>


ValueError: setting an array element with a sequence.