In [54]:
# Mount Google Drive at /content/drive (relies on the google.colab package)
# so that files stored on Drive can be opened through ordinary paths.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
# Location of the SMS spam dataset on the mounted Drive; it is read below as a
# tab-separated file with a label column and a message column.
data_file = '/content/drive/My Drive/py/Project1/SMSSpamCollection.txt'

In [56]:
import csv

import pandas as pd

# The dataset is raw tab-separated text (label<TAB>message) with no header row
# and no CSV quoting convention. QUOTE_NONE makes pandas treat a double quote
# inside a message as an ordinary character; with the default quoting, a
# message containing an unbalanced '"' can swallow the following lines into a
# single field and silently drop rows.
data = pd.read_csv(
  data_file,
  sep='\t',
  header=None,
  names=['label', 'sms'],
  quoting=csv.QUOTE_NONE,
)
data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [57]:
#load stopwords and punctuation
import string
import nltk

# Resources needed further down: the stopword corpus, plus the Punkt tokenizer
# models used by nltk.tokenize.word_tokenize. Newer NLTK releases look the
# models up under 'punkt_tab' rather than 'punkt', so both are requested to
# keep the notebook runnable on either side of that change.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# English stopword list and the ASCII punctuation characters; both are read as
# globals by pre_process().
stopwords = nltk.corpus.stopwords.words("english")
punctuation = string.punctuation

print(stopwords[:5])
print(punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['i', 'me', 'my', 'myself', 'we']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [58]:
#preprocess sms content

def pre_process(sms):
  """Turn one raw message into a list of lowercase, stopword-free tokens.

  Characters found in the global ``punctuation`` are dropped, every remaining
  character is lowercased, the cleaned text is split with
  ``nltk.tokenize.word_tokenize`` and tokens present in the global
  ``stopwords`` are removed.

  Args:
    sms: the message text.

  Returns:
    A list with the remaining tokens, in their original order.
  """
  kept_chars = []
  for ch in sms:
    if ch in punctuation:
      continue
    kept_chars.append(ch.lower())
  cleaned_text = "".join(kept_chars)

  tokens = nltk.tokenize.word_tokenize(cleaned_text)
  return list(filter(lambda token: token not in stopwords, tokens))

# Tokenize every message once and store the token lists next to the raw text.
data['process'] = data['sms'].apply(pre_process)
data.head()

Unnamed: 0,label,sms,process
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [59]:
#categorizing ham/spam associated words

def categorize_words():
  """Collect every preprocessed token of the global ``data`` frame by label.

  Reads the ``label`` and ``process`` columns of ``data``. Tokens are kept in
  row order and repeated as often as they occur, so the returned lists can be
  used for frequency counting.

  Returns:
    A ``(spam_words, ham_words)`` tuple of flat token lists.
  """
  is_spam = data['label'] == 'spam'
  is_ham = data['label'] == 'ham'

  # Flatten the per-message token lists of each class into one long list.
  spam_words = [token for tokens in data['process'][is_spam] for token in tokens]
  ham_words = [token for tokens in data['process'][is_ham] for token in tokens]

  return spam_words, ham_words

# Build the two word pools that predict() reads, then show the first few
# entries of each as a sanity check.
spam_words, ham_words = categorize_words()

print(spam_words[:5], ham_words[:5], sep='\n')

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


In [60]:
#iterate over all the words from the user input and count their occurrences in both ham_words and spam_words

def predict(user_input):
  """Print a spam/ham verdict for an already tokenized message.

  Every token is looked up in the global ``spam_words`` and ``ham_words``
  lists and its occurrences are summed per class. The class with the larger
  total wins; the percentage printed as "accuracy" is that class's share of
  all counted occurrences. Equal totals (including no hits at all) give the
  fixed 50% message.

  Args:
    user_input: iterable of tokens, e.g. the list returned by pre_process().

  Returns:
    None. The result is only printed.
  """
  spam_hits = 0
  ham_hits = 0
  for token in user_input:
    spam_hits += spam_words.count(token)
    ham_hits += ham_words.count(token)

  print('********************************* RESULTS **********************************')

  # Tie: nothing to compare, and the total may be zero.
  if ham_hits == spam_hits:
    print("The message could be spam, with 50% accuracy")
    return

  total_hits = ham_hits + spam_hits
  if ham_hits > spam_hits:
    verdict, winning_hits = "The Message is not spam, with {}% accuracy", ham_hits
  else:
    verdict, winning_hits = "The Message is Spam, with {}% accuracy", spam_hits

  accuracy = round((winning_hits / total_hits) * 100, 2)
  print(verdict.format(accuracy))


In [63]:
# Read one free-text message from the user; it is preprocessed and classified
# in the following cell.


user_input = input("please type a spam or ham message to check if the function predicts properly\n")

please type a spam or ham message to check if the function predicts properly
call this number to claim 5000 dollars 


In [64]:
# Run the typed message through the same pre_process() step that was applied
# to the dataset, then print the verdict for the resulting tokens.
processed_input = pre_process(user_input)

predict(processed_input)

********************************* RESULTS **********************************
The Message is Spam, with 62.22% accuracy
