In [303]:
import pandas as pd
import numpy as np

# import plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

In [304]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import  MultinomialNB

In [305]:
from sklearn.feature_extraction.text import CountVectorizer

In [306]:
pd.set_option('max_colwidth', 400)

In [307]:
sms = pd.read_csv(r'D:\AI-DATASETS\01-MISC\sms.tsv', header=None, sep='\t',names=['label', 'message'])

In [308]:
sms.sample(10)

Unnamed: 0,label,message
107,ham,"Aight, I'll hit you up when I get some cash"
4112,spam,URGENT! Your Mobile number has been awarded a <UKP>2000 prize GUARANTEED. Call 09061790125 from landline. Claim 3030. Valid 12hrs only 150ppm
966,ham,Or better still can you catch her and let ask her if she can sell &lt;#&gt; for me.
1789,ham,Arun can u transfr me d amt
2406,ham,I'm meeting Darren...
4255,ham,"How about clothes, jewelry, and trips?"
351,ham,"Nah can't help you there, I've never had an iphone"
806,ham,I dled 3d its very imp
1507,spam,Thanks for the Vote. Now sing along with the stars with Karaoke on your mobile. For a FREE link just reply with SING now.
221,ham,Ok no prob. Take ur time.


In [309]:
sms.label.value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

to see if there is class imbalance

In [310]:
# convert label to a numerical variable
sms['label_num'] = sms.label.map({'ham': 0, 'spam': 1})

In [311]:
sms.head(10)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives around here though",0
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",1
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.,0
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune,0
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.,1
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030,1


In [312]:
X = sms.message
y = sms.label_num

In [313]:
from sklearn.model_selection import train_test_split

In [314]:
# split X and y into training ans testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1) 

In [315]:
X_train.shape, X_test.shape

((4179,), (1393,))

In [316]:
vect = CountVectorizer()

In [317]:
vect.fit(X_train, y_train)

CountVectorizer()

In [318]:
X_train_dtm = vect.transform(X_train)

In [319]:
X_train_dtm.shape

(4179, 7456)

In [320]:
np.set_printoptions(linewidth=140, edgeitems=12)

In [321]:
X_train_dtm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0,

#### Using KNN classifier

In [322]:
knn = KNeighborsClassifier(n_neighbors=3,p=1, metric='minkowski')
knn.fit(X_train_dtm, y_train)

KNeighborsClassifier(n_neighbors=3, p=1)

In [323]:
X_test_dtm = vect.transform(X_test)   

y_pred     = knn.predict(X_test_dtm)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [324]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#### accuracy_score(y_test, y_pred)

In [325]:
confusion_matrix(y_test, y_pred)

array([[1208,    0],
       [ 106,   79]], dtype=int64)

In [326]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1208
           1       1.00      0.43      0.60       185

    accuracy                           0.92      1393
   macro avg       0.96      0.71      0.78      1393
weighted avg       0.93      0.92      0.91      1393



#### Try logistic regression

In [327]:
model = LogisticRegression()

model.fit(X_train_dtm, y_train)

LogisticRegression()

In [328]:
X_test_dtm = vect.transform(X_test)

In [329]:
y_pred = model.predict(X_test_dtm)

In [330]:
accuracy_score(y_test, y_pred)

0.9877961234745154

In [331]:
confusion_matrix(y_test, y_pred)

array([[1207,    1],
       [  16,  169]], dtype=int64)

In [332]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1208
           1       0.99      0.91      0.95       185

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



#### try naive bayes algo

In [333]:
from sklearn.naive_bayes import MultinomialNB

In [334]:
nb = MultinomialNB()

In [335]:
nb.fit(X_train_dtm, y_train)

MultinomialNB()

In [336]:
np.set_printoptions(edgeitems=10)

In [337]:
X_train_dtm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 

In [338]:
X_train_dtm.shape

(4179, 7456)

In [339]:
y_pred_class = nb.predict(X_test_dtm)

accuracy_score(y_test, y_pred_class)

0.9885139985642498

In [340]:
confusion_matrix(y_test, y_pred_class)

array([[1203,    5],
       [  11,  174]], dtype=int64)

In [341]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 140)  # or 199

In [342]:
sms.sample(20)

Unnamed: 0,label,message,label_num
414,ham,Bring home some Wendy =D,0
4831,ham,"The word ""Checkmate"" in chess comes from the Persian phrase ""Shah Maat"" which means; ""the king is dead.."" Goodmorning.. Have a good day..:)",0
50,ham,What you thinked about me. First time you saw me in class.,0
3707,ham,Reading gud habit.. Nan bari hudgi yorge pataistha ertini kano:-),0
1174,ham,Ü dun need to pick ur gf?,0
4938,ham,G wants to know where the fuck you are,0
1639,ham,Great comedy..cant stop laughing da:),0
3205,ham,She's good. How are you. Where r u working now,0
559,ham,Hi Princess! Thank you for the pics. You are very pretty. How are you?,0
87,ham,Yes I started to send requests to make it but pain came back so I'm back in bed. Double coins at the factory too. I gotta cash in all my...,0


#### try some vectorizer settings

In [343]:
# vectorizer tuning
import nltk
from nltk.corpus import stopwords
from string import punctuation

# Stemmer
from nltk.stem import PorterStemmer
porter = PorterStemmer()

# lemmatizing
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [344]:
# Stopwords from stopwords-json
# https://raw.githubusercontent.com/6/stopwords-json/master/dist/en.json

stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}

In [345]:
stopwords_json_en = set(stopwords_json['en'])
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct   = set(punctuation)

In [346]:
# Combine the stopwords. Its a lot longer so I'm not printing it out...
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct)

In [347]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [348]:
vect = CountVectorizer(stop_words    = stoplist_combined, 
                       token_pattern = '(?u)\\b[a-zA-Z]{3,}\\b')

In [349]:
vect

CountVectorizer(stop_words={'!', '"', '#', '$', '%', '&', "'", '(', ')', '*',
                            '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
                            '?', '@', '[', '\\', ']', '^', '_', '`', 'a', "a's", ...},
                token_pattern='(?u)\\b[a-zA-Z]{3,}\\b')

In [350]:
vect.fit(X_train)                     # create the fitted vocabulary
X_train_dtm = vect.transform(X_train) # create the document-term matrix



In [351]:
# Number of rows = number of training samples,
# number of columns = features
X_train_dtm.shape

(4179, 5943)

In [352]:
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x5943 sparse matrix of type '<class 'numpy.int64'>'
	with 7238 stored elements in Compressed Sparse Row format>

In [353]:
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)

y_pred_class = nb.predict(X_test_dtm)
accuracy_score(y_test, y_pred_class)

0.9806173725771715

In [354]:
# Get the feature names (tokens)
feature_names = vect.get_feature_names_out()
feature_names

array(['aah', 'aaniye', 'aaooooright', 'aathi', 'abbey', 'abdomen', 'abeg', 'abel', 'aberdeen', 'abi', ..., 'zealand', 'zebra', 'zed',
       'zeros', 'zhong', 'zindgi', 'zoe', 'zoom', 'zouk', 'zyada'], dtype=object)

#### use NB as pre-processing

In [355]:
# Get the log probabilities of each feature given a class
log_prob = nb.feature_log_prob_
log_prob.shape

(2, 5943)

In [356]:
log_prob

array([[ -8.74201533,  -9.43516251,  -9.43516251,  -8.18239955,  -9.43516251,  -9.43516251,  -9.43516251,  -9.43516251, -10.12830969,
         -9.43516251, ...,  -9.43516251, -10.12830969, -10.12830969,  -9.43516251,  -9.43516251,  -9.02969741,  -9.43516251,  -9.43516251,
        -10.12830969,  -9.43516251],
       [ -9.41897924,  -9.41897924,  -9.41897924,  -9.41897924,  -9.41897924,  -9.41897924,  -9.41897924,  -9.41897924,  -8.72583206,
         -9.41897924, ...,  -9.41897924,  -8.72583206,  -7.47306909,  -9.41897924,  -9.41897924,  -9.41897924,  -8.72583206,  -9.41897924,
         -8.72583206,  -9.41897924]])

In [357]:
# Convert to DataFrame for easier manipulation
tokens_df = pd.DataFrame(log_prob.T, index=feature_names, columns=nb.classes_)

In [358]:
tokens_df

Unnamed: 0,0,1
aah,-8.742015,-9.418979
aaniye,-9.435163,-9.418979
aaooooright,-9.435163,-9.418979
aathi,-8.182400,-9.418979
abbey,-9.435163,-9.418979
...,...,...
zindgi,-9.029697,-9.418979
zoe,-9.435163,-8.725832
zoom,-9.435163,-9.418979
zouk,-10.128310,-8.725832


In [359]:
# Get the top tokens for each class
top_ham_tokens  = pd.DataFrame(tokens_df[0].sort_values(ascending=False)).head(10)
top_spam_tokens = pd.DataFrame(tokens_df[1].sort_values(ascending=False)).head(10)

top_ham_tokens['prob']  = np.exp(top_ham_tokens[0])
top_spam_tokens['prob'] = np.exp(top_spam_tokens[1])

In [360]:
top_ham_tokens

Unnamed: 0,0,prob
good,-4.95216,0.007068
call,-4.975018,0.006908
day,-5.053136,0.006389
time,-5.078454,0.00623
love,-5.097872,0.00611
lor,-5.349186,0.004752
home,-5.366136,0.004672
today,-5.400922,0.004512
dont,-5.464871,0.004233
back,-5.53319,0.003953


In [362]:
tokens_df.columns = ['ham', 'spam']

In [363]:
# Sort tokens based on their log probabilities for each class
# top_ham_tokens  = tokens_df[['ham']].sort_values(by='ham', ascending=False).rename(columns={'ham': 'ham_log_prob'})
# top_spam_tokens = tokens_df[['spam']].sort_values(by='spam', ascending=False).rename(columns={'spam': 'spam_log_prob'})

top_ham_tokens  = tokens_df[['ham']].rename(columns={'ham': 'ham_log_prob'})
top_spam_tokens = tokens_df[['spam']].rename(columns={'spam': 'spam_log_prob'})

# Combine the two dataframes based on tokens
combined_tokens = pd.merge(top_ham_tokens, 
                           top_spam_tokens, 
                           left_index=True, 
                           right_index=True, 
                           how='inner')

# Calculate the spam-factor for each token
combined_tokens['spam_factor'] = np.exp(combined_tokens['spam_log_prob']) / np.exp(combined_tokens['ham_log_prob'])

# Optionally, you can sort the combined dataframe based on the contribution
combined_tokens = combined_tokens.sort_values(by='spam_factor', ascending=False)

In [364]:
combined_tokens

Unnamed: 0,ham_log_prob,spam_log_prob,spam_factor
claim,-10.128310,-4.930343,180.904058
prize,-10.128310,-5.088246,154.479870
tone,-10.128310,-5.547778,97.566234
guaranteed,-10.128310,-5.657779,87.403084
www,-9.435163,-5.128520,74.190990
...,...,...,...
amp,-6.120977,-9.418979,0.036957
morning,-6.017436,-9.418979,0.033322
yeah,-5.969427,-9.418979,0.031760
lol,-5.969427,-9.418979,0.031760


#### considering useful bi-grams and n-grams

In [365]:
ngram_vectorizer = CountVectorizer(
                                   # stop_words    = stoplist_combined, 
                                   token_pattern = '(?u)\\b[a-zA-Z]{3,}\\b',
                                   ngram_range   = (1, 3))

In [366]:
X_ngrams      = ngram_vectorizer.fit_transform(X_train)
feature_names = ngram_vectorizer.get_feature_names_out()

In [367]:
from sklearn.feature_selection import chi2

In [368]:
# Calculate chi-square scores and p-values for all n-grams
chi2_scores, p_values = chi2(X_ngrams, y_train)

In [369]:
# Create a DataFrame with tokens, scores, and p-values
chi2_df = pd.DataFrame({'token': feature_names, 'score': chi2_scores, 'p_value': p_values})

`Higher Scores`: A higher chi-square score indicates that the feature is more strongly associated with the class labels. In other words, the feature occurs more frequently in one class compared to the other, suggesting it might be a good predictor for that class.

`Lower Scores`: A lower chi-square score indicates a weaker association, meaning the feature does not show a significant difference in occurrence between the classes.

In [370]:
# Sort the DataFrame by scores in descending order
chi2_df = chi2_df.sort_values(by='score', ascending=False)

In [371]:
chi2_df.head(20)

Unnamed: 0,token,score,p_value
7972,call,866.895663,1.543319e-190
18769,free,722.963161,3.0361800000000004e-159
56289,txt,635.462794,3.2413200000000004e-140
10172,claim,566.362989,3.4711120000000005e-125
33465,mobile,510.34065,5.346862e-113
41131,prize,482.69573,5.535569999999999e-107
62638,www,455.069647,5.686352e-101
48601,stop,441.964833,4.044129e-98
50366,text,363.173912,5.734321e-81
42781,reply,351.532338,1.965411e-78


In [372]:
# Filter to include only ngrams
ngrams_df = chi2_df[chi2_df['token'].str.split().str.len() == 2]

# Sort the DataFrame by scores in descending order
ngrams_df = ngrams_df.sort_values(by='score', ascending=False)

ngrams_df.head(50)

Unnamed: 0,token,score,p_value
23643,have won,283.181495,1.521623e-63
8123,call now,245.926484,2.0069270000000002e-55
8054,call from,241.384057,1.963011e-54
65201,your mobile,231.69395,2.546397e-52
63933,you have,197.092347,9.002635000000001e-45
40229,please call,158.309418,2.648695e-36
22191,guaranteed call,141.590747,1.194999e-32
41150,prize guaranteed,135.154804,3.054046e-31
28781,land line,109.411032,1.318935e-25
19419,from land,109.411032,1.318935e-25


In [373]:
# Initialize columns for counts
ngrams_df['count_in_spam'] = 0
ngrams_df['count_in_ham']  = 0

In [374]:
# Function to count occurrences of token in the review column using 'in'
def count_token_occurrences(token, reviews):
    token_lower = token.lower()
    return sum(token_lower in review.lower() for review in reviews)

In [375]:
%%time

# Loop through each n-gram
for index, row in ngrams_df.iterrows():
    token = row['token']
    
    # Count occurrences in spam reviews
    spam_reviews = sms[sms['label'] == 'spam']['message']
    ham_reviews  = sms[sms['label'] == 'ham']['message']
    
    ngrams_df.at[index, 'count_in_spam'] = count_token_occurrences(token, spam_reviews)
    ngrams_df.at[index, 'count_in_ham']  = count_token_occurrences(token, ham_reviews)

Wall time: 3min 22s


In [376]:
ngrams_df[ngrams_df.p_value < 0.05].head(50)

Unnamed: 0,token,score,p_value,count_in_spam,count_in_ham
23643,have won,283.181495,1.521623e-63,54,0
8123,call now,245.926484,2.0069270000000002e-55,19,3
8054,call from,241.384057,1.963011e-54,13,3
65201,your mobile,231.69395,2.546397e-52,50,0
63933,you have,197.092347,9.002635000000001e-45,71,45
40229,please call,158.309418,2.648695e-36,46,9
22191,guaranteed call,141.590747,1.194999e-32,5,0
41150,prize guaranteed,135.154804,3.054046e-31,22,0
28781,land line,109.411032,1.318935e-25,18,0
19419,from land,109.411032,1.318935e-25,35,0


In [165]:
ngrams_df[ngrams_df.count_in_spam>10].sort_values(['count_in_spam'], ascending=[False])

Unnamed: 0,token,score,p_value,count_in_spam,count_in_ham
63928,you hav,0.310755,5.772174e-01,72.0,48.0
63933,you have,197.092347,9.002635e-45,71.0,45.0
65197,your mob,38.615658,5.160327e-10,56.0,0.0
23643,have won,283.181495,1.521623e-63,54.0,0.0
65201,your mobile,231.693950,2.546397e-52,50.0,0.0
...,...,...,...,...,...
42842,reply stop,64.359431,1.036710e-15,11.0,0.0
2703,and tell,25.829599,3.729255e-07,11.0,7.0
6561,bonus caller,32.179715,1.405505e-08,11.0,0.0
41045,price line,32.179715,1.405505e-08,11.0,0.0


In [377]:
# Bigrams to add as features
bigrams_to_add = list(ngrams_df[ngrams_df.count_in_spam>10].sort_values(['count_in_spam'], ascending=[False])['token'].values)

In [170]:
bigrams_to_add

['you hav',
 'you have',
 'your mob',
 'have won',
 'your mobile',
 'please call',
 'for you',
 'from land',
 'you are',
 'find out',
 'customer service',
 'for your',
 'prize guaranteed',
 'call mobile',
 'send stop',
 'all mobile',
 'contact you',
 'every week',
 'call now',
 'from landline',
 'await collection',
 'per min',
 'land line',
 'out who',
 'the late',
 'the latest',
 'for free',
 'account statement',
 'statement for',
 'identifier code',
 'all free',
 'the word',
 'free entry',
 'ave been',
 'this week',
 'had you',
 'dating service',
 'txt stop',
 'draw shows',
 'had your',
 'just call',
 'just txt',
 'have been',
 'free text',
 'call free',
 'waiting for',
 'that you',
 'are trying',
 'shows that',
 'hows that',
 'the new',
 'has been',
 'get txt',
 'call from',
 'camera phone',
 'gift voucher',
 'your phone',
 'double mins',
 'claim code',
 'half price',
 'txt nok',
 'caller prize',
 'our dating',
 'nokia tone',
 'collect your',
 'opt out',
 'our free',
 'new nokia',
 

In [273]:
# Function to add bigrams explicitly to the review text
def add_bigram_feature(review, bigrams):
    review_lower    = review.lower()
    modified_review = review_lower
    
    for bigram in bigrams:
        bigram_underscore = bigram.replace(" ", ":")
        if bigram in review_lower:
            modified_review += f" {bigram_underscore}"
    return modified_review

In [274]:
# Apply the function to add bigrams as features
sms['modified_review'] = sms['message'].apply(add_bigram_feature, bigrams=bigrams_to_add)

In [275]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 400) 
pd.set_option('display.width', None)  

In [279]:
sms.sample(5)

Unnamed: 0,label,message,label_num,modified_review
1768,ham,"K, want us to come by now?",0,"k, want us to come by now?"
3085,ham,Ok lor. I ned 2 go toa payoh 4 a while 2 return smth u wan 2 send me there or wat?,0,ok lor. i ned 2 go toa payoh 4 a while 2 return smth u wan 2 send me there or wat?
3360,spam,Sorry I missed your call let's talk when you have the time. I'm on 07090201529,1,sorry i missed your call let's talk when you have the time. i'm on 07090201529 you:hav you:have
3255,ham,I can make lasagna for you... vodka...,0,i can make lasagna for you... vodka... for:you
4379,ham,"Doing nothing, then u not having dinner w us?",0,"doing nothing, then u not having dinner w us?"


#### Build NB with the new data

In [280]:
vect = CountVectorizer(
                       stop_words    = stoplist_combined, 
                       token_pattern = '(?u)\\b[a-zA-Z:]{3,}\\b',
                       ngram_range   = (1, 1)
)

In [281]:
X = sms.modified_review
y = sms.label_num

In [282]:
# split X and y into training ans testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1) 

In [283]:
X_train.shape, X_test.shape

((4179,), (1393,))

In [284]:
vect.fit(X_train)
feature_names = vect.get_feature_names_out()



In [285]:
feature_names

array(['aah', 'aaniye', 'aaooooright', 'aathi', 'abbey', 'abdomen', 'abeg', 'abel', 'aberdeen', 'abi', ..., 'zealand', 'zebra', 'zed',
       'zeros', 'zhong', 'zindgi', 'zoe', 'zoom', 'zouk', 'zyada'], dtype=object)

In [286]:
X_train_dtm = vect.transform(X_train) # create the document-term matrix

In [287]:
X_train_dtm.shape

(4179, 6048)

In [288]:
X_test_dtm = vect.transform(X_test)

In [289]:
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)

MultinomialNB()

In [290]:
y_pred_class = nb.predict(X_test_dtm)

print('Accuracy : ', accuracy_score(y_test, y_pred_class))

Accuracy :  0.9791816223977028


#### handling contractions and expansions 

In [378]:
pip install contractions






In [387]:
import contractions

text = "I haven't  the code"
expanded_text = contractions.fix(text)

print(expanded_text)

I have not  the code


In [382]:
# Function to expand contractions in a text
def expand_contractions(text):
    return contractions.fix(text)

# Sample text with contractions
sample_text = "I can't believe it's already 2024! We're going to have so much fun."

# Expanding contractions
expanded_text = expand_contractions(sample_text)

print("Original Text:")
print(sample_text)
print("\nExpanded Text:")
print(expanded_text)

Original Text:
I can't believe it's already 2024! We're going to have so much fun.

Expanded Text:
I cannot believe it is already 2024! We are going to have so much fun.
