In [None]:
# create a spammer identification using guassian mixture model (SI-GMM)
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score


# read the csv file and convert it to a numpy array
df = pd.read_csv('/content/spam-DATASET.csv', encoding='latin-1')
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [None]:
# replace ham with 0 and spam with 1
df['v1'] = df['v1'].apply(lambda x: 0 if x == 'ham' else 1)

# drop the unnecessary columns
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [None]:
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
 #function to find percentage of digits in a string
def find_digit_percent(string):
    count = 0
    for i in string:
        if i.isdigit():
            count += 1
    return count / (len(string) + 1)

# function to find percentage of question marks in a string
def find_question_percent(string):
    count = 0
    for i in string:
        if i == '?':
            count += 1
    return count / (len(string) + 1)

# function to find percentage of exclamation marks in a string
def find_exclamation_percent(string):
    count = 0
    for i in string:
        if i == '!':
            count += 1
    return count / (len(string) + 1)

# function to find percentage of capital letters in a string
def find_capital_percent(string):
    count = 0
    for i in string:
        if i.isupper():
            count += 1
    return count / (len(string) + 1)

# function to find percentage of special characters in a string
def find_special_percent(string):
    count = 0
    for i in string:
        if not i.isalnum():
            count += 1
    return count / (len(string) + 1)

# function to if a string contains a emoji
def find_emoji(string):
    return int(':)' in string or ':(' in string or ':-)' in string or ':=D' in string or ':D' in string or ':P' in string)


In [None]:
df['digit_percent'] = df['v2'].apply(find_digit_percent)
df['question_percent'] = df['v2'].apply(find_question_percent)
df['exclamation_percent'] = df['v2'].apply(find_exclamation_percent)
df['capital_percent'] = df['v2'].apply(find_capital_percent)
df['special_percent'] = df['v2'].apply(find_special_percent)
df['emoji'] = df['v2'].apply(find_emoji)

In [None]:
df.head()

Unnamed: 0,v1,v2,digit_percent,question_percent,exclamation_percent,capital_percent,special_percent,emoji
0,0,"Go until jurong point, crazy.. Available only ...",0.0,0.0,0.0,0.026786,0.25,0
1,0,Ok lar... Joking wif u oni...,0.0,0.0,0.0,0.066667,0.366667,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0.160256,0.0,0.0,0.064103,0.211538,0
3,0,U dun say so early hor... U c already then say...,0.0,0.0,0.0,0.04,0.32,0
4,0,"Nah I don't think he goes to usf, he lives aro...",0.0,0.0,0.0,0.032258,0.225806,0


In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stopwords = set(stopwords.words('english'))

def isalpha(string):
    string = string.replace('.', '')
    return string.isalpha()

def clean_sms(string):
    string = string.lower()
    return (' '.join(filter(lambda x: isalpha(x) and x not in stopwords, string.split()))).replace('.', '').split()

cv = CountVectorizer(analyzer=clean_sms,strip_accents='ascii', min_df=2)
df = pd.concat([df, pd.DataFrame(cv.fit_transform(df['v2']).toarray(), columns=cv.get_feature_names())], axis=1)

df.head()



Unnamed: 0,v1,v2,digit_percent,question_percent,exclamation_percent,capital_percent,special_percent,emoji,aah,aathilove,...,yrs,yummy,yun,yunny,yuo,yup,zed,zoe,åð,ìï
0,0,"Go until jurong point, crazy.. Available only ...",0.0,0.0,0.0,0.026786,0.25,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,Ok lar... Joking wif u oni...,0.0,0.0,0.0,0.066667,0.366667,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0.160256,0.0,0.0,0.064103,0.211538,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,U dun say so early hor... U c already then say...,0.0,0.0,0.0,0.04,0.32,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,"Nah I don't think he goes to usf, he lives aro...",0.0,0.0,0.0,0.032258,0.225806,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# split the data into training and testing data
X = df.drop(['v1', 'v2'], axis=1)
y = df['v1']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# predict the labels
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X_train,y_train)

GaussianMixture(n_components=2, random_state=42)

In [None]:
y_pred = gmm.predict(X_test)

In [None]:
print(pd.Series(y_pred).value_counts())

1    1522
0     150
dtype: int64


In [None]:
# calculate the accuracy
print(accuracy_score(y_test, y_pred))

0.12260765550239235


In [None]:
# calculate the confusion matrix
print(confusion_matrix(y_test, y_pred))

[[  68 1385]
 [  82  137]]


In [None]:
# calculate the precision and recall
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.0900131406044678
0.6255707762557078


In [None]:
def test_message(message):
    message = pd.Series(message)
    message = pd.DataFrame(message, columns=['v2'])
    message['digit_percent'] = message['v2'].apply(find_digit_percent)
    message['question_percent'] = message['v2'].apply(find_question_percent)
    message['exclamation_percent'] = message['v2'].apply(find_exclamation_percent)
    message['capital_percent'] = message['v2'].apply(find_capital_percent)
    message['special_percent'] = message['v2'].apply(find_special_percent)
    message['emoji'] = message['v2'].apply(find_emoji)

    message = pd.concat([message, pd.DataFrame(cv.transform(message['v2']).toarray(), columns=cv.get_feature_names())], axis=1)
    X_test = message.drop(['v2'], axis=1)

    return gmm.predict(X_test)

result = test_message(['Hey, how are you doing ?'])

if result == 0:
    print('ham')
else:
    print('spam')

ham




In [None]:
import pickle
pickle.dump(gmm, open('model.pkl', 'wb'))

In [None]:
# save the vectorizer
pickle.dump(cv, open('vectorizer.pkl', 'wb'))

In [None]:
print(pd.Series(y_pred).value_counts())

0    1113
1       2
dtype: int64


In [None]:
print(pd.Series(y_test).value_counts())

0    965
1    150
Name: v1, dtype: int64


In [None]:
import joblib
filename = 'finalized_model.sav'
joblib.dump(gmm, filename)

['finalized_model.sav']

In [None]:
# compare the results with the SVM model
from sklearn.svm import SVC
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(accuracy_score(y_test, y_pred))

# compare the results with the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

# compare the results with the Random Forest model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(accuracy_score(y_test, y_pred))

# compare the results with the KNN model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

# compare the results with the Decision Tree model
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print(accuracy_score(y_test, y_pred))