In [1]:
import pandas as pd
import numpy as np

from src.data_cleaning import data_labeling
from src.data_cleaning import feature_engineering
from src.language_processing import to_tfidf
from src.language_processing import to_tfidf_v3
from src.language_processing import to_tfidf_v4
from src.language_processing import to_tfidf_v5
from src.modeling import test_model

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split,KFold

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_excel('testing2.xlsx')
df.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
df.head()

Unnamed: 0,text,spam,spam_marketing,spam_hijack,spam_corporate,spam_bot,spam_known,spam_own,Docusign,onespan,...,adobe sign,listed_count,description,statuses_count,followers_count,favourites_count,friends_count,time_float_sin,time_float_cos,is_description_none
0,just closed a deal in 27 hours using #AdobeSig...,0,0,0,0,1,0,0,False,False,...,True,1,There's no substitute for experience! Call 609...,6750,108,1,12,-0.04914,0.998792,False
1,just closed a deal in 2 days using #AdobeSign ...,0,0,0,0,1,0,0,False,False,...,True,14,We provide an affordable way to have a wonderf...,5018,47,2,25,-0.501007,0.865443,False
2,just closed a deal in 2 hours using #AdobeSign...,0,0,0,0,1,0,0,False,False,...,True,0,,29,60,10,52,-0.599548,0.800339,True
3,just closed a deal in 26 hours using #AdobeSig...,0,0,0,0,1,0,0,False,False,...,True,2,,5332,66,0,182,-0.325843,0.945424,True
4,just closed a deal in 6 days using #AdobeSign ...,0,0,0,0,1,0,0,False,False,...,True,8,Carefully Crafted & Intentionally Inspired Eve...,1337,225,19,348,-0.098162,0.99517,False


In [4]:
df['text'] = df['text'].astype(str)
data = df['text'].values.tolist()

In [5]:
from nltk.corpus import stopwords
import string
import re

regex = re.compile('[%s]' % re.escape(string.punctuation))
stopwords_ = stopwords.words('english')
punctuation_ = set(string.punctuation)

def filter_stopwords_punct(sentence):
    """remove stop words and punctuation"""
    sentence = regex.sub('', sentence)
    text = [word for word in sentence.split() if word not in stopwords_]
    return ' '.join(text)


from nltk.tokenize import word_tokenize

def sentence_to_words_clean(txt):
    """function to tokenzine sentence"""
    for sentence in txt:
        yield(word_tokenize(sentence))


from nltk.stem import WordNetLemmatizer

def lemming(text):
    """a function which stems each word in the given text"""
    lemmer = WordNetLemmatizer()
    text = [lemmer.lemmatize(word) for word in text]
    return text

# removing stopwords/punctuation --> tokenize --> lemmatize


In [6]:
data1 = list(filter_stopwords_punct(sentence) for sentence in data)
data_words = list(sentence_to_words_clean(data1))             
data_lem = [lemming(sentence) for sentence in data_words]

print(data[0])
data_lem[0]

just closed a deal in 27 hours using #AdobeSign https://t.co/Fe0YfarG31


['closed', 'deal', '27', 'hour', 'using', 'AdobeSign', 'httpstcoFe0YfarG31']

In [7]:
df['lemmed'] = data_lem
df['lemmed'] = df['lemmed'].apply(lambda x: ', '.join(x))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features = 50)
X = vectorizer.fit_transform(df['lemmed'])

In [9]:
text_tfidf_matrix = X.toarray()

In [11]:
df2 = df
col_name_lst = []
for n in range(text_tfidf_matrix.shape[1]):
    col_name = f'text-TF-IDF-{n}'
    df2[col_name] = text_tfidf_matrix[:, n]
    col_name_lst.append(col_name)

In [12]:
df2.head()

Unnamed: 0,text,spam,spam_marketing,spam_hijack,spam_corporate,spam_bot,spam_known,spam_own,Docusign,onespan,...,text-TF-IDF-40,text-TF-IDF-41,text-TF-IDF-42,text-TF-IDF-43,text-TF-IDF-44,text-TF-IDF-45,text-TF-IDF-46,text-TF-IDF-47,text-TF-IDF-48,text-TF-IDF-49
0,just closed a deal in 27 hours using #AdobeSig...,0,0,0,0,1,0,0,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238573,0.0,0.0
1,just closed a deal in 2 days using #AdobeSign ...,0,0,0,0,1,0,0,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32519,0.0,0.0
2,just closed a deal in 2 hours using #AdobeSign...,0,0,0,0,1,0,0,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.376476,0.0,0.0
3,just closed a deal in 26 hours using #AdobeSig...,0,0,0,0,1,0,0,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.376476,0.0,0.0
4,just closed a deal in 6 days using #AdobeSign ...,0,0,0,0,1,0,0,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32519,0.0,0.0


In [13]:
model = MultinomialNB()
df3 = df

X = text_tfidf_matrix
y = df3['spam_bot']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

model.fit(X_train,y_train)

y_predict = model.predict(X_test)
y_pred = model.predict_proba(X_test)
print("Accuracy score of this model is" + "\n")
print(accuracy_score(y_test,y_predict))
print("\n" + "Confusion Matrix of this model is" + "\n")
print(confusion_matrix(y_test,y_predict))
print("\n" + "Log-Loss score of this model is" + "\n")
print(log_loss(y_test,y_pred))
print("\n" + "AUC score of this model is" + "\n")
print(roc_auc_score(y_test,y_predict))

Accuracy score of this model is

0.9872051179528188

Confusion Matrix of this model is

[[1955   57]
 [   7 2983]]

Log-Loss score of this model is

0.02362389674075928

AUC score of this model is

0.9846644214977692


In [14]:
model = MultinomialNB()
df3 = df

X = text_tfidf_matrix
y = df3['spam_marketing']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

model.fit(X_train,y_train)

y_predict = model.predict(X_test)
y_pred = model.predict_proba(X_test)
print("Accuracy score of this model is" + "\n")
print(accuracy_score(y_test,y_predict))
print("\n" + "Confusion Matrix of this model is" + "\n")
print(confusion_matrix(y_test,y_predict))
print("\n" + "Log-Loss score of this model is" + "\n")
print(log_loss(y_test,y_pred))
print("\n" + "AUC score of this model is" + "\n")
print(roc_auc_score(y_test,y_predict))

Accuracy score of this model is

0.9320271891243502

Confusion Matrix of this model is

[[4592   98]
 [ 242   70]]

Log-Loss score of this model is

0.1775461387396682

AUC score of this model is

0.6017317259854574
