# **Import Dataset**

In [3]:
import os
import tarfile
import urllib.request
import pandas as pd
import email
import string
from string import punctuation
import spacy
from bs4 import BeautifulSoup
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

In [4]:
DOWNLOAD_HAM = ["https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2",
                "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2",
                "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"]

DOWNLOAD_SPAM = ["https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2",
                 "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2"]


def fetch_tar_file(url,extract_path,file_name):
  os.makedirs(extract_path, exist_ok = True)
  tgz_path = os.path.join("datasets",file_name)
  urllib.request.urlretrieve(url,tgz_path)
  file_tgz = tarfile.open(tgz_path)
  file_tgz.extractall(extract_path)
  file_tgz.close()

i = 0
for url in DOWNLOAD_HAM:
  fetch_tar_file(url,"datasets",("ham"+str(i)))
  i = i + 1

i = 0
for url in DOWNLOAD_SPAM:
  fetch_tar_file(url,"datasets",("spam"+str(i)))
  i = i + 1



In [21]:
# def load_text_data(path,df):
#     subfolders = [f.path for f in os.scandir(path) if f.is_dir()]
#     print(subfolders)
#     for dir in subfolders:
#         print(dir)
#         label = 0 if dir.rfind("ham")!=-1 else 1
#         for file in os.scandir(dir):
#             print(file)
#             file_path = os.path.join(dir,file)
#             if os.path.isfile(file_path):
#                 text = open(file_path,"r",encoding="utf-8",errors="replace",)
#                 df = pd.concat([pd.DataFrame(columns=df.columns,data=[[text.read(),label]]),df],ignore_index=True)
#     return df

In [23]:
def load_text_data(path, df):
    subfolders = [f.path for f in os.scandir(path) if f.is_dir()]
    print(subfolders)
    for dir in subfolders:
        print(dir)
        label = 0 if "ham" in dir else 1  # Упрощение условия для определения метки
        for entry in os.scandir(dir):  # Используем 'entry' вместо 'file', так как 'file' может быть зарезервированным словом
            if entry.is_file():  # Проверка, является ли объект файлом
                file_path = entry.path  # Получение полного пути к файлу
                with open(file_path, "r", encoding="utf-8", errors="replace") as file:  # Открытие файла в контекстном менеджере
                    text = file.read()  # Чтение содержимого файла
                    # Создание временного DataFrame и добавление его к существующему df
                    temp_df = pd.DataFrame([[text, label]], columns=df.columns)
                    df = pd.concat([df, temp_df], ignore_index=True)
    return df

In [24]:
df = pd.DataFrame(columns=["text","label"])

df = load_text_data("datasets",df)

['datasets\\easy_ham', 'datasets\\easy_ham_2', 'datasets\\hard_ham', 'datasets\\spam', 'datasets\\spam_2']
datasets\easy_ham
datasets\easy_ham_2
datasets\hard_ham
datasets\spam
datasets\spam_2


In [28]:
df.label.unique()

array([0, 1], dtype=object)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6051 entries, 0 to 6050
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6051 non-null   object
 1   label   6051 non-null   object
dtypes: object(2)
memory usage: 94.7+ KB


In [27]:
df[(df["label"] == 1)].count()

text     1898
label    1898
dtype: int64

In [30]:
df.label.value_counts(normalize=True)

label
0    0.686333
1    0.313667
Name: proportion, dtype: float64

# **Data Cleaning**

In [31]:
#Regex is fantastic for its intended purpose: searching for highly-variable needles in highly-variable haystacks
#But it's very slow in our case, so we will use email.parser from email python library

In [32]:
def parse_text(text):
  parser = email.parser.Parser()
  parsed_mail_message = parser.parsestr(text)
  body = ""
  for element in parsed_mail_message.get_payload():
      body+=str(element)

  return body

In [33]:
df['text'] = [parse_text(text) for text in df['text']]

In [34]:
df.head()

Unnamed: 0,text,label
0,"Date: Wed, 21 Aug 2002 10:54:46 -05...",0
1,"Martin A posted:\nTassos Papadopoulos, the Gre...",0
2,Man Threatens Explosion In Moscow \n\nThursday...,0
3,Klez: The Virus That Won't Die\n \nAlready the...,0
4,"> in adding cream to spaghetti carbonara, whi...",0


# **Pre-Processing**

In [35]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 435.7 kB/s eta 0:00:30
     --------------------------------------- 0.0/12.8 MB 435.7 kB/s eta 0:00:30
     --------------------------------------- 0.0/12.8 MB 279.3 kB/s eta 0:00:46
     --------------------------------------- 0.1/12.8 MB 297.7 kB/s eta 0:00:43
     --------------------------------------- 0.1/12.8 MB 305.0 kB/s eta 0:00:42
     --------------------------------------- 0.1/12.8 MB 305.0 kB/s eta 0:00:42
     --------------------------------------- 0.1/12.8 MB

In [36]:
preprocessing_model = spacy.load('en_core_web_sm')

In [37]:
def is_ok(token):
  if token.is_stop:
    return False
  if len(token) <= 3:
    return False
  if token.is_punct:
    return False
  if token.is_space:
    return False

  return True

In [38]:
def preprocessing(text):
  text_without_tags = BeautifulSoup(text).get_text()

  for char in text_without_tags:
    if char.isdigit() or char in punctuation:
      text_without_tags = text_without_tags.replace(char," ")

  doc = preprocessing_model(text_without_tags)

  return [token.lemma_.lower() for token in doc if is_ok(token)]

In [39]:
df["text"] = [preprocessing(text) for text in df["text"]]

  text_without_tags = BeautifulSoup(text).get_text()
  text_without_tags = BeautifulSoup(text).get_text()


In [41]:
df.to_csv("Train1.csv")

In [42]:
shuffler = StratifiedShuffleSplit(n_splits=1,test_size=0.2)

for train_index,test_index in shuffler.split(df,df['label']):
    train_set = df.loc[train_index]
    test_set = df.loc[test_index]

In [43]:
train_x = train_set.text
train_y = train_set.label
test_x = test_set.text
test_y = test_set.label

In [44]:
def dummy(doc):
  return doc

tfidf = TfidfVectorizer(
    analyzer = "word",
    preprocessor = dummy,
    tokenizer = dummy,
    token_pattern = None
)

train_x = tfidf.fit_transform(train_x)
test_x = tfidf.transform(test_x)

In [45]:
train_y = train_y.astype('int')
test_y = test_y.astype('int')

# **Training and Testing**

**Logistic** **Regression**

In [46]:
lr = LogisticRegression()

lr.fit(train_x,train_y)

In [47]:
predictions = lr.predict(test_x)

In [48]:
acc_score = accuracy_score(test_y,predictions)
f1_score_ = f1_score(test_y,predictions)

In [49]:
print("Accuracy:",acc_score)
print("F1 Score:",f1_score_)

Accuracy: 0.9686209744013212
F1 Score: 0.9478021978021979


**SVM Classifier (with GridSearch Fine Tuning)**

In [50]:
param_grid = {'C':[1,0.1,0.01],'kernel':['rbf','linear']}
svc = SVC()
gs = GridSearchCV(svc,param_grid,cv=5,scoring='accuracy',return_train_score=True)

gs.fit(train_x,train_y)


In [51]:
gs.best_params_

{'C': 1, 'kernel': 'linear'}

In [52]:
svc = SVC(C=1,kernel='linear')
svc.fit(train_x,train_y)
predictions = svc.predict(test_x)

In [53]:
acc_score = accuracy_score(test_y,predictions)
f1_score_ = f1_score(test_y,predictions)

In [54]:
print("Accuracy:",acc_score)
print("F1 Score:",f1_score_)

Accuracy: 0.9876135425268373
F1 Score: 0.9799732977303071


**XGBoost (with GridSearch Fine Tuning)**

In [55]:
xgbc = XGBClassifier()
param_grid = {'learning_rate':[0.1],'min_child_weight':[1,2,0.5],'max_depth':[3,5,8]}

gs = GridSearchCV(xgbc,param_grid,cv=5,scoring='accuracy')
gs.fit(train_x,train_y)

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sp

In [56]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 2}

In [57]:
xgbc = XGBClassifier(n_estimators=400,learning_rate=0.1,max_depth=8,min_child_weight=2)
xgbc.fit(train_x,train_y)
predictions = xgbc.predict(test_x)

In [58]:
acc_score = accuracy_score(test_y,predictions)
f1_score_ = f1_score(test_y,predictions)

In [59]:
print("Accuracy:",acc_score)
print("F1 Score:",f1_score_)

Accuracy: 0.9826589595375722
F1 Score: 0.9721115537848606
