In [1]:
import os
import pandas as pd
import numpy as np
import nltk
import re

import torch
import fasttext

from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

from collections import Counter

from sklearn import feature_extraction, model_selection, pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from lightgbm import LGBMClassifier

from sklearn import set_config
set_config(display="diagram")

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style="ticks", context="talk")
# sns.set(style="darkgrid", context="talk")
plt.style.use("seaborn-pastel")
plt.rcParams.update({"grid.linewidth":0.4, "grid.alpha":0.8})

# Basic EDA

In [3]:
df = pd.read_csv("train_spam.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16278 entries, 0 to 16277
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_type  16278 non-null  object
 1   text       16278 non-null  object
dtypes: object(2)
memory usage: 254.5+ KB


In [5]:
df.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [6]:
df.describe()

Unnamed: 0,text_type,text
count,16278,16278
unique,2,16267
top,ham,SPAM ALERT 🚔 User: Username: @DillyBubbl...
freq,11469,7


Маппинг целевой переменной

In [7]:
df['text_type'] = df['text_type'].map({"ham": 0, "spam": 1})

Текст уже выглядит предобработанным, но стемминг с лемматизацией сказываются на результате в сотых долях.

In [8]:
nltk.download('wordnet')

def preprocess_text(text, flg_stemm=True, flg_lemm=True, stopword_collection=None):
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    lst_text = text.split()

    if stopword_collection is not None:
        lst_text = [
            word for word in lst_text
            if word not in stopword_collection
        ]

    if flg_stemm == True:
#         ps = nltk.stem.porter.PorterStemmer()
        ps = SnowballStemmer("english")
        lst_text = [ps.stem(word) for word in lst_text]

    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    text = " ".join(lst_text)
    return text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
df["text_clean"] = df["text"].apply(lambda x: preprocess_text(x))
# df["text_clean"] = df["text"]
df["text_clean"].head()

0    make sure alex know his birthday is over in fi...
1    a resum for john lavorato thank vinc i will ge...
2    plzz visit my websit moviesgodml to get all mo...
3    urgent your mobil number ha been award with a ...
4    overview of hr associ analyst project per davi...
Name: text_clean, dtype: object

Интересно было посмотреть на самые часто встречающиеся слова, которые встречаются в спам сообщениях, но не входят в топ самых встречающихся в обычных.

In [10]:
spam_most_common = Counter(" ".join(df[df['text_type'] == 1]["text_clean"]).split()).most_common(50)
ham_most_common = Counter(" ".join(df[df['text_type'] == 0]["text_clean"]).split()).most_common(50)

print([w for w, c in spam_most_common if w not in [ww for ww, cc in ham_most_common]])

['free', 'our', 'get', 'all', 'now', 'more', 'no', '1', 'just', 'here', 'invest', 'call', 'receiv', 'click', 'new', 'onli', 'do']


# Data Split & Pipeline

In [11]:
df_train, df_test = model_selection.train_test_split(df, test_size=0.3, random_state=0)

y_train = df_train["text_type"].values
y_test = df_test["text_type"].values

In [12]:
X_train = df_train[["text_clean"]]
X_test = df_test[["text_clean"]]

In [13]:
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1, 3))

In [14]:
text_features = "text_clean"
text_transformer = pipeline.Pipeline(steps=[
    ('vectorizer', vectorizer)
])

preprocessor = ColumnTransformer(transformers=[
        ('text', text_transformer, text_features),
])

Пайплайны построены по схеме Vectorizer + Model. Т.к. TF-IDF учитывает "важность" слов, он должен быть предпочтительнее в большем числе случаев, чем Count Vectorizer.

## kNN

Нужен исключительно, чтобы убедиться, как долго работает на инференсе.

In [15]:
%%time

knn_pipeline = pipeline.Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier(n_neighbors=3)),
])

knn_pipeline.fit(X_train, y_train)

CPU times: user 5.89 s, sys: 257 ms, total: 6.14 s
Wall time: 6.15 s


In [16]:
%%time

roc_auc_score(y_test, knn_pipeline.predict(X_test))

CPU times: user 4min 31s, sys: 431 ms, total: 4min 32s
Wall time: 2min 52s


0.661787787654901

## Logistic Regression

Результат уже неплохой, но boosting/nn должны дать лучший результат

In [17]:
%%time

lr_pipeline = pipeline.Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(random_state=0, solver='liblinear')),
])

lr_pipeline.fit(X_train, y_train)

CPU times: user 6.78 s, sys: 328 ms, total: 7.11 s
Wall time: 7.32 s


In [18]:
%%time

roc_auc_score(y_test, lr_pipeline.predict(X_test))

CPU times: user 787 ms, sys: 31.9 ms, total: 819 ms
Wall time: 799 ms


0.9060271328714176

In [19]:
%%time

roc_auc_score(y_test, lr_pipeline.predict_proba(X_test)[:, 1])

CPU times: user 797 ms, sys: 1.04 ms, total: 798 ms
Wall time: 806 ms


0.9783539527497944

## LGBM Classifier

In [20]:
%%time

lgbm_pipeline = pipeline.Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LGBMClassifier(random_state=0)),
])

lgbm_pipeline.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 3400, number of negative: 7994
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.364711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 157401
[LightGBM] [Info] Number of data points in the train set: 11394, number of used features: 5695
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.298403 -> initscore=-0.854916
[LightGBM] [Info] Start training from score -0.854916
CPU times: user 20.5 s, sys: 217 ms, total: 20.7 s
Wall time: 23.9 s


In [21]:
%%time

roc_auc_score(y_test, lgbm_pipeline.predict(X_test))

CPU times: user 796 ms, sys: 2.99 ms, total: 799 ms
Wall time: 802 ms


0.9156754879985295

In [22]:
%%time

roc_auc_score(y_test, lgbm_pipeline.predict_proba(X_test)[:, 1])

CPU times: user 754 ms, sys: 3.01 ms, total: 757 ms
Wall time: 761 ms


0.9783444557342063

## Random Forest

Несмотря на низкую скорость обучения, дает хороший результат

In [23]:
%%time

rf_pipeline = pipeline.Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=0, n_estimators=400, n_jobs=-1)),
])

rf_pipeline.fit(X_train, y_train)

CPU times: user 1min 22s, sys: 563 ms, total: 1min 22s
Wall time: 1min


In [24]:
%%time

roc_auc_score(y_test, rf_pipeline.predict(X_test))

CPU times: user 2.57 s, sys: 44.8 ms, total: 2.61 s
Wall time: 1.76 s


0.9012417603177926

In [25]:
%%time

roc_auc_score(y_test, rf_pipeline.predict_proba(X_test)[:, 1])

CPU times: user 2.62 s, sys: 33.8 ms, total: 2.65 s
Wall time: 1.8 s


0.9814273095363313

## fastText

In [26]:
!pip install fasttext



In [27]:
df_train["fasttext_labels"] = df_train["text_type"].map({1: "__label__spam", 0: "__label__ham"})
df_train[["fasttext_labels", "text_clean"]].to_csv("train_spam_processed", index=False, header=False, sep=" ")

In [28]:
%%time

# fasttext_model = fasttext.train_supervised(input="train_spam_processed", lr=1.0, epoch=25)
fasttext_model = fasttext.train_supervised(input="train_spam_processed", lr=1.0, epoch=25, wordNgrams=3, seed=0)

CPU times: user 34 s, sys: 790 ms, total: 34.8 s
Wall time: 42.5 s


In [29]:
%%time

preds = []
for text in X_test.values:
    text[0] = text[0].replace("\n", " ")
    label, proba = fasttext_model.predict(text[0])
    preds.append(1 if label[0] == ("__label__spam") else 0)

roc_auc_score(y_test, preds)

CPU times: user 398 ms, sys: 10 µs, total: 398 ms
Wall time: 400 ms


0.9273260795196349

In [30]:
%%time

preds = []
for text in X_test.values:
    text[0] = text[0].replace("\n", " ")
    label, proba = fasttext_model.predict(text[0])
    preds.append(proba[0] if label[0] == ("__label__spam") else max(0, 1 - proba[0]))

roc_auc_score(y_test, preds)

CPU times: user 413 ms, sys: 1.03 ms, total: 414 ms
Wall time: 415 ms


0.9749846567033102

# Scoring

В зависимости от seed, Random Forest и fasttext показывают схожие результаты. В solution.csv представлен скоринг от fasttext.

In [31]:
df_test_blind = pd.read_csv("test_spam.csv")

In [32]:
df_test_blind["text_clean"] = df_test_blind["text"].apply(lambda x: preprocess_text(x))
# df_test_blind["text_clean"] = df_test_blind["text"]
df_test_blind["text_clean"].to_csv("blind_spam_processed", index=False, header=False)

In [33]:
!head blind_spam_processed

j jim whitehead ejw cse ucsc edu write j you open sourc the new compon you develop for this j project so the next person who come along won t have to j reimplement them right no need all those compon alreadi exist either in the java class librari or from the various java jar collect most of the class i use came from the jakarta project and apachexml but if it s ani consol my thread of them all togeth into a newswir server is gpl and avail on sourceforg gari lawrenc murphi garym teledyn com teledynam communic inc busi advantag through communiti softwar url comput are useless they can onli give you answer pablo picasso
origin messag from bitbitch magnesium net peopl are scream and shout over the polit figur becaus they cannot be heard in ani other way what are they illiter mute what s their problem if somebodi stop them from post web page or print newslett or talk on the phone or organ their own confer then that would be wrong i don t think free speech is a licens to speak direct at and 

In [34]:
%%time

preds = []
with open("blind_spam_processed", "r") as f:
    for text in f.readlines():
        label, proba = fasttext_model.predict(text.replace("\n", " "))
        preds.append(proba[0] if label[0] == ("__label__spam") else 1 - min(1, proba[0]))

CPU times: user 466 ms, sys: 3.86 ms, total: 470 ms
Wall time: 918 ms


In [35]:
df_test_blind["score"] = pd.Series(preds)

In [36]:
df_test_blind[["score", "text"]].to_csv("solution.csv", index=False)

In [37]:
!head solution.csv

score,text
0.006756722927093506,j jim whitehead ejw cse ucsc edu writes j you open sourced the new components you developed for this j project so the next person who comes along won t have to j reimplement them right no need all those components already exist either in the java class libraries or from the various java jar collections most of the classes i used came from the jakarta project and apachexml but if it s any consolation my threading of them all together into a newswire server is gpl and available on sourceforge gary lawrence murphy garym teledyn com teledynamics communications inc business advantage through community software url computers are useless they can only give you answers pablo picasso
0.009048283100128174,original message from bitbitch magnesium net people are screaming and shouting over the political figures because they cannot be heard in any other way what are they illiterate mute what s their problem if somebody stops them from posting web pages or printing ne