In [1]:
import pandas as pd
import numpy as np
import os
import random
import re
import nltk
import joblib

from tqdm.notebook import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn import feature_extraction
from sklearn import metrics

In [2]:
config = {
    "seed": 2021,
    "num_folds": 5,
    "max_features": 5000,
    "print_report": True
}

In [3]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

set_random_seed(config['seed'])

In [4]:
def create_folds(data):
    data["kfold"] = -1
    data = data.sample(frac=1, random_state=config['seed']).reset_index(drop=True)
    
    y = data.target.values

    kf = model_selection.StratifiedKFold(n_splits=config['num_folds'], random_state=config['seed'], shuffle=True)
    for f, (t_, v_) in enumerate(kf.split(X=data, y=y)):
        data.loc[v_, 'kfold'] = f  
    
    return data

In [5]:
df = pd.read_csv("../input/smsspamcollection/spam.csv", encoding='latin-1')

df = df.rename(columns={"v1": "target", "v2": "message_text"})
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df.shape

(5572, 2)

In [6]:
df.head()

Unnamed: 0,target,message_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.message_text.iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [8]:
def clean_text(text):
    
    # convert to lower
    text = text.lower()
    
    # only keep alpha-numeric characters
    text_alpha_num = re.sub("[^0-9a-z]", " ", text)
    
    # remove stopwords and use stemming
    stemmer = nltk.stem.PorterStemmer()
    stopwords_list = nltk.corpus.stopwords.words("english")
    
    word_list = nltk.tokenize.word_tokenize(text_alpha_num)
    words = [stemmer.stem(y) for y in word_list if y]
    
    text_clean = " ".join(words)
    
    return text_clean

In [9]:
df['message_text'] = df['message_text'].progress_apply(clean_text)

target_dict = {
    "ham": 0,
    "spam": 1
}

df['target'] = df['target'].map(target_dict)

  0%|          | 0/5572 [00:00<?, ?it/s]

In [10]:
df.head()

Unnamed: 0,target,message_text
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
2,1,free entri in 2 a wkli comp to win fa cup fina...
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i don t think he goe to usf he live around...


In [11]:
df.message_text.iloc[0]

'go until jurong point crazi avail onli in bugi n great world la e buffet cine there got amor wat'

In [12]:
# split data into train and test

train, test = model_selection.train_test_split(df, test_size=0.2, shuffle=True, random_state=config['seed'], stratify=df['target'])

train.shape, test.shape

((4457, 2), (1115, 2))

In [13]:
# reset train and test index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [14]:
def vectorize(train_text, test_text):
    
    vectorizer = feature_extraction.text.TfidfVectorizer(max_features=config['max_features'], ngram_range=(1, 2))
    
    train_vectors = vectorizer.fit_transform(train_text).toarray()
    test_vectors = vectorizer.transform(test_text).toarray()
    
    return train_vectors, test_vectors

In [15]:
import lightgbm as lgbm
import xgboost as xgb
from sklearn import tree, ensemble

model_dispatcher = {
    "lgbm": lgbm.LGBMClassifier(random_state=config['seed']),
    "xgboost": xgb.XGBClassifier(verbose=0),
    "dt": tree.DecisionTreeClassifier(max_depth=12),
    "rf": ensemble.RandomForestClassifier()
}

In [16]:
def run(train, fold, model_name):
    
    train_data = train[train['kfold'] != fold].reset_index(drop=True)
    valid_data = train[train['kfold'] == fold].reset_index(drop=True)
    
    x_train = train_data.drop(['kfold', 'target', 'message_text'], axis=1)
    y_train = train_data['target'].values
    
    x_valid = valid_data.drop(['kfold', 'target', 'message_text'], axis=1)
    y_valid = valid_data['target'].values
    
    model = model_dispatcher[model_name]
    
    model.fit(x_train, y_train)
    
    y_preds = model.predict(x_valid)
    
    score = metrics.f1_score(y_valid, y_preds, average='macro')
    
    print(f"Fold: {fold}, f1_score: {round(score, 5)}")
    
    # save model
    joblib.dump("", f"{model_name}_model_{fold}.bin")
    
    if config['print_report']:
        print(metrics.classification_report(y_valid, y_preds))
    
    return score

In [17]:
# vectorize data

train_text = train.message_text.tolist()
test_text = test.message_text.tolist()

train_vectors, test_vectors = vectorize(train_text, test_text)
print(train_vectors.shape)

train_vectors = pd.DataFrame(train_vectors)
train_vectors.columns = [f"tfidf_{i}" for i in range(train_vectors.shape[1])]

test_vectors = pd.DataFrame(test_vectors)
test_vectors.columns = [f"tfidf_{i}" for i in range(test_vectors.shape[1])]
print(train.shape)
train = pd.concat([train, train_vectors], axis=1)
test = pd.concat([test, test_vectors], axis=1)

(4457, 5000)
(4457, 2)


In [18]:
train.head()

Unnamed: 0,target,message_text,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,...,tfidf_4990,tfidf_4991,tfidf_4992,tfidf_4993,tfidf_4994,tfidf_4995,tfidf_4996,tfidf_4997,tfidf_4998,tfidf_4999
0,1,hot live fantasi call now 08707509020 just 20p...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,hello no news on job they are make me wait a f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,that mean get the door,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,uncl g just check up on you do have a reward m...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,good morn my love i go to sleep now and wish y...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# create_folds
train_folds = create_folds(train)

train_folds.kfold.value_counts()

0    892
1    892
3    891
4    891
2    891
Name: kfold, dtype: int64

In [20]:
train_folds.head()

Unnamed: 0,target,message_text,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,...,tfidf_4991,tfidf_4992,tfidf_4993,tfidf_4994,tfidf_4995,tfidf_4996,tfidf_4997,tfidf_4998,tfidf_4999,kfold
0,0,nvm it s ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,0,cool i ll text you in a few,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0,well i wasn t avail as i washob nob with last ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,1,congrat 2 mobil 3g videophon r your call 09063...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,uncl abbey happi new year abiola,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [21]:
scores = []
for fold in range(config['num_folds']):
    score = run(train_folds, fold=fold, model_name='lgbm')
    scores.append(score)
print(f"CV score: {round(np.mean(scores), 5)}")

Fold: 0, f1_score: 0.97049
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       772
           1       0.97      0.93      0.95       120

    accuracy                           0.99       892
   macro avg       0.98      0.96      0.97       892
weighted avg       0.99      0.99      0.99       892

Fold: 1, f1_score: 0.97504
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       772
           1       1.00      0.92      0.96       120

    accuracy                           0.99       892
   macro avg       0.99      0.96      0.98       892
weighted avg       0.99      0.99      0.99       892

Fold: 2, f1_score: 0.95915
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       772
           1       0.99      0.87      0.93       119

    accuracy                           0.98       891
   macro avg       0.99      0.94      0.96    