### 0. Imports

In [1]:
import gzip
import json
from collections import Counter
import re
import itertools
import joblib
from pathlib import Path
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### 1. Load the data

In [2]:
# Cleaning functionality if necessary
def clean_html_tags(text, repl=''):
    exclude = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
    html_tags = re.compile('\\\\.|&nbsp;|<strong>|<.{0,3}>')
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    tag_removal_regex = re.compile(r'(<!--.*?-->|<[^>]*>)(\\[nrt])')
    
    text = re.sub(html_tags, '', text)
    # text = re.sub(non_ascii, '', text) # Optional 
    text = re.sub(tag_removal_regex, '', text)
    # text = re.sub(str(exclude), '', text) # Optional
    return text 

In [3]:
data_file = Path('datasets/metadata-all-2022-05-24.jsonl.gz')
data = []
with gzip.open(data_file, 'rb') as fp:
    tag_removal_regex = re.compile(r'(<!--.*?-->|<[^>]*>)(<p>)(<strong>)(\\[nrt])')
    for ind, l in enumerate(fp):
        try:
            if ind%1000 == 0: # REMOVE ME
                r = json.loads(l)
                r = dict((k, v.lower() if type(v)=='str' else v) for k,v in r.items())
                data.append(r)
        except:
            print(f"There is a problem in:", ind)
            
df = pd.DataFrame(data)
df = df[['title','description','spam']] # Chosen features, it could be any combination of them
df.sample(10)

Unnamed: 0,title,description,spam
2168,A Century of Civil Engineering,,False
20,[FILMS VOIR] Le film LEGO 2 (2019) FILMS STREA...,"<p>CLICK THIS LINK TO WATCH &gt;&gt; <a href=""...",True
1705,FIG. 2 in Ciliated protozoa from a volcanic cr...,FIG. 2. Aerial photograph of the volcano crate...,False
2717,Figure 5 in The type specimens and type locali...,Figure 5. The nomenclature and geographic dist...,False
2626,Zur Fiehe'schen Reaktion,,False
353,The Medical Needs of Syria.,,False
2538,Zur Histologie der Nebenniere des Menschen,,False
410,Pulicaria dysenterica (L.) Bernh. (BR000001175...,"Belgium Herbarium image of <a href=""https://ww...",False
1077,FIGURES 5–8 in Four new species of Nemobiinae ...,FIGURES 5–8. Speonemobius sinensis sp. nov. 5....,False
2882,"Le Diable, tout le temps Film complet",<p>[WorldmoviesHD] ~ FiLm Comment regarder le ...,True


In [4]:
df['title'] = df['title'].map(lambda x: clean_html_tags(x))
df['description'] = df['description'].map(lambda x: clean_html_tags(x))

### 2. Train the model

In [42]:
import numpy as np
def split_train_test(data, labels, test_ratio = 0.2, random_state=7153):
    N = len(data)
    np.random.seed(random_state)
    ind_test = np.random.randint(N, size=int(N*test_ratio))
    ind_train = list(set(range(N)) - set(ind_test))

    # Train data, Test data, Train labels, Test labels
    return data[ind_train], data[ind_test], labels[ind_train], labels[ind_test]
    

In [30]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorize_layer = TextVectorization(
    standardize = 'lower_and_strip_punctuation',
    split       = 'whitespace',
    max_tokens  = 1000,
    output_mode ='tf-idf', 
    pad_to_max_tokens=False)

vectorize_layer.adapt(df['description'].values)
tfids = vectorize_layer(df['description'].values).numpy()
tfids.shape

(3036, 1000)

In [50]:
X_train, X_test, y_train, y_test = split_train_test(tfids, df.spam.to_numpy())

In [51]:
#import tensorflow_decision_forests as tfdf

model = tfdf.keras.RandomForestModel()
model.fit(x=X_train, y=y_train)

print(model.summary())
#n_estimators=100, n_jobs=4

Use /tmp/tmpwc5tpryl as temporary training directory
Reading training dataset...




Training dataset read in 0:00:09.715656. Found 2485 examples.
Training model...
Model trained in 0:00:02.723599
Compiling model...


[INFO kernel.cc:1176] Loading model from path /tmp/tmpwc5tpryl/model/ with prefix 16b9bb8ce5954400
[INFO abstract_model.cc:1246] Engine "RandomForestOptPred" built
[INFO kernel.cc:1022] Use fast generic engine






Model compiled.




Model: "random_forest_model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (1000):
	data:0.0
	data:0.1
	data:0.10
	data:0.100
	data:0.101
	data:0.102
	data:0.103
	data:0.104
	data:0.105
	data:0.106
	data:0.107
	data:0.108
	data:0.109
	data:0.11
	data:0.110
	data:0.111
	data:0.112
	data:0.113
	data:0.114
	data:0.115
	data:0.116
	data:0.117
	data:0.118
	data:0.119
	data:0.12
	data:0.120
	data:0.121
	data:0.122
	data:0.123
	data:0.124
	data:0.125
	data:0.126
	data:0.127
	data:0.128
	data:0.129
	data:0.13
	data:0.130
	data:0.131
	data:0.132
	data:0.133
	data:0.134
	data:0.135
	data:0.136
	data:0.137
	data:0.138
	data:0.139
	data:0.14
	data:0.140
	data:0.141
	data:0.142
	data:0.143
	data:0.144
	data:0.145


In [64]:
model(X_test)

<tf.Tensor: shape=(607, 1), dtype=float32, numpy=
array([[0.01333333],
       [0.        ],
       [0.00666667],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.01      ],
       [0.7633327 ],
       [0.        ],
       [0.99999917],
       [0.00333333],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.02      ],
       [0.02      ],
       [0.        ],
       [0.8899993 ],
       [0.        ],
       [0.03      ],
       [0.        ],
       [0.        ],
       [0.00333333],
       [0.        ],
       [0.03333334],
       [0.        ],
       [0.00666667],
       [0.85666597],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00333333],
       [0.00666667],
       [0.04666667],
     

### 3. Calculate accuracy

In [54]:
# Evaluate the model
model.compile(metrics=["accuracy", tf.keras.metrics.AUC(), 
                       tf.keras.metrics.Recall(), tf.keras.metrics.Precision() ])
metrics = model.evaluate(x=X_test, y=y_test)

print("Accuracy: {0:.4f}".format(metrics[1]))

Accuracy: 0.9868


### 4. Dump model

In [57]:
models_dir = Path('models')
joblib.dump(model, models_dir / '2022_06_28_tf_record_spam.pkl')



INFO:tensorflow:Assets written to: ram://fe82036b-460d-48e7-84ce-46179915613d/assets


INFO:tensorflow:Assets written to: ram://fe82036b-460d-48e7-84ce-46179915613d/assets


['models/2022_06_28_tf_record_spam.pkl']

In [58]:
model = joblib.load(models_dir / '2022_06_28_tf_record_spam.pkl')


[INFO kernel.cc:1176] Loading model from path ram://a35b6268-4d7b-4b87-9e75-82f343a8b341/assets/ with prefix 16b9bb8ce5954400
[INFO decision_forest.cc:639] Model loaded with 300 root(s), 21220 node(s), and 627 input feature(s).
[INFO abstract_model.cc:1246] Engine "RandomForestOptPred" built
[INFO kernel.cc:1022] Use fast generic engine


### 5. Compare accuracy of old models

In [None]:
## Previous versions are not found


models = [
    '2017_06_18_record_spam.pkl',
    '2020_06_23_record_spam.pkl',
]

for model_path in models:
    model = joblib.load(models_dir / model_path)
    y_pred = old_model.predict(X_test)
    acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
    c = Counter(acc)
    print(f'### Accuracy of model {model_path} for {data_file}')
    print(c)
    print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
    print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
    print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))

#### Results

##### Accuracy of model for data up to 06/03/2017

Counter({(False, False): 56495, (True, True): 1652, (False, True): 140, (True, False): 38})

- Spam->Spam: 0.9775
- Ham -> Ham: 0.9975<br/>
- Accuracy: 0.9969<br/>

##### Accuracy of model for data up to 06/04/2017

Counter({(False, False): 58746, (True, True): 1855, (True, False): 65, (False, True): 54})

- Spam->Spam: 0.9661
- Ham -> Ham: 0.9991
- Accuracy: 0.9980

##### Accuracy of model for data up to 17/05/2018

Counter({(False, False): 133426, (True, True): 2627, (True, False): 149, (False, True): 128})

- Spam->Spam: 0.9463
- Ham -> Ham: 0.9990
- Accuracy: 0.9980

##### Accuracy of model for data up to 04/11/2019

Counter({(False, False): 482380, (True, True): 3526, (False, True): 970, (True, False): 515})

- Spam->Spam: 0.8726
- Ham -> Ham: 0.9980
- Accuracy: 0.9970

##### Accuracy of model for data up to 04/11/2019 with RandomForest Model

Counter({(False, False): 483307, (True, True): 3580, (True, False): 461, (False, True): 43})

- Spam->Spam: 0.8859
- Ham -> Ham: 0.9999
- Accuracy: 0.9990

##### Accuracy of model for data up to 23/06/2020 with RandomForest Model

Counter({(False, False): 530887, (True, True): 4623, (True, False): 918, (False, True): 65})<br/>

- Spam->Spam: 0.8343<br/>
- Ham -> Ham: 0.9999<br/>
- Accuracy: 0.9982<br/>

##### Accuracy of model for data up to 23/06/2020 with RandomForest Model (n_estimators=100)

Counter({(False, False): 530898, (True, True): 4691, (True, False): 850, (False, True): 54})

- Spam->Spam: 0.8466
- Ham -> Ham: 0.9999
- Accuracy: 0.9983

##### Accuracy of model for data up to 23/06/2020 with ExtraTreesClassifier (n_estimators=100)

Counter({(False, False): 530872, (True, True): 4705, (True, False): 836, (False, True): 80})

- Spam->Spam: 0.8491
- Ham -> Ham: 0.9998
- Accuracy: 0.9983



In [None]:
### 6. Plot results

In [None]:
import plotly.graph_objects as go

class CurrentModel:
    
    def __init__(self, acc, spam_spam, ham_ham, name):
        self.acc = acc
        self.spam_spam = spam_spam
        self.ham_ham = ham_ham
        self.name = name
        
    def plot(self):
        x = ['Acc', 'Spam', 'Ham']
        y = [self.acc, self.spam_spam, self.ham_ham]
        fig = go.Figure()
        fig.add_trace(go.Histogram(
            x=x,
            y=y,
            name=name
        ))
        fig.show()

def plot_all(models):
    x = ['Acc', 'Spam', 'Ham']
    fig = go.Figure()
    for model in models:
        fig.add_trace(go.Bar(
            x=x,
            y=[model.acc, model.spam_spam, model.ham_ham],
            name=model.name
        ))
    fig.update_yaxes(range=[0.85, 1])
    fig.show()
    
model_06_03_2017 = CurrentModel(0.9969, 0.9975, 0.9991, 'data up to 06/03/2017')
model_06_04_2017 = CurrentModel(0.9980, 0.9661, 0.9975, 'data up to 06/04/2017')
model_17_05_2018 = CurrentModel(0.9980, 0.9463, 0.9990, 'data up to 17/05/2018')
model_04_11_2019 = CurrentModel(0.9990, 0.8726, 0.9980, 'data up to 04/11/2019')
model_04_11_2019_RandomForest = CurrentModel(0.9990, 0.8859, 0.9999, 'data up to 04/11/2019 <RandomForest>')

models = [
    model_06_03_2017,
    model_06_04_2017,
    model_17_05_2018,
    model_04_11_2019,
    model_04_11_2019_RandomForest
]

plot_all(models)