In [1]:
import re
import string

import pandas as pd

from time import perf_counter 

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Matches http/ftp/https URLs: scheme, a dotted host name, and an optional
# path/query tail whose last character may not be '.', ',' or ':' (so trailing
# sentence punctuation is left in the text).
re_url = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')

# Matches e-mail addresses: a dot-atom or double-quoted local part, '@', then a
# dotted domain or a bracketed address literal. Only lowercase letters are
# listed and no IGNORECASE flag is set, so the text must be lower-cased first.
#
# The pattern MUST be a raw string. In a normal string Python expands the
# ``\xNN`` escapes itself, so ``\x5d`` becomes a literal ']' that closes the
# character class early, and ``\\[`` collapses to an escaped '[' instead of
# "backslash followed by a class". ``\.`` and ``\[`` are also invalid string
# escapes that newer Python versions warn about.
re_email = re.compile(r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')

## Read dataset

In [2]:
# Load the raw corpus from a semicolon-separated CSV; later cells read its
# 'text' and 'target' columns.
df_full = pd.read_csv('../data/processed/20newsgroup_raw.csv', sep=';')

## Preprocessing

### Clean Header

In [3]:
%%time

def clean_header(text):
    """Strip common newsgroup header lines from a raw post.

    Each of the following is removed wherever it occurs in ``text``:

    * ``From: <word> <address>`` followed by a newline
    * ``Subject: ...`` lines
    * ``Archive-name: ...`` / ``archive-name: ...`` lines, including an
      optional prefix such as ``Alt-atheism-``
    * ``Last-modified: ...`` lines
    * ``Version: ...`` lines

    Parameters
    ----------
    text : str
        Raw post content.

    Returns
    -------
    str
        ``text`` with the matched header lines removed.
    """
    text = re.sub(r'(From:\s+\w+\s+<[^>]+>)\n', '', text)
    text = re.sub(r'(Subject:[^\n]+\n)', '', text)
    # ``[Aa]`` instead of ``[A|a]``: inside a character class ``|`` is a
    # literal pipe, so the old pattern also accepted "|rchive-name:".
    # NOTE(review): the optional prefix class contains ``\s``, so it can also
    # swallow whitespace-separated words directly before the header name.
    text = re.sub(r'(([\sA-Za-z0-9\-]+)?[Aa]rchive-name:[^\n]+\n)', '', text)
    text = re.sub(r'(Last-modified:[^\n]+\n)', '', text)
    text = re.sub(r'(Version:[^\n]+\n)', '', text)

    return text


# Overwrites the 'text' column in place with the header-stripped version, so
# re-running this cell operates on already-cleaned text.
df_full['text'] = df_full['text'].apply(clean_header)


Wall time: 29.3 s


### Clean text

https://nlp.stanford.edu/IR-book/html/htmledition/normalization-equivalence-classing-of-terms-1.html

In [4]:
%%time

def clean_text(text):
    """Normalise a document to lowercase words separated by single spaces.

    The text is lower-cased and stripped, then the following substitutions run
    in order: URLs removed, e-mail addresses removed, ASCII punctuation
    removed, digit runs replaced by a space, whitespace runs collapsed to one
    space.
    """
    normalized = text.lower().strip()

    # (pattern, replacement) pairs; the order matters because URLs and e-mail
    # addresses must be matched before their punctuation is removed.
    substitutions = (
        (re_url, ''),
        (re_email, ''),
        (f'[{re.escape(string.punctuation)}]', ''),
        (r'(\d+)', ' '),
        (r'(\s+)', ' '),
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    return normalized

# Overwrites the 'text' column in place with the normalised text.
df_full['text'] = df_full['text'].apply(clean_text)

Wall time: 10.2 s


### Tokenization

https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html

In [5]:
%%time

# Tokenise each document with NLTK and store the tokens back as a single
# space-separated string, so the column stays a plain string column.
df_full['text'] = df_full['text'].apply(
    lambda document: ' '.join(word_tokenize(document))
)

Wall time: 15.5 s


### Remove stop words

https://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html

In [6]:
%%time

# A set gives constant-time membership checks; the list returned by
# stopwords.words() would be scanned linearly for every token in the corpus.
stop_words = set(stopwords.words('english'))

# Split each document on whitespace, drop tokens found in the stop-word set,
# and join the remaining tokens back into a space-separated string.
df_full['text'] = df_full['text'].str.split() \
    .apply(lambda x: [word for word in x if word not in stop_words]) \
    .apply(lambda x: ' '.join(x))

Wall time: 7.1 s


### Stemming

https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

In [7]:
%%time

stemmer = PorterStemmer()

# Stem every token and keep each distinct stem once per document.
# dict.fromkeys de-duplicates like set() did, but keeps first-occurrence
# order, so the stored text is identical from run to run (iteration order of a
# set of strings depends on per-process hash randomisation).
# NOTE(review): de-duplicating discards within-document term counts, so the
# later TF-IDF step sees every term at most once per document - confirm this
# is intended.
df_full['text'] = df_full['text'].str.split() \
    .apply(lambda x: dict.fromkeys(stemmer.stem(word) for word in x)) \
    .apply(lambda x: ' '.join(x))

Wall time: 49.6 s


### Split into train and test sets

#### Check class balance

In [8]:
# Number of documents per newsgroup label, to check how balanced the classes are.
df_full['target'].value_counts()

rec.sport.hockey            999
soc.religion.christian      997
rec.motorcycles             994
rec.sport.baseball          994
sci.crypt                   991
rec.autos                   990
sci.med                     990
sci.space                   987
comp.os.ms-windows.misc     985
comp.sys.ibm.pc.hardware    982
sci.electronics             981
comp.windows.x              980
comp.graphics               973
misc.forsale                972
comp.sys.mac.hardware       961
talk.politics.mideast       940
talk.politics.guns          910
alt.atheism                 799
talk.politics.misc          775
talk.religion.misc          628
Name: target, dtype: int64

In [9]:
# Features are the preprocessed text strings; labels are the newsgroup names.
X = df_full['text']
y = df_full['target']

# 80/20 split, stratified on the label so class proportions are preserved;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45, stratify=y)
# Optional validation split (disabled): 0.25 of the 80% training part = 20% of the data.
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=45, stratify=y_train)

### TF-IDF (Term frequency-inverse document frequency)

https://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html

In [10]:
# min_df=2 ignores terms found in fewer than 2 documents; max_df=0.4 ignores
# terms found in more than 40% of documents.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.4)

# NOTE: despite its name, `tfidf_matrix` is the fitted vectorizer (fit()
# returns the estimator itself), not a matrix. It is fitted on the training
# texts only, so the vocabulary and IDF weights do not see the test set.
tfidf_matrix = vectorizer.fit(X_train)

# Sparse TF-IDF matrices for the train and test texts.
X_train_tfidf = tfidf_matrix.transform(X_train)
X_test_tfidf = tfidf_matrix.transform(X_test)

### Baseline

In [11]:
# Baseline classifier. The sparse TF-IDF matrices are converted to dense
# arrays with .toarray() before being passed to GaussianNB.
model = GaussianNB()
model.fit(X_train_tfidf.toarray(), y_train)

# Predicted labels for the held-out test set.
y_pred = model.predict(X_test_tfidf.toarray())

In [12]:
# Fraction of test documents whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.8157195963887414

### Save preprocessing result

In [13]:
# Vectorise the whole corpus (train and test rows) with the vectorizer fitted
# on the training texts and store it as a dense DataFrame, one column per term.
df_preprocessing = pd.DataFrame(tfidf_matrix.transform(X).toarray())
# Assignment aligns on index: the new frame has a default 0..n-1 index, so this
# assumes `y` (taken from df_full) has the same default index - TODO confirm.
df_preprocessing['target'] = y

In [14]:
# %%time
# df_preprocessing.to_csv('../data/processed/newsgroup_vectorized.csv', index=False)

## Model

In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

def evaluate_classifier(model_name, start_time, end_time, y_truth, y_pred):
    """Build one summary record for a fitted classifier.

    Parameters
    ----------
    model_name : label stored under the 'model' key.
    start_time, end_time : timer readings; their difference is reported as
        'time', rounded to 3 decimals.
    y_truth, y_pred : true and predicted labels passed to accuracy_score.

    Returns
    -------
    dict
        Keys 'model', 'time' and 'accuracy' (accuracy rounded to 3 decimals).
    """
    score = accuracy_score(y_truth, y_pred)
    elapsed = end_time - start_time

    record = dict(model=model_name)
    record['time'] = round(elapsed, 3)
    record['accuracy'] = round(score, 3)
    return record

### Split dataset into train, test and validation sets

In [16]:
# Feature matrix is every TF-IDF column; pass the column to drop as a list
# (a set literal is unordered and not the documented list-like form).
X = df_preprocessing.drop(columns=['target'])
y = df_preprocessing['target']

# Same test_size, random_state and stratification as the split made on the raw
# text before the vectorizer was fitted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45, stratify=y)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=45, stratify=y_train)

### Training

In [17]:
%%time

# Candidate classifiers keyed by display name. The tree and the MLP use
# randomness internally, so they get a fixed random_state (the same seed as
# the train/test split) to make the comparison reproducible.
model_relation = {
    'GaussianNB' : GaussianNB(),
    'DecisionTree' : DecisionTreeClassifier(random_state=45),
    'KNN' : KNeighborsClassifier(),
    'MLPClassifier' : MLPClassifier(random_state=45)
}

# One metrics record per model; the reported time covers fit() only -
# prediction on the test set happens after the timer has stopped.
list_metrics = []
for model_name, model in model_relation.items():
    start_time = perf_counter()

    model.fit(X_train, y_train)

    end_time = perf_counter()

    list_metrics.append(evaluate_classifier(model_name, start_time, end_time, y_test, model.predict(X_test)))

Wall time: 1h 53min 10s


### Metrics

In [18]:
# One row per model, indexed by model name, best accuracy first.
pd.DataFrame(list_metrics).set_index('model').sort_values('accuracy', ascending=False)

Unnamed: 0_level_0,time,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1
MLPClassifier,2508.189,0.905
GaussianNB,27.213,0.816
DecisionTree,582.118,0.603
KNN,60.337,0.359
