In [17]:
import os
from pathlib import Path
from typing import Iterable
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


class DocumentPreprocessor:
    def __init__(self, stop_words: Iterable[str] = None, lemmatizer: WordNetLemmatizer = None):
        self.stop_words = stop_words or set(stopwords.words('english'))
        self.lemmatizer = lemmatizer or WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return pd.Series([self.preprocess_document(doc) for doc in X])

    def preprocess_document(self, doc):
        tokens = word_tokenize(doc)
        tokens = [self.lemmatizer.lemmatize(token.lower()) for token in tokens if
                  token.lower() not in self.stop_words and token.isalnum()]
        return ' '.join(tokens)


def load_news_documents(dataset_path: Path):
    document_names = []
    document_texts = []
    document_classes = []
    dataset_classnames = os.listdir(dataset_path)
    for doc_class in dataset_classnames:
        class_path = os.path.join(dataset_path, doc_class)
        file_names = os.listdir(class_path)
        for file_name in file_names:
            file_path = os.path.join(class_path, file_name)
            document_names.append(file_name)
            document_classes.append(doc_class)
            with open(file_path, 'r', encoding='ISO-8859-1') as f:
                document_texts.append(f.read())
    return pd.DataFrame(
        {'document_name': document_names, 'document_class': document_classes, 'document_text': document_texts})


[nltk_data] Downloading package punkt to /home/gustavo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gustavo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gustavo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
import warnings

warnings.filterwarnings("ignore")


In [2]:
dataset = load_news_documents('../data/20news-18828')

In [3]:
train_df, remaining_df = train_test_split(dataset, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(remaining_df, test_size=0.75, random_state=42)

In [4]:
document_preprocessor = DocumentPreprocessor()

In [5]:
X_train = document_preprocessor.transform(train_df['document_text'])
X_val = document_preprocessor.transform(val_df['document_text'])
X_test = document_preprocessor.transform(test_df['document_text'])

In [6]:
y_train = train_df['document_class']
y_val = val_df['document_class']
y_test = test_df['document_class']

In [12]:
pipeline_c_lr = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

pipeline_c_lr.fit(X_train, y_train)
pipeline_c_lr.score(X_test, y_test)

0.8730748805098247

In [13]:
pipeline_tfidf_lr = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

pipeline_tfidf_lr.fit(X_train, y_train)
pipeline_tfidf_lr.score(X_test, y_test)

0.8849353867941229

In [9]:
transform_sparce = FunctionTransformer(lambda X: np.asarray(X.todense()))

In [16]:
pipeline_c_nb = Pipeline([
    ('vectorizer', CountVectorizer(max_features=10000)),
    ('to_dense', transform_sparce),
    ('classifier', GaussianNB())
])

pipeline_c_nb.fit(X_train, y_train)
pipeline_c_nb.score(X_test, y_test)

0.7459727385377943

In [17]:
pipeline_tfidf_nb = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000)),
    ('to_dense', transform_sparce),
    ('classifier', GaussianNB())
])

pipeline_tfidf_nb.fit(X_train, y_train)
pipeline_tfidf_nb.score(X_test, y_test)

0.7544698176668437

In [16]:


X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

param_grid = {
    'classifier__C': [0.1, 1, 10, 100]
}

pipeline_tfidf_lr = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(solver='liblinear'))
])

kf = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline_tfidf_lr, param_grid, cv=kf, scoring='accuracy')
grid_search.fit(X_train_val, y_train_val)
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

Best parameters found: {'classifier__C': 100}
Best cross-validation score: 0.915851195352476


In [15]:
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

param_grid = {
    'classifier__var_smoothing': [1e-10, 1e-9, 1e-8]
}

pipeline_tfidf_nb = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000)),
    ('to_dense', transform_sparce),
    ('classifier', GaussianNB())
])

kf_nb = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search_nb = GridSearchCV(pipeline_tfidf_nb, param_grid, cv=kf_nb, scoring='accuracy')
grid_search_nb.fit(X_train_val, y_train_val)
print(f"Best parameters found: {grid_search_nb.best_params_}")
print(f"Best cross-validation score: {grid_search_nb.best_score_}")

Best parameters found: {'classifier__var_smoothing': 1e-08}
Best cross-validation score: 0.7517262873846502


In [25]:
def evaluate_model(model, X_test, y_test):
    
    y_pred = model.predict(X_test)
    
    precision_macro = precision_score(y_test, y_pred, average='macro')
    recall_macro = recall_score(y_test, y_pred, average='macro')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    
    precision_micro = precision_score(y_test, y_pred, average='micro')
    recall_micro = recall_score(y_test, y_pred, average='micro')
    f1_micro = f1_score(y_test, y_pred, average='micro')
    
    print(f"Precision (Macro): {precision_macro}")
    print(f"Recall (Macro): {recall_macro}")
    print(f"F1 Score (Macro): {f1_macro}")
    print(f"Precision (Micro): {precision_micro}")
    print(f"Recall (Micro): {recall_micro}")
    print(f"F1 Score (Micro): {f1_micro}")


In [26]:
evaluate_model(grid_search.best_estimator_, X_test, y_test)

Precision (Macro): 0.919615441822286
Recall (Macro): 0.9168014513574156
F1 Score (Macro): 0.9178375068474682
Precision (Micro): 0.9201628606833068
Recall (Micro): 0.9201628606833068
F1 Score (Micro): 0.9201628606833068


In [27]:
evaluate_model(grid_search_nb.best_estimator_, X_test, y_test)

Precision (Macro): 0.762674604856379
Recall (Macro): 0.7648273062076675
F1 Score (Macro): 0.7611843493506316
Precision (Micro): 0.7663303239511418
Recall (Micro): 0.7663303239511418
F1 Score (Micro): 0.7663303239511418


sklearn.feature_extraction.text.TfidfVectorizer