In [11]:
import os
from pathlib import Path
from typing import Iterable
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


class DocumentPreprocessor:
    def __init__(self, stop_words: Iterable[str] = None, lemmatizer: WordNetLemmatizer = None):
        self.stop_words = stop_words or set(stopwords.words('english'))
        self.lemmatizer = lemmatizer or WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return [self.preprocess_document(doc) for doc in X]

    def preprocess_document(self, doc):
        tokens = word_tokenize(doc)
        tokens = [self.lemmatizer.lemmatize(token.lower()) for token in tokens if
                  token.lower() not in self.stop_words and token.isalnum()]
        return ' '.join(tokens)

def load_news_documents(dataset_path: Path):
    document_names = []
    document_texts = []
    document_classes = []
    dataset_classnames = os.listdir(dataset_path)
    for doc_class in dataset_classnames:
        class_path = os.path.join(dataset_path, doc_class)
        file_names = os.listdir(class_path)
        for file_name in file_names:
            file_path = os.path.join(class_path, file_name)
            document_names.append(file_name)
            document_classes.append(doc_class)
            with open(file_path, 'r', encoding='ISO-8859-1') as f:
                document_texts.append(f.read())
    return pd.DataFrame(
        {'document_name': document_names, 'document_class': document_classes, 'document_text': document_texts})


[nltk_data] Downloading package punkt to /home/gustavo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gustavo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gustavo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
dataset = load_news_documents('../data/20news-18828')

In [13]:
train_df, remaining_df = train_test_split(dataset, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(remaining_df, test_size=0.75, random_state=42)

In [14]:
X_train = train_df['document_text']
X_val = val_df['document_text']
X_test = test_df['document_text']

In [15]:
y_train = train_df['document_class']
y_val = val_df['document_class']
y_test = test_df['document_class']

In [6]:
pipeline_c_lr = Pipeline([
    ('preprocessor', DocumentPreprocessor()),
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

pipeline_c_lr.fit(X_train, y_train)
pipeline_c_lr.score(X_val, y_val)

0.8645778013807753

In [8]:
pipeline_tfidf_lr = Pipeline([
    ('preprocessor', DocumentPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

pipeline_tfidf_lr.fit(X_train, y_train)
pipeline_tfidf_lr.score(X_val, y_val)

0.8815719596388741

In [16]:
transform_sparce = FunctionTransformer(lambda X: np.asarray(X.todense()))

In [8]:
pipeline_c_nb = Pipeline([
    ('preprocessor', DocumentPreprocessor()),
    ('vectorizer', CountVectorizer(max_features=10000)),
    ('to_dense', transform_sparce),
    ('classifier', GaussianNB())
])

pipeline_c_nb.fit(X_train, y_train)
pipeline_c_nb.score(X_val, y_val)

0.7514604354753054

In [9]:
pipeline_tfidf_nb = Pipeline([
    ('preprocessor', DocumentPreprocessor()),
    ('vectorizer', TfidfVectorizer(max_features=10000)),
    ('to_dense', transform_sparce),
    ('classifier', GaussianNB())
])

pipeline_tfidf_nb.fit(X_train, y_train)
pipeline_tfidf_nb.score(X_val, y_val)

0.7488050982474774

In [None]:


X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

param_grid = {
    'classifier__C': [0.1, 1, 10, 100]
}

pipeline_tfidf_lr = Pipeline([
    ('preprocessor', DocumentPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(solver='liblinear'))
])

kf = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline_tfidf_lr, param_grid, cv=kf, scoring='accuracy')
grid_search.fit(X_train_val, y_train_val)
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")