In [1]:
import os
from pathlib import Path
from typing import Iterable
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Fetch the NLTK resources used below: the tokenizer models ('punkt'),
# the WordNet data used by the lemmatizer, and the stop-word lists.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Shared, module-level helpers consumed by preprocess_document().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_document(doc):
    """Normalise a raw document into a space-separated string of lemmas.

    The text is tokenised with ``word_tokenize``. A token is kept only when
    its lower-cased form is not in ``stop_words`` and the token itself is
    purely alphanumeric; each kept token is lower-cased and lemmatised with
    the module-level ``lemmatizer``.

    Args:
        doc: The raw document text.

    Returns:
        The kept, lemmatised tokens joined by single spaces.
    """
    kept_lemmas = []
    for raw_token in word_tokenize(doc):
        lowered = raw_token.lower()
        # Guard clause: drop stop words and anything containing punctuation.
        if lowered in stop_words or not raw_token.isalnum():
            continue
        kept_lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept_lemmas)


def create_tf_idf_model(documents: Iterable[str]) -> TfidfVectorizer:
    """Build a ``TfidfVectorizer`` with default settings fitted on ``documents``.

    Args:
        documents: The texts used to fit the vectorizer.

    Returns:
        The fitted vectorizer.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return TfidfVectorizer().fit(documents)


def load_news_documents(dataset_path: "Path | str") -> pd.DataFrame:
    """Load a ``<dataset_path>/<class>/<document>`` corpus into a DataFrame.

    Every sub-directory of ``dataset_path`` is treated as one document class
    and every regular file inside it as one document of that class.

    Args:
        dataset_path: Root directory of the dataset, as a ``Path`` or a string.

    Returns:
        A DataFrame with one row per document and the columns
        ``document_name`` (file name), ``document_class`` (name of the
        containing directory) and ``document_text`` (file content decoded as
        ISO-8859-1). Rows are ordered by class name, then by file name.

    Raises:
        OSError: If ``dataset_path`` does not exist or cannot be read
            (e.g. ``FileNotFoundError``, ``NotADirectoryError``).
    """
    root = Path(dataset_path)
    records = []

    # Directory listing order is filesystem-dependent; sorting keeps the row
    # order -- and therefore any seeded split made downstream -- reproducible
    # across machines. Stray top-level files (README, .DS_Store, ...) are
    # skipped rather than being mistaken for class directories.
    class_dirs = sorted(
        (entry for entry in root.iterdir() if entry.is_dir()),
        key=lambda entry: entry.name,
    )
    for class_dir in class_dirs:
        # Only regular files are documents; nested directories are ignored.
        doc_paths = sorted(
            (entry for entry in class_dir.iterdir() if entry.is_file()),
            key=lambda entry: entry.name,
        )
        for doc_path in doc_paths:
            records.append({
                'document_name': doc_path.name,
                'document_class': class_dir.name,
                'document_text': doc_path.read_text(encoding='ISO-8859-1'),
            })

    # Explicit columns keep the schema stable even when no document is found.
    return pd.DataFrame(
        records,
        columns=['document_name', 'document_class', 'document_text'],
    )
        

[nltk_data] Downloading package punkt to /home/gustavo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gustavo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gustavo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the raw corpus and replace each document's text with its
# preprocessed form in a single chained expression.
dataset = load_news_documents('../data/20news-18828').assign(
    document_text=lambda frame: frame['document_text'].apply(preprocess_document)
)

In [3]:
# First cut: hold out 40 % of the documents, leaving about 60 % for training.
train_df, remaining_df = train_test_split(
    dataset,
    test_size=0.4,
    random_state=42,
)
# Second cut: of the held-out part, 75 % becomes the test set (about 30 % of
# all documents) and the remaining 25 % the validation set (about 10 %).
val_df, test_df = train_test_split(
    remaining_df,
    test_size=0.75,
    random_state=42,
)

In [4]:
# Drop the name of the full frame; the cells below only use the
# train/validation/test splits.
del dataset

In [5]:
# Fit the TF-IDF vectorizer on the training texts only, so validation and
# test documents do not influence the learned vocabulary or weights.
tfidf = create_tf_idf_model(train_df['document_text'])

In [6]:
# Vectorise each split with the fitted TF-IDF model and densify the sparse
# result into a plain ndarray.
# NOTE(review): dense matrices of this size can be very memory-hungry.
X_train, X_val, X_test = (
    tfidf.transform(split_df['document_text']).toarray()
    for split_df in (train_df, val_df, test_df)
)

In [7]:
# Target labels: the class column of each split, in the same row order as
# the corresponding feature matrix.
y_train, y_val, y_test = (
    split_df['document_class'] for split_df in (train_df, val_df, test_df)
)

In [None]:
# Train a Gaussian Naive Bayes classifier on the training features
# (fit() returns the estimator, so construction and fitting chain).
model_nb = GaussianNB().fit(X_train, y_train)

# Validation accuracy: share of validation documents whose predicted class
# matches the true class.
y_pred_nb = model_nb.predict(X_val)
n_correct = (y_pred_nb == y_val).sum()
accuracy_nb = n_correct / len(y_val)
print(accuracy_nb)