In [6]:
import os, re
from pathlib import Path
from typing import Iterable

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

import warnings

# Silence every warning for the rest of the session. This is a blanket filter,
# so warnings raised by the libraries imported above are hidden as well.
warnings.filterwarnings("ignore")

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (word_tokenize), the 'wordnet' corpus (WordNetLemmatizer) and the
# 'stopwords' corpus (English stop-word list).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
# Everything that gets stripped from a raw 20 Newsgroups message, written as
# one verbose alternation. It has to be used with re.VERBOSE | re.MULTILINE:
# VERBOSE makes the layout whitespace and the '#' notes inside the pattern
# inert, MULTILINE lets '^' and '$' anchor on every line.
#
# * The caller substitutes every match with '', so header lines are removed
#   entirely (the Subject text is matched too and therefore dropped).
# * Alternatives are tried in order at each position, so the line-anchored
#   rules are listed before the catch-all character class at the end.
# * The quote-marker rule only swallows spaces/tabs, never a line break, so it
#   cannot glue a quoted line onto the previous one and hide it from the
#   line-anchored header rules.
twenty_n_regex = r'''
^From:.*?\n|                      # Drop the whole line that starts with 'From:'
^Subject:(?:\s*Re:\s*)?(.*)\n|    # Drop the whole 'Subject:' line, subject text included (with or without a leading 'Re:')
.*Date:.*?\n|                     # Drop any line that contains 'Date:'
^Archive-name:.*?\n|              # Drop the whole line that starts with 'Archive-name:'
^Alt-atheism-archive-name:.*?\n|  # Drop the whole line that starts with 'Alt-atheism-archive-name:'
^Last-modified:.*?\n|             # Drop the whole line that starts with 'Last-modified:'
^Version:.*?\n|                   # Drop the whole line that starts with 'Version:'
^.*@.*?\n|                        # Drop any line that contains '@'
In\sarticle.*?writes:\n|          # Drop from 'In article' through 'writes:' when both sit on the same line
.*Newsgroups:.*?\n|               # Drop any line that contains 'Newsgroups:'
^>+|                              # Drop quote markers '>' at the start of a line
[ \t]*>+|                         # Drop '>' markers plus the spaces or tabs in front of them
^-+$|                             # Empty out lines made only of '-'
^=+$|                             # Empty out lines made only of '='
[^a-zA-Z0-9\s.,]                  # Drop any other character that is not a letter, digit, whitespace, '.' or ','
'''

class DocumentPreprocessor:
    """Tokenise, filter and lemmatise raw documents.

    Exposes the ``fit`` / ``transform`` pair so it can be used like a
    stateless transformer: ``fit`` learns nothing, ``transform`` maps every
    document to a space-joined string of cleaned tokens.

    Parameters
    ----------
    stop_words:
        Words to discard (compared against the lower-cased token). ``None``
        selects NLTK's English stop-word list; an empty iterable disables
        stop-word filtering.
    lemmatizer:
        Object with a ``lemmatize(str) -> str`` method. ``None`` selects a
        fresh ``WordNetLemmatizer``.
    """

    def __init__(self, stop_words: Iterable[str] = None, lemmatizer: WordNetLemmatizer = None):
        # Test against None instead of truthiness: an explicitly empty
        # collection means "filter nothing" and must not fall back to the
        # English default.
        if stop_words is None:
            stop_words = stopwords.words('english')
        # Materialise into a set: membership tests are constant-time and a
        # one-shot iterator is not exhausted by the first lookup.
        self.stop_words = set(stop_words)
        self.lemmatizer = WordNetLemmatizer() if lemmatizer is None else lemmatizer

    def fit(self, X, y=None):
        """No-op; present only for transformer-style call compatibility."""
        return self

    def transform(self, X, y=None):
        """Preprocess every document in ``X``.

        Returns a ``pandas.Series`` of cleaned strings in the same order as
        ``X``. The Series gets a fresh 0..n-1 index, so it lines up with the
        input by position, not by the input's index labels.
        """
        return pd.Series([self.preprocess_document(doc) for doc in X])

    def preprocess_document(self, doc):
        """Return ``doc`` as lower-cased, lemmatised tokens joined by spaces.

        A token is kept only if it is purely alphanumeric and its lower-cased
        form is not a stop word.
        """
        kept = []
        for token in word_tokenize(doc):
            lowered = token.lower()
            if token.isalnum() and lowered not in self.stop_words:
                kept.append(self.lemmatizer.lemmatize(lowered))
        return ' '.join(kept)

def load_news_documents(dataset_path: Path):
    """Load a corpus laid out as one sub-directory per class into a DataFrame.

    Every regular file inside every sub-directory of ``dataset_path`` is read
    as ISO-8859-1 text, cleaned with ``twenty_n_regex`` (applied with
    ``re.VERBOSE | re.MULTILINE``) and returned as one row.

    Parameters
    ----------
    dataset_path:
        Root directory of the corpus; a ``Path`` or a plain string.

    Returns
    -------
    pandas.DataFrame
        Columns ``document_name`` (file name), ``document_class`` (name of the
        containing sub-directory) and ``document_text`` (cleaned text). Rows
        are ordered by class name, then file name. File names are not
        guaranteed to be unique across classes.
    """
    root = Path(dataset_path)
    records = []
    # Directory listing order is unspecified, so sort explicitly: row order
    # (and any seeded split made from it) is then the same on every machine.
    # Entries that are not directories (stray files next to the classes) are
    # skipped instead of crashing the listing below.
    for class_dir in sorted(entry for entry in root.iterdir() if entry.is_dir()):
        for file_path in sorted(entry for entry in class_dir.iterdir() if entry.is_file()):
            text = file_path.read_text(encoding='ISO-8859-1')
            cleaned_text = re.sub(twenty_n_regex, '', text, flags=re.VERBOSE | re.MULTILINE)
            # NOTE(review): this removes the letters "rh" wherever they occur,
            # including inside longer words. Kept as-is because the intent is
            # not visible here; confirm whether a whole-token removal was meant.
            cleaned_text = cleaned_text.replace("rh", "")
            records.append((file_path.name, class_dir.name, cleaned_text))
    return pd.DataFrame(records, columns=['document_name', 'document_class', 'document_text'])
In [57]:
# Read and clean the whole corpus; the path is relative to the notebook's
# working directory and points at the folder holding one directory per class.
dataset = load_news_documents('data/20news-18828/20news-18828')

In [58]:
# Show the loaded frame (pandas abbreviates the middle rows in the display).
dataset

Unnamed: 0,document_name,document_class,document_text
0,49960,alt.atheism,\n\n Atheist Reso...
1,51060,alt.atheism,\n\nBEGIN PGP SIGNED MESSAGE\n\n ...
2,51119,alt.atheism,"\n \nWell, John has a quite different, not nec..."
3,51120,alt.atheism,"\n Recently, RAs have been ordered and no..."
4,51121,alt.atheism,"\n\n 1 HOWEVER, I hate economic terrorism and ..."
...,...,...,...
18823,84564,talk.religion.misc,\n\n I wasnt sure if this was the right ne...
18824,84565,talk.religion.misc,"\n Probably not. But then, I dont pack heavy..."
18825,84568,talk.religion.misc,\nIf you would like to understand better the s...
18826,84569,talk.religion.misc,\n\n\nThe danger of anticult groups is that wh...


In [59]:
# document_name is only the file name, so the same name can show up under
# several classes; this lookup returns one row per class that contains it.
dataset[dataset.document_name == "54261"]

Unnamed: 0,document_name,document_class,document_text
790,54261,alt.atheism,\n\n Organization University of Southern Queen...
10450,54261,rec.sport.hockey,\n\n\tESPN through a fortunate rainout of a ba...
12479,54261,sci.electronics,\n Is there a readily available solvent that d...
15786,54261,talk.politics.guns,\n\n\n The issue has never been whether tanks ...


In [60]:
# Print the cleaned text of the first row named "54261" to eyeball what the
# regex clean-up left behind.
print(dataset[dataset.document_name == "54261"]["document_text"].iloc[0])



 Organization University of Southern Queensland


	First I want to start right out and say that Im a Christian.  It 

 I know I shouldnt get involved, but...   

 bit deleted

	The book says that Jesus was either a liar, or he was crazy  a 
modern day Koresh or he was actually who he said he was.
rest of rant deleted

This is a standard argument for fundies.  Can you spot the falicy The
statement is arguing from the assumption that Jesus actually existed.  So far,
they have not been able to offer real proof of that existance.  Most of them
try it using the very flawed writings of Josh McDowell and others to prove
it, but those writers use VERY flawed sources.  If they are real sources at
all, some are not.  When will they ever learn to do real research, instead of
believing the drivel sold in the Christian bookstores.

 Righto, DAN, try this one with your Cornflakes...

 The book says that Muhammad was either a liar, or he was
 crazy  a  modern day Mad Mahdi or he was actually who he

In [61]:
# Same inspection for the first row named "82763".
print(dataset[dataset.document_name == "82763"]["document_text"].iloc[0])



T.O. Radzykewycz writes

   666, the file permission of the beast.
 
  Sorry, but the file permission of the beast is 600.
  
  And the file permission of the home directory of the
  beast is 700.
 
 Hey, radzy, it must depend on your systems access policy.
 I get
 	 ls lg usrusers
 	total 3
 	drwxrwxrwx 22 beast    system       1536 Jan 01  1970 beast
 	drwxrxx 32 boylan   users        2048 Mar 31 0908 boylan
 	drwxrxrx  2 guest    users         512 Sep 18  1992 guest
 	 su
 	Password
 	root  su beast
 	beast  umask
 	111
 	beast  D
 	root  D
 	 
 
 Just a minute....
 
 	 grep beast etcpasswd
 	beastk5tUk76RAUogQ4970Not Walt Disneyusrusersbeast
 	 mv usrusersbeast.profile usrusersbeast.profile,
 	 echo umask 077  usrusersbeast.profile
 	 cat  usrusersbeast.profile
 	chmod 700 usrusersbeast
 	mv .mailrc .mailrc,
 	mv .mailrc, .mailrc
 	mv usrusersbeast.profile, usrusersbeast.profile
 	D
 	 chmod 777 usrusersbeast.profile
 	 cat usrusersbeast.profile,  usrusersbeast.profile
 
 waits a

In [12]:
# Two-stage split with a fixed seed: first hold out 40% of the rows, then cut
# that hold-out 25% / 75%. Overall this is roughly 60% train, 10% validation
# and 30% test. No stratify= is passed, so class proportions are not forced
# to match between the three parts.
train_df, remaining_df = train_test_split(dataset, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(remaining_df, test_size=0.75, random_state=42)

In [13]:
# Preprocessor with its defaults: NLTK English stop words and a WordNetLemmatizer.
document_preprocessor = DocumentPreprocessor()

In [14]:
# Preprocess each split once, up front, so the pipelines below only vectorize.
# transform() builds a new Series with a default 0..n-1 index: X_* match their
# source frames by row position, not by index label.
X_train = document_preprocessor.transform(train_df['document_text'])
X_val = document_preprocessor.transform(val_df['document_text'])
X_test = document_preprocessor.transform(test_df['document_text'])

In [15]:
# Class labels for each split. These keep the index labels of the shuffled
# source frames, so they pair with X_* by position only.
y_train = train_df['document_class']
y_val = val_df['document_class']
y_test = test_df['document_class']

In [16]:
# Baseline: raw term counts + logistic regression, both with default settings.
pipeline_c_lr = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

# NOTE(review): the accuracy below is measured on the test split; X_val / y_val
# exist and would be the usual choice for comparing candidate models.
pipeline_c_lr.fit(X_train, y_train)
pipeline_c_lr.score(X_test, y_test)

0.8120021242697822

In [17]:
# Baseline: TF-IDF weights + logistic regression, both with default settings.
pipeline_tfidf_lr = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

# NOTE(review): scored on the test split, like the count-based baseline.
pipeline_tfidf_lr.fit(X_train, y_train)
pipeline_tfidf_lr.score(X_test, y_test)

0.8452823508585591

In [18]:
def _sparse_to_dense(X):
    """Return the sparse matrix ``X`` as a dense ``numpy.ndarray``."""
    return np.asarray(X.todense())


# Pipeline step that densifies the vectorizer output; it sits between the
# vectorizer and GaussianNB in the naive Bayes pipelines.
transform_sparce = FunctionTransformer(_sparse_to_dense)

In [19]:
# Baseline: term counts + Gaussian naive Bayes. The vocabulary is capped at
# 10000 features because the matrix is converted to a dense array before it
# reaches the classifier.
pipeline_c_nb = Pipeline([
    ('vectorizer', CountVectorizer(max_features=10000)),
    ('to_dense', transform_sparce),
    ('classifier', GaussianNB())
])

# NOTE(review): scored on the test split, like the other baselines.
pipeline_c_nb.fit(X_train, y_train)
pipeline_c_nb.score(X_test, y_test)

0.6877323420074349

In [20]:
# Baseline: TF-IDF weights + Gaussian naive Bayes, with the same 10000-feature
# cap and dense conversion as the count-based variant.
pipeline_tfidf_nb = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000)),
    ('to_dense', transform_sparce),
    ('classifier', GaussianNB())
])

# NOTE(review): scored on the test split, like the other baselines.
pipeline_tfidf_nb.fit(X_train, y_train)
pipeline_tfidf_nb.score(X_test, y_test)

0.6893255443441317

In [16]:


# Cross-validation replaces the fixed validation split here, so train and
# validation rows are pooled. The concatenated X has repeated index labels
# (each part starts at 0); rows still pair with y by position.
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

# Inverse regularisation strengths to try for the logistic regression step.
param_grid = {
    'classifier__C': [0.1, 1, 10, 100]
}

# Rebinds pipeline_tfidf_lr: from here on the name refers to this liblinear
# variant, not to the default-solver baseline fitted earlier.
pipeline_tfidf_lr = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(solver='liblinear'))
])

# 10-fold shuffled CV with a fixed seed; the vectorizer is refit inside each
# fold because it is part of the pipeline being searched.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline_tfidf_lr, param_grid, cv=kf, scoring='accuracy')
grid_search.fit(X_train_val, y_train_val)
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

Best parameters found: {'classifier__C': 100}
Best cross-validation score: 0.915851195352476


In [15]:
# Same pooled train + validation data as the logistic-regression search;
# recomputed so this cell does not depend on that one having been run.
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

# Candidate values for GaussianNB's variance-smoothing term.
# Rebinds param_grid: the name now holds the naive Bayes grid.
param_grid = {
    'classifier__var_smoothing': [1e-10, 1e-9, 1e-8]
}

# Rebinds pipeline_tfidf_nb with the same structure as the earlier baseline.
pipeline_tfidf_nb = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000)),
    ('to_dense', transform_sparce),
    ('classifier', GaussianNB())
])

# 10-fold shuffled CV with the same seed as the logistic-regression search.
kf_nb = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search_nb = GridSearchCV(pipeline_tfidf_nb, param_grid, cv=kf_nb, scoring='accuracy')
grid_search_nb.fit(X_train_val, y_train_val)
print(f"Best parameters found: {grid_search_nb.best_params_}")
print(f"Best cross-validation score: {grid_search_nb.best_score_}")

Best parameters found: {'classifier__var_smoothing': 1e-08}
Best cross-validation score: 0.7517262873846502


In [25]:
def evaluate_model(model, X_test, y_test):
    """Print macro- and micro-averaged precision, recall and F1 for ``model``.

    ``model`` only needs a ``predict`` method. Predictions are made once on
    ``X_test`` and compared with ``y_test``. Nothing is returned; the six
    figures are written to standard output.
    """
    predictions = model.predict(X_test)

    def _averaged(average):
        # (precision, recall, f1) under a single averaging mode.
        return tuple(
            metric(y_test, predictions, average=average)
            for metric in (precision_score, recall_score, f1_score)
        )

    # All six values are computed before anything is printed.
    precision_macro, recall_macro, f1_macro = _averaged('macro')
    precision_micro, recall_micro, f1_micro = _averaged('micro')

    print(f"Precision (Macro): {precision_macro}")
    print(f"Recall (Macro): {recall_macro}")
    print(f"F1 Score (Macro): {f1_macro}")
    print(f"Precision (Micro): {precision_micro}")
    print(f"Recall (Micro): {recall_micro}")
    print(f"F1 Score (Micro): {f1_micro}")


In [26]:
# Test-split metrics for the best TF-IDF + logistic regression pipeline found
# by the grid search (best_estimator_ is the pipeline refit by GridSearchCV).
evaluate_model(grid_search.best_estimator_, X_test, y_test)

Precision (Macro): 0.919615441822286
Recall (Macro): 0.9168014513574156
F1 Score (Macro): 0.9178375068474682
Precision (Micro): 0.9201628606833068
Recall (Micro): 0.9201628606833068
F1 Score (Micro): 0.9201628606833068


In [27]:
# Test-split metrics for the best TF-IDF + Gaussian naive Bayes pipeline found
# by the grid search.
evaluate_model(grid_search_nb.best_estimator_, X_test, y_test)

Precision (Macro): 0.762674604856379
Recall (Macro): 0.7648273062076675
F1 Score (Macro): 0.7611843493506316
Precision (Micro): 0.7663303239511418
Recall (Micro): 0.7663303239511418
F1 Score (Micro): 0.7663303239511418


sklearn.feature_extraction.text.TfidfVectorizer