In [1]:
# import os, shutil and the required nltk helpers
import os
import shutil

from nltk import download, word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
# download 20news-bydate.tar.gz file

!curl 'http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz' >> './20news-bydate.tar.gz'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13.7M  100 13.7M    0     0  7072k      0  0:00:01  0:00:01 --:--:-- 7069k


In [3]:
# unpack files, then delete the archive

shutil.unpack_archive('/content/20news-bydate.tar.gz', '/content/20news-bydate')
# Pure-Python delete (no shell escape): portable, and a failure raises instead of
# only printing a message.
os.remove('/content/20news-bydate.tar.gz')

In [4]:
# Download NLTK resources if not already downloaded
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    download(resource)

# First letter of a Penn Treebank tag (as produced by pos_tag) -> WordNet POS code.
# Adjective tags are JJ/JJR/JJS, so the adjective key must be 'J', not 'A'.
_PENN_INITIAL_TO_WORDNET = {
    'J': 'a',  # adjective
    'N': 'n',  # noun
    'V': 'v',  # verb
    'R': 'r',  # adverb
}


# Function to perform lemmatization
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its WordNet lemma.

    The text is tokenized with ``word_tokenize`` and POS-tagged with ``pos_tag``;
    each token is lemmatized using the WordNet POS that corresponds to its tag
    (adjective, noun, verb or adverb). Tokens with any other tag are lemmatized
    with the lemmatizer's default POS.

    Args:
        text: The raw document text.

    Returns:
        The lemmatized tokens joined by single spaces. Original whitespace and
        line breaks are therefore not preserved.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []

    # Perform POS tagging for lemmatization
    for word, tag in pos_tag(word_tokenize(text)):
        # tag[:1] (rather than tag[0]) tolerates an empty tag string
        wn_pos = _PENN_INITIAL_TO_WORDNET.get(tag[:1].upper())

        if wn_pos:
            lemmatized_tokens.append(lemmatizer.lemmatize(word, wn_pos))
        else:
            lemmatized_tokens.append(lemmatizer.lemmatize(word))

    return ' '.join(lemmatized_tokens)

# Function to lemmatize all documents in a directory
def lemmatize_directory(directory):
    """Lemmatize every file found under ``directory``, overwriting each in place.

    The tree is traversed with ``os.walk``. Each file is read as latin1, passed
    through ``lemmatize_text`` and written back to the same path as latin1.
    Progress is printed once before and once after each visited directory, and
    a final message is printed when the walk is complete.

    Args:
        directory: Root of the directory tree to process.
    """
    for current_dir, subdirs, filenames in os.walk(directory):
        print(f'Processing {current_dir}... found directories: {subdirs} and {len(filenames)} files...')

        for filename in filenames:
            target = os.path.join(current_dir, filename)

            # Read the whole document first; the file is closed before it is rewritten.
            with open(target, 'r', encoding='latin1') as source:
                original_text = source.read()

            # Lemmatize before reopening for writing, so a failure here leaves
            # the file on disk untouched.
            new_text = lemmatize_text(original_text)

            with open(target, 'w', encoding='latin1') as sink:
                sink.write(new_text)

        print(f'Finished {current_dir}.')

    print('Finished lemmatizing all files.')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# Root of the unpacked dataset; also passed to load_and_preprocess_data in a later cell.
data_path = '/content/20news-bydate'
# Lemmatizes every file under data_path and overwrites each file in place, so
# re-running this cell processes the already-lemmatized text again.
lemmatize_directory(data_path)

Processing /content/20news-bydate... found directories: ['20news-bydate-train', '20news-bydate-test'] and 0 files...
Finished /content/20news-bydate.
Processing /content/20news-bydate/20news-bydate-train... found directories: ['comp.graphics', 'comp.sys.ibm.pc.hardware', 'rec.motorcycles', 'sci.electronics', 'talk.politics.guns', 'alt.atheism', 'soc.religion.christian', 'comp.windows.x', 'rec.sport.hockey', 'rec.sport.baseball', 'talk.religion.misc', 'sci.space', 'comp.os.ms-windows.misc', 'rec.autos', 'talk.politics.misc', 'comp.sys.mac.hardware', 'talk.politics.mideast', 'sci.med', 'misc.forsale', 'sci.crypt'] and 0 files...
Finished /content/20news-bydate/20news-bydate-train.
Processing /content/20news-bydate/20news-bydate-train/comp.graphics... found directories: [] and 584 files...
Finished /content/20news-bydate/20news-bydate-train/comp.graphics.
Processing /content/20news-bydate/20news-bydate-train/comp.sys.ibm.pc.hardware... found directories: [] and 590 files...
Finished /cont

In [6]:
# import sklearn/tensorflow and other dependencies

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [7]:
def load_and_preprocess_data(data_path):
    """Read every file under ``data_path`` and return a labelled train/test split.

    The whole tree is walked, so files from all subdirectories are pooled. Each
    file's content (read as latin1) is one sample, and its label is the name of
    the directory that directly contains it. Labels are integer-encoded with a
    ``LabelEncoder`` and the pooled samples are split 80/20 with a fixed
    ``random_state`` of 42.

    Args:
        data_path: Root directory of the dataset.

    Returns:
        A tuple ``(train_data, test_data, train_labels, test_labels, label_encoder)``
        where the data/label items are the outputs of ``train_test_split`` on the
        text column and the encoded-label column, and ``label_encoder`` is the
        fitted encoder (use it to map encoded labels back to directory names).
    """
    texts = []
    labels = []

    for root, dirs, files in os.walk(data_path):
        # Name of the containing directory. basename(normpath(...)) works with
        # any path separator and with a root that ends in a separator, where
        # splitting on "/" would yield the full path or an empty label.
        label = os.path.basename(os.path.normpath(root))

        for file in files:
            file_path = os.path.join(root, file)

            # read file and append content and label
            with open(file_path, 'r', encoding='latin1') as f:
                texts.append(f.read())
            labels.append(label)

    # Create a DataFrame for better handling
    df = pd.DataFrame({'text': texts, 'label': labels})

    # Encode labels
    label_encoder = LabelEncoder()
    df['encoded_label'] = label_encoder.fit_transform(df['label'])

    # Split the data into training and testing sets
    train_data, test_data, train_labels, test_labels = train_test_split(
        df['text'], df['encoded_label'], test_size=0.2, random_state=42
    )

    return train_data, test_data, train_labels, test_labels, label_encoder

In [8]:
# Read the lemmatized 20 Newsgroups files and split them into train/test portions
(train_data, test_data,
 train_labels, test_labels,
 label_encoder) = load_and_preprocess_data(data_path)

# Fit the vocabulary on the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)

max_length = 200  # Adjust this based on your dataset and available resources
# +1 presumably accounts for index 0, which padded positions use - confirm against the Tokenizer docs
vocab_size = len(tokenizer.word_index) + 1

# Turn the texts into integer sequences ...
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

# ... and pad them at the end to a fixed length of max_length
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Simple network: embedding -> flatten -> one hidden dense layer -> softmax over the classes
model = Sequential([
    Embedding(vocab_size, 32, input_length=max_length),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Integer class labels, hence the sparse categorical cross-entropy loss
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit, holding out 20% of the training portion for validation
model.fit(
    X_train,
    train_labels,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
)

# Report accuracy on the held-out test portion
test_loss, test_accuracy = model.evaluate(X_test, test_labels)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 81.88%
