In [1]:
# Import Libraries (1)
#-----------------------------------------------------------#

## NLP ##
import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from collections import Counter
## Data ##
import numpy as np
import pandas as pd
import os
import csv
## ML ##
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [2]:
# CREATE CSV (ONE TIME ONLY)
# Flattens the per-category folders of .txt files under `path` into a single
# CSV with the columns: filename, class, text.

path='C:/Mydata/natural_language_processing/bbc/'
folder_names = ['business', 'entertainment', 'politics', 'sport', 'tech']
class_labels = [0, 1, 2, 3, 4]
# List of possible encodings to try, in this order.
# NOTE(review): 'latin-1' maps every possible byte, so it never raises
# UnicodeDecodeError and 'windows-1252' is never actually reached.
encodings_to_try = ['utf-8', 'latin-1', 'windows-1252']
# Create or open the CSV file
csv_filename = path+'document_classification.csv'
with open(csv_filename, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['filename', 'class', 'text'])
    # Loop through folders and files; folder i is written with class label i
    for folder_name, class_label in zip(folder_names, class_labels):
        # Let os.path.join insert the separator instead of relying on `path`
        # ending with one.
        folder_path = os.path.join(path, folder_name)
        file_list = sorted(os.listdir(folder_path))
        for file_name in file_list:
            if not file_name.endswith('.txt'):  # Consider only text files, adjust if needed
                continue
            document_id = file_name.split('.')[0]
            file_path = os.path.join(folder_path, file_name)

            # Try different encodings to read the file
            for encoding in encodings_to_try:
                try:
                    with open(file_path, 'r', encoding=encoding) as txt_file:
                        file_content = txt_file.read()
                    break  # Stop trying encodings if one works
                except UnicodeDecodeError:
                    continue
            else:
                # No encoding worked: skip the file rather than writing the
                # previous file's content (or hitting an unbound name).
                print("Skipping", file_path, "- could not decode with any of", encodings_to_try)
                continue

            csv_writer.writerow([document_id, class_label, file_content])

In [3]:
# Pandas DataFrame (2)
#-----------------------------------------------------------#

# Read the generated CSV back and pull out the feature and target columns.
df = pd.read_csv(csv_filename, encoding='latin-1')
inputs, labels = df['text'], df['class']
# Preview the first rows as the cell output
df.head()

Unnamed: 0,filename,class,text
0,1,0,Ad sales boost Time Warner profit\n\nQuarterly...
1,2,0,Dollar gains on Greenspan speech\n\nThe dollar...
2,3,0,Yukos unit buyer faces loan claim\n\nThe owner...
3,4,0,High fuel prices hit BA's profits\n\nBritish A...
4,5,0,Pernod takeover talk lifts Domecq\n\nShares in...


In [None]:
# Histogram of the class labels.
labels.hist(figsize=(10,3))

In [4]:
# Datasets (3)
#-----------------------------------------------------------#

# Split with a fixed seed, then build count matrices. The vocabulary is fitted
# on the training split only and reused to transform the test split.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    labels,
    random_state=10,
)
vectorizer = CountVectorizer(stop_words='english')
Xtrain = vectorizer.fit_transform(x_train)
Xtest = vectorizer.transform(x_test)

In [59]:
# Custom Classes (4)
#-----------------------------------------------------------#
class LemmaTokenizer:
    """Callable tokenizer that returns lemmatized tokens for a document.

    Tokens are POS-tagged first, and each one is lemmatized with the WordNet
    part of speech that ``get_wordnet_pos`` returns for its tag.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize ``doc`` and return the list of lemmatized tokens."""
        tagged_tokens = nltk.pos_tag(word_tokenize(doc))
        lemmas = []
        for token, treebank_tag in tagged_tokens:
            wordnet_pos = get_wordnet_pos(treebank_tag)
            lemmas.append(self.wnl.lemmatize(token, pos=wordnet_pos))
        return lemmas

class StemTokenizer:
    """Callable tokenizer that returns the Porter stem of every token."""

    def __init__(self):
        self.porter = PorterStemmer()

    def __call__(self, doc):
        """Tokenize ``doc`` and return the list of stemmed tokens."""
        tokens = word_tokenize(doc)
        return list(map(self.porter.stem, tokens))
        
# Custom Functions (5)
#-----------------------------------------------------------#

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'V' and 'R' map to ADJ, VERB and ADV. Every other
    tag, including those starting with 'N', maps to NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Nouns and anything unrecognised share the same result.
    return wordnet.NOUN

In [60]:
# Model (6)
#-----------------------------------------------------------#

# Split with a fixed seed and build count features from Porter-stemmed tokens.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    labels,
    random_state=10,
)
vectorizer = CountVectorizer(
    tokenizer=StemTokenizer(),
    stop_words='english',
    lowercase=True,
)
Xtrain = vectorizer.fit_transform(x_train)
Xtest = vectorizer.transform(x_test)

# Fit multinomial naive Bayes on the training counts and report both scores.
model = MultinomialNB().fit(Xtrain, y_train)
print("train score = ", model.score(Xtrain, y_train))
print("test score = ", model.score(Xtest, y_test))



train score =  0.9928057553956835
test score =  0.9838420107719928


In [84]:
# NLP Predetermined Methods (3)
#-----------------------------------------------------------#

# Shared resources for the lemmatization loop.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
# NOTE(review): `text` is not assigned in any cell visible here; it must already
# exist in the kernel when this cell runs - confirm where it comes from.
sentences = text.split(".")
# Empty accumulators filled by the main loop.
lemmatized_words, lemmatized_sentences = [], []

In [89]:
# MAIN CODE (5)
#-----------------------------------------------------------#

# Lemmatize the text sentence by sentence, dropping stop words, then count the
# (word, POS tag) pairs of the result.

# Start from an empty list on every run so that re-executing this cell does not
# append to the output of a previous run.
lemmatized_sentences = []
for sentence in sentences:
    # split() with no argument also splits on newlines/tabs and never yields
    # empty strings, unlike split(" ") on repeated or leading spaces.
    words = sentence.split()
    # Tag the whole sentence first, then drop stop words, so the tagger still
    # sees the full context.
    sentence_lemmas = [lemmatizer.lemmatize(word.lower(), pos=get_wordnet_pos(tag))
                       for word, tag in pos_tag(words) if word.lower() not in stop_words]
    # Each entry holds this sentence's lemmas only. Joining one list shared by
    # all sentences would repeat every earlier sentence in each later one and
    # inflate the counts below.
    if sentence_lemmas:
        lemmatized_sentences.append(" ".join(sentence_lemmas))

lemmatized_text = ". ".join(lemmatized_sentences)
lemmatized_words = lemmatized_text.split()
print(Counter(pos_tag(lemmatized_words)))