In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
import math
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
import heapq
import sys
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
df=pd.read_csv("English Dataset.csv")
df=df.drop(columns=["ArticleId"],axis=1)
#df["Category"]=df["Category"].astype('category').cat.codes

df.head()

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business


In [3]:
Y=df["Category"]
X=df.drop(columns="Category",axis=1)

In [4]:
categories=["sport","business","tech","politics","entertainment"]
for x in categories:
    cat = df[df["Category"]==x]
    all_text = cat['Text'].str.cat(sep=' ')
    temp=all_text.split()
    counter=Counter(temp)
    print(counter)




In [None]:
# Sports have england': 313 'game': 285  'win': 261
# Business has 'firm': 242 'company': 240 'market': 235
# Tech has 'mobile': 311 'technology': 263 'users': 249
# Politics has 'labour': 469 'government': 430 'blair': 372
# Entertainment has 'film': 506 'music': 232 'show': 187

# These are catergory specific apper most words according to me

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [7]:
def build_model(Vectorized_x,y_train,V,categories):
    # From a vector of words and Y_train finds the log values of the words
    labels_train = y_train.tolist() 
    log_likelihoods={}
    # Get log probs of each word in each class
    for cat in categories:
        indicies=[]
        for i in range(len(labels_train)):
            if labels_train[i]==cat:
                indicies.append(i)
        A_category = Vectorized_x[indicies]
        # Use +1 and V for smoothing
        word_counts_in_category = A_category.sum(axis=0) + 1
        total_words_in_category = A_category.sum() + V
        log_likelihoods_category = np.log(word_counts_in_category / total_words_in_category)
        log_likelihoods[cat] = log_likelihoods_category
    # Finds prior probs of classes
    log_priors = {}
    N = len(y_train) 
    category_counts_train = y_train.value_counts()
    for category_name in categories:
        Nc = category_counts_train.get(category_name, 0) 
        log_priors[category_name] = np.log((Nc + 1) / (N)) 
    return log_priors,log_likelihoods

This is the Naive Bias model after getting vectors 


In [8]:
def predict(X_test_vectorized, log_priors, log_likelihoods):
    # After getting the probs of the words before , than for each new 
    # test case vector find the prob for each category than assigns it to max
    predictions = []
    
    for i in range(X_test_vectorized.shape[0]):
        # Vectorize current x
        doc_vector = X_test_vectorized[i] 
        scores = {}

        for cat in categories: 
            score = log_priors[cat] 
            # Get the log numbers from dict
            log_likelihoods_cat = log_likelihoods[cat]
            
            log_likelihoods_cat = np.array(log_likelihoods_cat).flatten()


            for word_index, word_count in zip(doc_vector.indices, doc_vector.data):
                
                if word_index < log_likelihoods_cat.shape[0]:
                   # Adding the probs becouse of log
                   score += word_count * log_likelihoods_cat[word_index] 
                   
            scores[cat] = score 

        predicted_category = max(scores, key=scores.get)
        predictions.append(predicted_category)
        
    return predictions

In [None]:
# Most basic model with count_Vectorizer and 1 word without any changes
corpus = X_train['Text'] 
vectorizer = CountVectorizer()
A = vectorizer.fit_transform(corpus)
vocabulary = vectorizer.get_feature_names_out()
V = len(vocabulary)
log_priors,log_likelihoods = build_model(A,y_train,V,categories)

In [10]:
X_test_texts = X_test['Text']
X_test_vec = vectorizer.transform(X_test_texts)
pred=predict(X_test_vec,log_priors,log_likelihoods)
print(pred)

['tech', 'politics', 'politics', 'tech', 'tech', 'business', 'tech', 'entertainment', 'business', 'tech', 'politics', 'entertainment', 'business', 'sport', 'tech', 'politics', 'business', 'sport', 'politics', 'business', 'tech', 'entertainment', 'sport', 'business', 'sport', 'tech', 'business', 'sport', 'tech', 'business', 'sport', 'business', 'business', 'politics', 'sport', 'tech', 'politics', 'business', 'entertainment', 'tech', 'sport', 'business', 'politics', 'business', 'sport', 'business', 'politics', 'tech', 'politics', 'business', 'sport', 'sport', 'business', 'sport', 'politics', 'entertainment', 'entertainment', 'politics', 'business', 'entertainment', 'tech', 'business', 'entertainment', 'tech', 'tech', 'tech', 'business', 'tech', 'politics', 'politics', 'politics', 'sport', 'sport', 'sport', 'entertainment', 'entertainment', 'business', 'politics', 'business', 'business', 'business', 'sport', 'sport', 'sport', 'business', 'entertainment', 'sport', 'business', 'business', '

In [None]:
# Bigram vectorizer ngram_range=(2,2) to get 2 words
corpus = X_train['Text'] 
vectorizer_bigram = CountVectorizer(ngram_range=(2,2)) 
A_bigram = vectorizer_bigram.fit_transform(corpus)
vocabulary_bigram = vectorizer_bigram.get_feature_names_out()
V_bigram = len(vocabulary_bigram)
log_priors_bigram, log_likelihoods_bigram = build_model(A_bigram, y_train, V_bigram, categories)

In [12]:
X_test_vec_bigram = vectorizer_bigram.transform(X_test_texts) 
pred_bigram = predict(X_test_vec_bigram, log_priors_bigram, log_likelihoods_bigram)

In [13]:
results = {}

for category_name in categories:
    print(category_name)

    log_likelihood_array = np.array(log_likelihoods[category_name]).flatten()
    
    sorted_indices = np.argsort(log_likelihood_array)
    
    top_10_indices = sorted_indices[-10:][::-1]
    top_10_words = vocabulary[top_10_indices]
    top_10_values = log_likelihood_array[top_10_indices]
    
    bottom_10_indices = sorted_indices[:10]
    bottom_10_words = vocabulary[bottom_10_indices]
    bottom_10_values = log_likelihood_array[bottom_10_indices]
    
    print("BEST: ",top_10_words)
    print("WORST: ",bottom_10_words)
    print()

sport
BEST:  ['the' 'to' 'in' 'and' 'of' 'for' 'he' 'on' 'is' 'but']
WORST:  ['0001' '000bn' 'zillion' 'zip' 'ziyi' 'zodiac' 'zombie' '0400' '041'
 'zoe']

business
BEST:  ['the' 'to' 'of' 'in' 'and' 'said' 'is' 'that' 'for' 'it']
WORST:  ['zorro' 'zooropa' 'zooms' 'zoom' 'zonealarm' 'zeppelin' '0400' '04secs'
 '050505' '0530']

tech
BEST:  ['the' 'to' 'of' 'and' 'in' 'that' 'is' 'it' 'for' 'on']
WORST:  ['zurich' 'zuluaga' 'zorro' 'zooropa' 'zeppelin' 'zeta' 'zhang' 'zhaoxing'
 '050' '050505']

politics
BEST:  ['the' 'to' 'of' 'and' 'in' 'said' 'he' 'for' 'that' 'is']
WORST:  ['zurich' 'zuluaga' 'zorro' 'zooropa' 'zooms' 'zoom' 'zones' 'zonealarm'
 'zombies' 'zeppelin']

entertainment
BEST:  ['the' 'and' 'to' 'of' 'in' 'for' 'on' 'was' 'it' 'is']
WORST:  ['04secs' '050' '001' '001and' '0051' '01' '028' '03' 'zhaoxing' 'zheng']



Same words with best since all the stopwords are in all classes.

Worst are also similar since those words are very specific but not specific to class.

In [14]:
# Vectorizer with TD IDF
corpus = X_train['Text'] 
vectorizer = CountVectorizer()
A = vectorizer.fit_transform(corpus)
vocabulary = vectorizer.get_feature_names_out()
V = len(vocabulary)
# Using unigram fit it into TFs
TF=TfidfTransformer().fit_transform(A)
log_priors_tf,log_likelihoods_tf = build_model(TF,y_train,V,categories)
pred_tf=predict(X_test_vec,log_priors_tf,log_likelihoods_tf)
print(pred)

['tech', 'politics', 'politics', 'tech', 'tech', 'business', 'tech', 'entertainment', 'business', 'tech', 'politics', 'entertainment', 'business', 'sport', 'tech', 'politics', 'business', 'sport', 'politics', 'business', 'tech', 'entertainment', 'sport', 'business', 'sport', 'tech', 'business', 'sport', 'tech', 'business', 'sport', 'business', 'business', 'politics', 'sport', 'tech', 'politics', 'business', 'entertainment', 'tech', 'sport', 'business', 'politics', 'business', 'sport', 'business', 'politics', 'tech', 'politics', 'business', 'sport', 'sport', 'business', 'sport', 'politics', 'entertainment', 'entertainment', 'politics', 'business', 'entertainment', 'tech', 'business', 'entertainment', 'tech', 'tech', 'tech', 'business', 'tech', 'politics', 'politics', 'politics', 'sport', 'sport', 'sport', 'entertainment', 'entertainment', 'business', 'politics', 'business', 'business', 'business', 'sport', 'sport', 'sport', 'business', 'entertainment', 'sport', 'business', 'business', '

In [15]:
# Using stop words with unigram stop_words="english"
corpus = X_train['Text'] 
vectorizer_stop = CountVectorizer(stop_words="english") 
A_stop = vectorizer_stop.fit_transform(corpus)
vocabulary_stop = vectorizer_stop.get_feature_names_out()
V_stop = len(vocabulary_stop)
log_priors_stop, log_likelihoods_stop = build_model(A_stop, y_train, V_stop, categories)
X_test_texts = X_test['Text'] 
X_test_vec_stop = vectorizer_stop.transform(X_test_texts) 
pred_stop = predict(X_test_vec_stop, log_priors_stop, log_likelihoods_stop)

In [16]:
# Bigram with stopwords
corpus = X_train['Text'] 
vectorizer_stop_bi = CountVectorizer(stop_words="english",ngram_range=(2,2))
A_stop_bi = vectorizer_stop_bi.fit_transform(corpus)
vocabulary_stop_bi = vectorizer_stop_bi.get_feature_names_out()
V_stop_bi = len(vocabulary_stop_bi)
log_priors_stop_bi,log_likelihoods_stop_bi = build_model(A_stop_bi,y_train,V_stop_bi,categories)
X_test_texts = X_test['Text'] 
X_test_vec_stop = vectorizer_stop_bi.transform(X_test_texts) 
pred_stop_bi=predict(X_test_vec_stop,log_priors_stop_bi,log_likelihoods_stop_bi)

In [17]:
# Stopwords with TF
corpus = X_train['Text'] 
vectorizer_TF_stop = CountVectorizer(stop_words="english")
A_stop_tf = vectorizer_TF_stop.fit_transform(corpus)
vocabulary_tf_bi = vectorizer_TF_stop.get_feature_names_out()
V_stop_tf = len(vocabulary_tf_bi)
# Using unigram fit it into TFs
TF_stop=TfidfTransformer().fit_transform(A_stop_tf)
log_priors_tf_stop,log_likelihoods_tf_stop = build_model(TF_stop,y_train,V_stop_tf,categories)
X_test_texts = X_test['Text'] 
X_test_vec_stop = vectorizer_TF_stop.transform(X_test_texts) 
pred_tf_stop=predict(X_test_vec_stop,log_priors_tf_stop,log_likelihoods_tf_stop)
print(pred)

['tech', 'politics', 'politics', 'tech', 'tech', 'business', 'tech', 'entertainment', 'business', 'tech', 'politics', 'entertainment', 'business', 'sport', 'tech', 'politics', 'business', 'sport', 'politics', 'business', 'tech', 'entertainment', 'sport', 'business', 'sport', 'tech', 'business', 'sport', 'tech', 'business', 'sport', 'business', 'business', 'politics', 'sport', 'tech', 'politics', 'business', 'entertainment', 'tech', 'sport', 'business', 'politics', 'business', 'sport', 'business', 'politics', 'tech', 'politics', 'business', 'sport', 'sport', 'business', 'sport', 'politics', 'entertainment', 'entertainment', 'politics', 'business', 'entertainment', 'tech', 'business', 'entertainment', 'tech', 'tech', 'tech', 'business', 'tech', 'politics', 'politics', 'politics', 'sport', 'sport', 'sport', 'entertainment', 'entertainment', 'business', 'politics', 'business', 'business', 'business', 'sport', 'sport', 'sport', 'business', 'entertainment', 'sport', 'business', 'business', '

In [18]:
results = {}

for category_name in categories:
    print(category_name)
    log_likelihood_array = np.array(log_likelihoods_stop[category_name]).flatten()
    
    sorted_indices = np.argsort(log_likelihood_array)
    
    top_10_indices = sorted_indices[-10:][::-1]
    top_10_words = vocabulary[top_10_indices]
    top_10_values = log_likelihood_array[top_10_indices]
    
    bottom_10_indices = sorted_indices[:10]
    bottom_10_words = vocabulary[bottom_10_indices]
    bottom_10_values = log_likelihood_array[bottom_10_indices]
    
    print("BEST: ",top_10_words)
    print("WORST: ",bottom_10_words)
    print()

sport
BEST:  ['rose' 'fujitsu' 'enables' 'wilson' 'terminal' 'wentworth' 'wars' 'naris'
 'crucially' 'job']
WORST:  ['0400' '041' '050' '050505' '0530' '001st' '0051' '007' '0100' '0130']

business
BEST:  ['rose' 'wilson' 'money' 'naris' 'figo' 'maldives' 'grassroot'
 'commanding' 'earlier' 'glorious']
WORST:  ['wooden' 'woodbridge' 'wood' 'wonky' 'wolf' 'wolfgang' 'wolfram' 'wolpaw'
 '050505' '0530']

tech
BEST:  ['rose' 'partner' 'millar' 'money' 'naris' 'swiss' 'wilson' 'skidelsky'
 'undulating' 'unemployment']
WORST:  ['03' 'wolf' 'wolfgang' 'wolfram' 'wolpaw' 'nandrolone' 'nanoscale'
 'nanotechnology' 'naoki' 'nadu']

politics
BEST:  ['rose' 'money' 'kirby' 'glorious' 'editing' 'biological' 'partner'
 'owning' 'metatarsal' 'treason']
WORST:  ['0530' '050' '05' '04secs' '041' '0400' '033' '03' '0130' '0100']

entertainment
BEST:  ['rose' 'feroz' 'bennett' 'wilson' 'motherboards' 'naris' 'avenue'
 'activists' 'webster' 'treason']
WORST:  ['wong' 'peter' 'peterhead' 'wolfram' 'wolpaw

Using stop words are meaningles for the most part since they are almost used in every context without giveing context. So it is just slowing down the training and doest give much information. The new words are have much more meaning.

In [19]:
acc = accuracy_score(y_test,pred)
print(f"Normal accu: {acc}")
f1 = f1_score(y_test,pred,average="weighted")
print(f"Normal f1: {f1}")

Normal accu: 0.9765100671140939
Normal f1: 0.9765452996043319


In [20]:
acc_bigram = accuracy_score(y_test, pred_bigram)
print(f"Bigram Acc: {acc_bigram}")
f1_bigram = f1_score(y_test, pred_bigram, average="weighted")
print(f"Bigram F1: {f1_bigram}")

Bigram Acc: 0.9731543624161074
Bigram F1: 0.9733343931508186


In [21]:
acc = accuracy_score(y_test,pred_tf)
print(f"TF IDF ACC: {acc}")
f1 = f1_score(y_test,pred_tf,average="weighted")
print(f"TF IDF F1: {f1}")

TF IDF ACC: 0.9295302013422819
TF IDF F1: 0.928621714775582


In [22]:
acc_stop = accuracy_score(y_test, pred_stop)
print(f"Stop Words Uni Accuracy: {acc_stop}")
f1_stop = f1_score(y_test, pred_stop, average="weighted")
print(f"Stop Words Uni F1: {f1_stop}")

Stop Words Uni Accuracy: 0.9765100671140939
Stop Words Uni F1: 0.9764795084886346


In [23]:
acc_stop_bi = accuracy_score(y_test, pred_stop_bi)
print(f"Stop Words Bigram Accuracy: {acc_stop_bi}")
f1_stop_bi = f1_score(y_test, pred_stop_bi, average="weighted")
print(f"Stop Words Bigram F1: {f1_stop_bi}")

Stop Words Bigram Accuracy: 0.959731543624161
Stop Words Bigram F1: 0.959914515726645


In [24]:
acc_stop_tf = accuracy_score(y_test, pred_tf_stop)
print(f"Stop Words TF Accuracy: {acc_stop_tf}")
f1_stop_tf = f1_score(y_test, pred_tf_stop, average="weighted")
print(f"Stop Words TF F1: {f1_stop_tf}")

Stop Words TF Accuracy: 0.9697986577181208
Stop Words TF F1: 0.969936956003185


Stop words imporved performance in every case . Using bigram dint give any big improvement and TF give a bit worse results.