In [103]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [104]:
import pandas as pd
train = pd.read_csv('/content/drive/My Drive/keggledatasetIR/BBC News Train.csv')


-> extracting data from csv file
-> seprating article ID, Text columns

In [105]:
train = train[['Text','Category']]


PreProcessig

In [106]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download("punkt",quiet="True")
nltk.download("stopwords",quiet="True")
nltk.download("wordnet",quiet="True")

stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer() #wordnet lemmatizer from NLTK (inbuilt)
stemmer = PorterStemmer() #Porter Stemmer from NLTK (inbuilt)

def preprocess_data(data):


    data = data.translate(str.maketrans('', '', string.punctuation)) 
    data = data.lower()

    tokens = word_tokenize(data)

    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens] #Lemmitization 
    #tokens = [stemmer.stemmer(token) for token in tokens] #if stemming to be done

    return ' '.join(tokens)

train['Text'] = train['Text'].apply(preprocess_data)

In [107]:
print(train)

                                                   Text       Category
0     worldcom exboss launch defence lawyer defendin...       business
1     german business confidence slide german busine...       business
2     bbc poll indicates economic gloom citizen majo...       business
3     lifestyle governs mobile choice faster better ...           tech
4     enron boss 168m payout eighteen former enron d...       business
...                                                 ...            ...
1485  double eviction big brother model caprice holb...  entertainment
1486  dj double act revamp chart show dj duo jk joel...  entertainment
1487  weak dollar hit reuters revenue medium group r...       business
1488  apple ipod family expands market apple expande...           tech
1489  santy worm make unwelcome visit thousand websi...           tech

[1490 rows x 2 columns]


In [108]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(train, test_size=0.3, random_state=None)

train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [109]:
print(train_data)

                                                   Text       Category
0     british library get wireless net visitor briti...           tech
1     arthur hailey king bestseller novelist arthur ...  entertainment
2     vodafone appoints new japan bos vodafone draft...       business
3     germany call eu reform german chancellor gerha...       business
4     blair ready call election tony blair seems cer...       politics
...                                                 ...            ...
1038  dal maso replace bergamasco david dal maso han...          sport
1039  hotspot user gain free net call people using w...           tech
1040  observer monitor uk election minister invite i...       politics
1041  park sell day ticket scotland biggest music fe...  entertainment
1042  roddick face saulnier final andy roddick play ...          sport

[1043 rows x 2 columns]


In [110]:
print(test_data)

                                                  Text  Category
0    mild winter drive u oil 6 u oil price fallen 6...  business
1    camera phone musthaves four time mobile camera...      tech
2    uk risk breaking golden rule uk government rai...  business
3    strong demand trigger oil rally crude oil pric...  business
4    sony psp console hit u march u gamers able buy...      tech
..                                                 ...       ...
442  parliament record scandal locked room heart pa...  politics
443  lifestyle governs mobile choice faster better ...      tech
444  uk help raped rwandan woman britain give £4m g...  politics
445  tmobile bet pocket office tmobile launched lat...      tech
446  wolf appoint hoddle manager glenn hoddle unvei...     sport

[447 rows x 2 columns]


In [146]:
import math

tf={} #no of occurences of term in particular class
cf={} #no of classes that contain the term

tficf={}#dict of dict -> weights

classes=set()
words=set()

N=len(train_data['Text'])
for doc_no in range(N):

  category=train_data["Category"][doc_no]
  text=train_data['Text'][doc_no]

  text=text.split()
  classes.add(category)
  
  #tf and cf
  for term in text:
    words.add(term)
    if term not in tf:
        tf[term]={}
    if category not in tf[term]:
        tf[term][category]=1
    else:
        tf[term][category]+=1

for term in tf:
  cf[term]=len(tf[term])


In [147]:
print(tf)
print(cf)



In [216]:
# calculate tficf
for term in tf:
    for category in tf[term]:
        tf_val = tf[term][category]
        icf_val = math.log10(N / (cf[term]))  # +1 for smoothing/handling cf[term]=0 also
        weight = tf_val * icf_val
        if term not in tficf:
            tficf[term] = {}
        tficf[term][category] = weight

In [220]:
# normalization euclidean
for term in tficf:
    norm = math.sqrt(sum([weight**2 for weight in tficf[term].values()]))
    for category in tficf[term]:
        tficf[term][category] /= norm

In [221]:
print(tficf)



In [180]:
category_probabilities = {}
for category in train_data['Category'].unique():
    category_count = len(train_data[train_data['Category'] == category])
    category_probabilities[category] = category_count / len(train_data)
print(category_probabilities)

{'tech': 0.174496644295302, 'entertainment': 0.19175455417066156, 'business': 0.2195589645254075, 'politics': 0.174496644295302, 'sport': 0.23969319271332695}


In [222]:
classes = set(train_data["Category"])

feature_probs = {}
for term in tficf:
    term_probs = {}
    for category in classes:
        term_weight = tficf[term].get(category, 0)
        term_sum = sum(tficf[term].get(c, 0) for c in classes)
        term_probs[category] = term_weight / term_sum
    feature_probs[term] = term_probs
print(feature_probs)



In [223]:
# classify test data
predictions = []
for doc_no in range(len(test_data)):

    text = test_data['Text'][doc_no]
    text = text.split()

    score = {}
    for category in train_data['Category'].unique():
        score[category]=math.log10(category_probabilities[category])
        for term in text:
          if term in tf:
            if category in tf[term]:
              score[category]+=math.log10(tficf[term][category])
            else:
              score[category]+=0   
    max_score = max(score, key=score.get)
    predictions.append(max_score)

In [240]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

actual_labels = test_data['Category'].tolist()
accuracy = accuracy_score(actual_labels, predictions)
precision= precision_score(actual_labels, predictions,average='macro')
recall=recall_score(actual_labels, predictions,average='macro')
f1=f1_score(actual_labels, predictions,average='macro')
report = classification_report(actual_labels, predictions)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f'Recall:{recall}')
print(f"F1 score: {f1}")


Accuracy: 0.9217002237136466
Precision: 0.9268486214236799
Recall:0.9138441187936115
F1 score: 0.9138397492076933


using feature_probs

In [241]:
def classify_document(document, category_probabilities, feature_probs):
    text = document.split()
    scores = {}
    
    for category in category_probabilities:
        score = math.log10(category_probabilities[category])
        
        for term in text:
            if term in feature_probs:
                if category in feature_probs[term]:
                    # add Laplace smoothing
                    numerator = feature_probs[term][category] + 0.1
                    denominator = sum(feature_probs[term].values()) + 0.1*len(feature_probs[term])
                    score += math.log10(numerator/denominator)
                else:
                    score += math.log10(0.1/(sum(feature_probs[term].values()) + 0.1*len(feature_probs[term])))
            
        scores[category] = score    
    return max(scores, key=scores.get)



In [234]:
test_data['Predicted_Category'] = test_data['Text'].apply(lambda x: classify_document(x, category_probabilities, feature_probs))


In [239]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Get the actual categories for the testing set
actual_categories = test_data['Category'].tolist()

predicted_categories = test_data['Predicted_Category'].tolist()

accuracy = accuracy_score(actual_categories, predicted_categories)

precision = precision_score(actual_categories, predicted_categories, average='macro')

recall = recall_score(actual_categories, predicted_categories, average='macro')

f1 = f1_score(actual_categories, predicted_categories, average='macro')

print('Accuracy: {:.2f}%'.format(accuracy * 100))
print('Precision: {:.2f}%'.format(precision * 100))
print('Recall: {:.2f}%'.format(recall * 100))
print('F1 score: {:.2f}%'.format(f1 * 100))


Accuracy: 96.20%
Precision: 96.08%
Recall: 95.91%
F1 score: 95.88%
