In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Libraries

In [7]:
import nltk
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB, MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

In [8]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [95]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import string

from nltk.tokenize import WhitespaceTokenizer

# Importing the dataset

In [10]:
import os

df_train_og = pd.read_csv('/content/drive/MyDrive/CSE508_Winter2023_A2_98/Q2/BBC_News_Dataset/BBC News Train.csv')
df_test_og = pd.read_csv('/content/drive/MyDrive/CSE508_Winter2023_A2_98/Q2/BBC_News_Dataset/BBC News Test.csv')


df_train = pd.read_csv('/content/drive/MyDrive/CSE508_Winter2023_A2_98/Q2/BBC_News_Dataset/BBC News Train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/CSE508_Winter2023_A2_98/Q2/BBC_News_Dataset/BBC News Test.csv')
df_sample = pd.read_csv('/content/drive/MyDrive/CSE508_Winter2023_A2_98/Q2/BBC_News_Dataset/BBC News Sample Solution.csv')

In [11]:
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [12]:
df_test.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


In [13]:
df_sample.head()

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,business
3,459,entertainment
4,1020,politics


#(1) Pre-processing dataset

##(1.1)Converting text to lower case

In [14]:
df_train['Text'] = df_train['Text'].apply(str.lower)
df_test['Text'] = df_test['Text'].apply(str.lower)

df_train.head()


Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


##(1.2)Removing puntuations from the text

In [15]:
df_train['Text'] = df_train['Text'].str.replace('[^\w\s]','')

df_train_og['Text'][0]




In [16]:
df_train['Text'][0]



##(1.3)Removing stop words

In [17]:
df_train['Text'] = df_train['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

df_train['Text'][0]



##(1.4)Performing lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df_train['Text'] = df_train['Text'].apply(lemmatize)

df_train['Text'][0]



##(1.5)Perform tokenization

In [19]:


def tokenize(text):
  text = nltk.word_tokenize(text)
  return text

df_train['Text'] = df_train['Text'].apply(tokenize)

In [20]:
print(df_train['Text'][0][0:20])

['worldcom', 'exboss', 'launch', 'defence', 'lawyer', 'defending', 'former', 'worldcom', 'chief', 'bernie', 'ebbers', 'battery', 'fraud', 'charge', 'called', 'company', 'whistleblower', 'first', 'witness', 'cynthia']


In [21]:
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,"[worldcom, exboss, launch, defence, lawyer, de...",business
1,154,"[german, business, confidence, slide, german, ...",business
2,1101,"[bbc, poll, indicates, economic, gloom, citize...",business
3,1976,"[lifestyle, governs, mobile, choice, faster, b...",tech
4,917,"[enron, boss, 168m, payout, eighteen, former, ...",business


In [22]:
list(df_train.keys())

['ArticleId', 'Text', 'Category']

In [23]:
dict ={}
for word in list(df_train.keys()):
  dict[word] = list(np.zeros(5,dtype=int))


dict['hey'] = list(np.zeros(5,dtype=int))

dict1 ={}

for word in list(dict.keys()):
  if 'hey1' not in word:
    dict1['hey1'] = list(np.zeros(5,dtype=int))

dict1  

{'hey1': [0, 0, 0, 0, 0]}

#### Encoding categories

In [24]:

# Creating a instance of label Encoder.
le = LabelEncoder()
 
# Using .fit_transform function to fit label
# encoder and return encoded label
df_train['Category']= le.fit_transform(df_train['Category'])
 
# printing label
type(df_train['Category'][0])

numpy.int64

In [25]:
df_train

Unnamed: 0,ArticleId,Text,Category
0,1833,"[worldcom, exboss, launch, defence, lawyer, de...",0
1,154,"[german, business, confidence, slide, german, ...",0
2,1101,"[bbc, poll, indicates, economic, gloom, citize...",0
3,1976,"[lifestyle, governs, mobile, choice, faster, b...",4
4,917,"[enron, boss, 168m, payout, eighteen, former, ...",0
...,...,...,...
1485,857,"[double, eviction, big, brother, model, capric...",1
1486,325,"[dj, double, act, revamp, chart, show, dj, duo...",1
1487,1590,"[weak, dollar, hit, reuters, revenue, medium, ...",0
1488,1587,"[apple, ipod, family, expands, market, apple, ...",4


In [26]:
df_train['Text'][0][:15]

['worldcom',
 'exboss',
 'launch',
 'defence',
 'lawyer',
 'defending',
 'former',
 'worldcom',
 'chief',
 'bernie',
 'ebbers',
 'battery',
 'fraud',
 'charge',
 'called']

##(1.6)Implementing TF-ICF weighing scheme

In [27]:
tf = {}

for ind in range(df_train.shape[0]):
  text=df_train['Text'][ind]
  category = df_train['Category'][ind]
  for word in text:
    if word in list(tf.keys()):
        tf[word][category] +=1  

    else:
      tf[word] = list(np.zeros(5,dtype=int))
      

In [33]:
#Calculating Class Frequency

# adding CF value at the last pos in list for each term

tf_icf = {}
tf_icf = tf.copy()

for word in (list(tf_icf.keys())):
  cf = np.count_nonzero(tf[word])
  # print('cf:',cf)
  if cf>0:
    icf = math.log(5/cf,10)
    # print('icf',icf)
    tf_icf[word] = [ele*icf for ele in tf_icf[word]]


In [34]:
tf['worldcom']

[53, 0, 0, 0, 0]

In [35]:
tf['exboss']

[1, 0, 0, 0, 0]

In [36]:
tf['launch']

[22, 14, 17, 5, 58]

## (1.7)Creating the dataframe with tf-icf weighing scheme

In [38]:
len(tf_icf.keys())

24591

In [39]:
df_train['Category'][1]

0

In [40]:
## Trial Block


vocab = list(tf_icf.keys())[:5]

dict_sample = dict.fromkeys(vocab,0)
dict_sample2 = dict.fromkeys(vocab,1)

list_of_dict= []
list_of_dict.append(dict_sample)
list_of_dict.append(dict_sample2)


temp3 = pd.DataFrame(list_of_dict)

In [41]:
df_train.shape[0]
# df_train['Text'][ind]

1490

In [42]:
vocab = list(tf_icf.keys())
list_of_dict = []

for ind in range(df_train.shape[0]):
  category = df_train['Category'][ind]
  tf_icf_weights = dict.fromkeys(vocab,0)

  for word in list(df_train['Text'][ind]):
    tf_icf_weights[word] = tf_icf[word][category]

  list_of_dict.append(tf_icf_weights)

In [43]:
len(list_of_dict)

1490

In [44]:
result = pd.DataFrame(list_of_dict)
result

Unnamed: 0,worldcom,exboss,launch,defence,lawyer,defending,former,chief,bernie,ebbers,...,4gb,microsoftpartnered,secondgeneration,santy,unwelcome,defaced,phpbb,randomly,tailed,defacement
0,37.04541,0.69897,0.0,0.0,0.0,0.09691,0.0,0.0,4.89279,30.75468,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0
1,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.00000,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0
2,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.00000,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0
3,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.00000,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0
4,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.00000,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.00000,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0
1486,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.00000,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0
1487,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.00000,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0
1488,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.00000,...,0,0,0,0.00000,0,0.00000,0.00000,0,0,0


In [45]:
# concatenating tf_icf weights with the train data 
df = pd.concat([result, df_train], axis=1)

#dropping text column
df.drop(['Text','ArticleId'], axis=1,inplace=True)
df.head()

Unnamed: 0,worldcom,exboss,launch,defence,lawyer,defending,former,chief,bernie,ebbers,...,microsoftpartnered,secondgeneration,santy,unwelcome,defaced,phpbb,randomly,tailed,defacement,Category
0,37.04541,0.69897,0.0,0.0,0.0,0.09691,0.0,0.0,4.89279,30.75468,...,0,0,0.0,0,0.0,0.0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0,0.0,0.0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0,0.0,0.0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0,0.0,0.0,0,0,0,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0,0.0,0.0,0,0,0,0


In [46]:
y=df['Category']
X=df.drop(['Category'],axis=1)

print(X.shape,y.shape)

(1490, 24591) (1490,)


#(2) Train-Test split

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

(1043, 24591) (1043,) (447, 24591) (447,)


In [48]:
X_train.head()

Unnamed: 0,worldcom,exboss,launch,defence,lawyer,defending,former,chief,bernie,ebbers,...,4gb,microsoftpartnered,secondgeneration,santy,unwelcome,defaced,phpbb,randomly,tailed,defacement
240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0
1305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0
1042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0
1426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0
1364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0


In [49]:
y_train.head()

240     3
1305    4
1042    3
1426    2
1364    2
Name: Category, dtype: int64

In [50]:
X_test.head()

Unnamed: 0,worldcom,exboss,launch,defence,lawyer,defending,former,chief,bernie,ebbers,...,4gb,microsoftpartnered,secondgeneration,santy,unwelcome,defaced,phpbb,randomly,tailed,defacement
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0
354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0
1227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0
907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0
575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0.0,0.0,0,0,0


In [51]:
y_test.head()

9       1
354     2
1227    3
907     2
575     3
Name: Category, dtype: int64

#(3) Training the Naive Bayes classifier with TF-ICF

In [52]:
# import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
# create a Gaussian Classifier
model = GaussianNB()
# train the model using the training sets
model.fit(X_train, y_train)


In [53]:
# predicting for X_test
y_pred = model.predict(X_test)
print(y_pred[:5])

[1 2 3 2 3]


In [54]:


# create a Gaussian Classifier
model_multi = MultinomialNB()
# train the model using the training sets
model_multi.fit(X_train, y_train)

In [55]:
# predicting for X_test
y_pred_multi = model_multi.predict(X_test)
print(y_pred_multi[:5])

[1 2 3 2 3]


In [56]:
len(y_train)

1043

### (3.1) Calculate the probability of each category based on the frequency of documents in the training set that belong to that category

In [57]:
# Calculating the probablities
proba_cat = []
total = len(y_train)

for i in range(5):
  cat_freq = list(y_train).count(i)
  prob = cat_freq/total
  proba_cat.append(prob)


In [58]:
df_train_og['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [59]:
# Print the prbablities
i=0
print("="*10,"Probability of each category based on the frequency of documents in the training set that belong to that category","="*10)
print()
for category in (df_train_og['Category'].unique()):
  print("The Probablity of ",category, " category is: ",proba_cat[i])
  i=i+1




The Probablity of  business  category is:  0.2233940556088207
The Probablity of  tech  category is:  0.17641418983700863
The Probablity of  politics  category is:  0.1850431447746884
The Probablity of  sport  category is:  0.23873441994247363
The Probablity of  entertainment  category is:  0.17641418983700863


#(4) Testing the Naive Bayes classifier with TF-ICF

###(4.1) Calculate the accuracy, precision, recall, and F1 score of the classifier.

In [60]:
# import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# printing accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 1.0


In [61]:
# import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# printing accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_multi))

Accuracy: 1.0


In [62]:
from sklearn.metrics import precision_score

print('Precision: %.3f' % precision_score(y_test, y_pred, average = 'weighted'))

Precision: 1.000


In [63]:
from sklearn.metrics import recall_score

print('Recall: %.3f' % recall_score(y_test, y_pred, average = 'weighted'))

Recall: 1.000


In [64]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred, average='weighted')

1.0

#(5) Improving the classifier

### Performance of the classifier on different train-test splits

In [65]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train_3.shape,y_train_3.shape,X_test_3.shape,y_test_3.shape)

(1192, 24591) (1192,) (298, 24591) (298,)


In [66]:
# Calculating y_pred for the above train-test split

# train the model using the training sets
model.fit(X_train_3, y_train_3)

# predicting for X_test
y_pred_3 = model.predict(X_test_3)
print(y_pred_3[:5])


[1 2 3 2 3]


In [67]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X, y, test_size=0.4, random_state=0)

print(X_train_4.shape,y_train_4.shape,X_test_4.shape,y_test_4.shape)

(894, 24591) (894,) (596, 24591) (596,)


In [68]:
# Calculating y_pred for the above train-test split

# train the model using the training sets
model.fit(X_train_4, y_train_4)

# predicting for X_test
y_pred_4 = model.predict(X_test_4)
print(y_pred_4[:5])


[1 2 3 2 3]


In [69]:
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X, y, test_size=0.5, random_state=0)

print(X_train_5.shape,y_train_5.shape,X_test_5.shape,y_test_5.shape)

(745, 24591) (745,) (745, 24591) (745,)


In [70]:
# Calculating y_pred for the above train-test split

# train the model using the training sets
model.fit(X_train_5, y_train_5)

# predicting for X_test
y_pred_5 = model.predict(X_test_5)
print(y_pred_5[:5])


[1 2 3 2 3]


In [71]:
# Function to calculate accuracy, precision, recall and F1 score

def calculate_results(y_test, y_pred, split):

  train_split = 100 -split
  print("="*10,"The performance Report for ",train_split,":",split,"train-test split:","="*12)
  print()
  print('Accuracy:', accuracy_score(y_test, y_pred))
  print('Precision: %.3f' % precision_score(y_test, y_pred, average = 'weighted'))
  print('Recall: %.3f' % recall_score(y_test, y_pred, average = 'weighted'))
  print('F1-Score: %.3f' % f1_score(y_test, y_pred, average = 'weighted'))
  

### Print the results (for different train-test splits)

In [96]:
calculate_results(y_test_3, y_pred_3, 20)


Accuracy: 1.0
Precision: 1.000
Recall: 1.000
F1-Score: 1.000


In [73]:
calculate_results(y_test_4, y_pred_4, 40)


Accuracy: 1.0
Precision: 1.000
Recall: 1.000
F1-Score: 1.000


In [74]:
calculate_results(y_test_5, y_pred_5, 50)


Accuracy: 0.9973154362416108
Precision: 0.997
Recall: 0.997
F1-Score: 0.997


### Performance of the classifier for different encodings

In [75]:
# Preparing dataset

df_train_tfidf = df_train['Text'].apply(' '.join)
# df_train_tfidf = df_train_tfidf.reshape([1490,])

# stopwordremove_text = ' '.join(stopwordremove)

df_train_tfidf

0       worldcom exboss launch defence lawyer defendin...
1       german business confidence slide german busine...
2       bbc poll indicates economic gloom citizen majo...
3       lifestyle governs mobile choice faster better ...
4       enron boss 168m payout eighteen former enron d...
                              ...                        
1485    double eviction big brother model caprice holb...
1486    dj double act revamp chart show dj duo jk joel...
1487    weak dollar hit reuters revenue medium group r...
1488    apple ipod family expands market apple expande...
1489    santy worm make unwelcome visit thousand websi...
Name: Text, Length: 1490, dtype: object

In [76]:
X2 = df_train_tfidf
y2 = df_train_og['Category']
print(type(X2))
print(type(y2))

print(X2.shape)
print(y2.shape)


<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
(1490,)
(1490,)


In [79]:
# Performing train-test split 
X_train_tfidf, X_test_tfidf, Y_train_tfidf, Y_test_tfidf = train_test_split(X2,y2, test_size = 0.3, random_state = 60,shuffle=True)

print(len(X_train_tfidf))
print(type(X_test_tfidf))



print(X_test_tfidf.shape)
print(X_train_tfidf.shape)

print(Y_test_tfidf.shape)
print(Y_train_tfidf.shape)

1043
<class 'pandas.core.series.Series'>
(447,)
(1043,)
(447,)
(1043,)


In [82]:
nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB()),])
nb.fit(X_train_tfidf,Y_train_tfidf)

In [83]:
y_pred_tfidf = nb.predict(X_test_tfidf)

In [84]:
def calculate_results_2(y_test, y_pred):

  # train_split = 100 -split
  # print("The performance Report for ",train_split,":",split,"train-test split:")
  # print()
  print('Accuracy: %.2f' % (accuracy_score(y_test, y_pred)*100),"%")
  print('Precision: %.3f' % precision_score(y_test, y_pred, average = 'weighted'))
  print('Recall: %.3f' % recall_score(y_test, y_pred, average = 'weighted'))
  print('F1-Score: %.3f' % f1_score(y_test, y_pred, average = 'weighted'))

In [85]:
print("="*10,"Results using TF-IDF","="*12)
print()
calculate_results_2(Y_test_tfidf,y_pred_tfidf)


Accuracy: 96.64 %
Precision: 0.967
Recall: 0.966
F1-Score: 0.967


#(6) Conclusion

The performance of TF-ICF was 100% and TF-IDF was 96.6%.
The performance on different train-test splits were almost similar as mentioned in the above sections.
Multinomial Naive Bayes algorithm was used for values generated from TF-IDF vectorizer due to its ability to handle feature counts that are zero (0).
Gaussian Naive Bayes and Multinomial Naive Bayes performed similar  on the values from TF-ICF vectorizer.
