<a href="https://colab.research.google.com/github/faya98/BBC_NEWS_CLASSIFICATION/blob/main/bbc_news_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries

In [1]:
import numpy as np
import csv  
import pandas as pd
import re
import os
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from google.colab import files
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score,recall_score,f1_score

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Upload the bbc_news_dataset.zip file as it is

In [2]:
# Upload a file through the google.colab `files` helper; the call's return value is kept in `uploaded`.
uploaded = files.upload()

Saving bbc_news_dataset.zip to bbc_news_dataset.zip


# Unzip the bbc_news_dataset.zip file

In [3]:
# Extract the uploaded archive into the current working directory.
# The handle is named `archive` rather than `zip`: notebook cells share one
# namespace, so binding `zip` here would hide the built-in `zip` in every
# later cell.
filename = "bbc_news_dataset.zip"
with ZipFile(filename, 'r') as archive:
  archive.extractall()

# Data Extraction

Reads through every text document and extracts the content

In [4]:
# Collect one [article_text, category] pair per text file found under 'bbc'.
data = []
# The file list is called `filenames`, not `files`, so that the
# google.colab `files` module used by the upload cell is not overwritten.
for (root, _dirs, filenames) in sorted(os.walk('bbc')):
  # Files sitting directly in the top-level folder belong to no category.
  if root=='bbc': continue
  # The category is the folder path relative to 'bbc' (e.g. 'business');
  # relpath works with any path separator.
  category = os.path.relpath(root, 'bbc')
  for txt in sorted(filenames):
    with open(os.path.join(root, txt),'r',encoding='latin1') as f:
      data.append([f.read(),category])

# Convert the extracted data to CSV file

In [5]:
# Write the collected [text, category] rows to 'bbc.csv' under a header row.
header = ['Data', 'Category']
# newline='' hands line-ending control to the csv module; without it, rows and
# the newlines embedded in the article text are written incorrectly on Windows.
# The encoding is fixed so the file does not depend on the platform default.
with open('bbc.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(data)

## Reading the CSV file and looking at the basic information of the file contents

Read the csv file

In [6]:
# Load the CSV written above into a DataFrame (columns 'Data' and 'Category').
raw_data = pd.read_csv("bbc.csv")

Looking at the first few rows(5) of information

In [7]:
# Plain-text preview of the first five rows.
print(raw_data.head())

                                                Data  Category
0  Ad sales boost Time Warner profit\n\nQuarterly...  business
1  Dollar gains on Greenspan speech\n\nThe dollar...  business
2  Yukos unit buyer faces loan claim\n\nThe owner...  business
3  High fuel prices hit BA's profits\n\nBritish A...  business
4  Pernod takeover talk lifts Domecq\n\nShares in...  business


Displaying the count of news articles in each type

In [8]:
# Number of articles per category, most frequent first.
print(raw_data['Category'].value_counts())

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: Category, dtype: int64


Structure of the data (number of rows, number of columns)

In [9]:
# (number of rows, number of columns)
print(raw_data.shape)

(2225, 2)


# Label Encoding

Assigns a numeric label to each category of news article

In [10]:
from sklearn import preprocessing
# Fit a label encoder on the category names and store the resulting integer
# codes in a new 'Category_label' column.
le = preprocessing.LabelEncoder()
raw_data['Category_label'] = le.fit_transform(raw_data['Category'])

In [11]:
# First five rows, now including the 'Category_label' column.
raw_data.head()

Unnamed: 0,Data,Category,Category_label
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,0
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,0
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,0
3,High fuel prices hit BA's profits\n\nBritish A...,business,0
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,0


Maps each numeric label back to its text category

In [12]:
# Build {numeric label: category name}.
# One row per distinct label is enough, so the frame is reduced to those rows
# instead of being walked row by row; this also no longer relies on the frame
# having a default 0..n-1 index.
textlabels = (
    raw_data.drop_duplicates('Category_label')
            .set_index('Category_label')['Category']
            .to_dict()
)
print(textlabels)

{0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}


# Data Pre-Process

In [13]:
def preprocess_text(text, stopwords_list=None):
    '''
    Clean one piece of text for vectorisation.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, split on whitespace, optionally filtered
    against a stop-word list, lemmatised with NLTK's WordNet lemmatizer and
    joined back into a single space-separated string.

    :parameter
        :param text: the text to clean; it is converted with str() first, so
            any value is accepted
        :param stopwords_list: iterable of stop words to delete, or None to
            keep every token
    :return
        the cleaned text as one string
    '''
    #Clean (convert to lowercase and delete punctuation)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    #Tokenization (conversion from string to list)
    tokens = text.split()

    #Delete stop words
    if stopwords_list is not None:
        # The tokens have already lost their punctuation, so the stop words are
        # cleaned the same way; otherwise entries such as "don't" could never
        # match the token "dont". A set keeps each membership test cheap.
        stopwords = {
            re.sub(r'[^\w\s]', '', str(word).lower().strip())
            for word in stopwords_list
        }
        tokens = [token for token in tokens if token not in stopwords]

    #Lemmatization
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    tokens = [lem.lemmatize(token) for token in tokens]

    #Return to string from list
    return " ".join(tokens)

In [14]:
# English stop-word list from the NLTK 'stopwords' corpus
stopwordslist = nltk.corpus.stopwords.words("english")

In [15]:
# Clean every article in 'Data' and keep the result in a new 'Processed_Data' column.
# Extra keyword arguments given to apply() are forwarded to preprocess_text.
raw_data['Processed_Data'] = raw_data['Data'].apply(preprocess_text, stopwords_list=stopwordslist)

In [16]:
# Show the first five rows, now including the 'Processed_Data' column
raw_data.head()

Unnamed: 0,Data,Category,Category_label,Processed_Data
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,0,ad sale boost time warner profit quarterly pro...
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,0,dollar gain greenspan speech dollar hit highes...
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,0,yukos unit buyer face loan claim owner embattl...
3,High fuel prices hit BA's profits\n\nBritish A...,business,0,high fuel price hit ba profit british airway b...
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,0,pernod takeover talk lift domecq share uk drin...


# train/dev/test split of 80%/10%/10%

In [17]:
# Step 1: keep 80% of the articles for training and hold back the other 20%.
x_train, x_dev_test, y_train, y_dev_test = train_test_split(
    raw_data['Processed_Data'],
    raw_data['Category_label'],
    test_size=0.2,
    random_state=42,
)

# Step 2: cut the held-back 20% in half, giving a development set and a test
# set that each contain 10% of the original data.
x_dev, x_test, y_dev, y_test = train_test_split(
    x_dev_test,
    y_dev_test,
    test_size=0.5,
    random_state=42,
)

In [18]:
# Convert every split (train, development and test) to a NumPy array.
# The development arrays used to be stored only under the names
# x_train_dev / y_train_dev, which left x_dev / y_dev unconverted.
x_train=np.array(x_train)
x_dev=np.array(x_dev)
x_test=np.array(x_test)
y_train=np.array(y_train)
y_dev=np.array(y_dev)
y_test=np.array(y_test)

# Previous names kept as aliases so that anything still using them keeps working.
x_train_dev=x_dev
y_train_dev=y_dev

# Tuning the number of features and the feature selection method on the development set, and training a model for each feature entity

Count Feature Entity

In [19]:
# Scoring functions tried with SelectKBest, and their display names (same order)
fs=[chi2,f_classif]
fslist=['chi2','f_classif']

# Feature counts tried
klist=[1000,2000,3000,4000]

# Best development accuracy accepted so far (in percent)
b1=0.0

# Unigram bag-of-words vectorizer; its vocabulary comes from the training texts only
cvec=CountVectorizer(ngram_range=(1,1))
cvec.fit(x_train)

# Dense count document matrices for the three splits
x_train_c=cvec.transform(x_train).toarray()
x_dev_c=cvec.transform(x_dev).toarray()
x_test_c=cvec.transform(x_test).toarray()

# Grid search: every feature count combined with every scoring function
for i, n_features in enumerate(klist):
  for j, score_func in enumerate(fs):

    # Fit the selector on the training matrix; k is capped at the vocabulary size
    c_selector = SelectKBest(score_func, k=min(n_features, x_train_c.shape[1]))
    c_selector.fit(x_train_c, y_train)

    # Keep only the selected columns in the train and development matrices
    x_train_cvec = c_selector.transform(x_train_c)
    x_dev_cvec = c_selector.transform(x_dev_c)

    # Train a logistic regression on the reduced training matrix
    model = LogisticRegression(max_iter=200,random_state=42)
    model.fit(x_train_cvec,y_train)

    # Score it on the development set (accuracy in percent)
    y_dev_pred=model.predict(x_dev_cvec)
    accuracy=accuracy_score(y_dev,y_dev_pred)*100

    print('Number of Features = {} '.format(n_features),end=' ')
    print('Feature Selection = {} '.format(fslist[j]),end=' ')
    print('Accuracy = {} '.format(accuracy))

    # A configuration replaces the current best only when it is at least one
    # percentage point better; smaller gains are ignored
    if b1<accuracy and abs(b1-accuracy)>=1:
      b1=accuracy
      k1=n_features
      fs1=j
      opt_c_model=model #best model (Model-1)
      opt_c_selector=c_selector #best k features selector

  print()

print('For count feature entity:')
print('   Optimum Number of Features = {}'.format(k1))
print('   Optimum Feature Selection  = {}'.format(fslist[fs1]))
# From here on fs1 holds the scoring function itself rather than its index
fs1=fs[fs1]

Number of Features = 1000  Feature Selection = chi2  Accuracy = 95.4954954954955 
Number of Features = 1000  Feature Selection = f_classif  Accuracy = 95.04504504504504 

Number of Features = 2000  Feature Selection = chi2  Accuracy = 95.94594594594594 
Number of Features = 2000  Feature Selection = f_classif  Accuracy = 95.94594594594594 

Number of Features = 3000  Feature Selection = chi2  Accuracy = 95.94594594594594 
Number of Features = 3000  Feature Selection = f_classif  Accuracy = 95.4954954954955 

Number of Features = 4000  Feature Selection = chi2  Accuracy = 96.3963963963964 
Number of Features = 4000  Feature Selection = f_classif  Accuracy = 95.94594594594594 

For count feature entity:
   Optimum Number of Features = 1000
   Optimum Feature Selection  = chi2


One-hot feature entity

In [20]:
# Best development accuracy accepted so far (in percent)
b2=0.0

# One-hot (presence/absence) matrices: every count above 1 is capped at 1.
# np.minimum works on the whole matrix at once and returns a new array, so the
# count matrices stay untouched and no per-cell Python loop is needed.
x_train_o=np.minimum(x_train_c,1)
x_dev_o=np.minimum(x_dev_c,1)
x_test_o=np.minimum(x_test_c,1)

# Grid search: every feature count combined with every scoring function
for i in range(0,len(klist)):
  for j in range(0,len(fs)):

    #selects k best features (k is capped at the vocabulary size)
    o_selector = SelectKBest(fs[j], k=min(klist[i], x_train_o.shape[1]))
    o_selector.fit(x_train_o, y_train)

    #Reduces the number of features in train document matrix to k best features
    x_train_ovec = o_selector.transform(x_train_o)

    #Reduces the number of features in development document matrix to k best features
    x_dev_ovec = o_selector.transform(x_dev_o)

    #Defines Logistic Regression model
    o_model = LogisticRegression(random_state=42)

    #trains the model
    o_model.fit(x_train_ovec,y_train)

    #making predictions using development document matrix
    y_o_dev_pred=o_model.predict(x_dev_ovec)

    #calculating accuracy (in percent)
    accuracy=(accuracy_score(y_dev,y_o_dev_pred))*100

    print('Number of Features = {} '.format(klist[i]),end=' ')
    print('Feature Selection = {} '.format(fslist[j]),end=' ')
    print('Accuracy = {} '.format(accuracy))

    # A configuration replaces the current best only when it is at least one
    # percentage point better; smaller gains are ignored
    if b2<accuracy:
      if abs(b2-accuracy)>=1:
        b2=accuracy
        k2=klist[i]
        fs2=j
        opt_o_model=o_model #best model (Model-2)
        opt_o_selector=o_selector #best k features selector

  print()

print('For one-hot feature entity:')
print('   Optimum Number of Features = {}'.format(k2))
print('   Optimum Feature Selection  = {}'.format(fslist[fs2]))
# From here on fs2 holds the scoring function itself rather than its index
fs2=fs[fs2]

Number of Features = 1000  Feature Selection = chi2  Accuracy = 96.84684684684684 
Number of Features = 1000  Feature Selection = f_classif  Accuracy = 97.2972972972973 

Number of Features = 2000  Feature Selection = chi2  Accuracy = 96.3963963963964 
Number of Features = 2000  Feature Selection = f_classif  Accuracy = 96.84684684684684 

Number of Features = 3000  Feature Selection = chi2  Accuracy = 96.84684684684684 
Number of Features = 3000  Feature Selection = f_classif  Accuracy = 96.84684684684684 

Number of Features = 4000  Feature Selection = chi2  Accuracy = 96.84684684684684 
Number of Features = 4000  Feature Selection = f_classif  Accuracy = 96.84684684684684 

For one-hot feature entity:
   Optimum Number of Features = 1000
   Optimum Feature Selection  = chi2


Tf-idf feature entity

In [21]:
# Best development accuracy accepted so far (in percent)
b3=0.0

# Unigram tf-idf vectorizer; its vocabulary comes from the training texts only
tvec=TfidfVectorizer(ngram_range=(1,1))
tvec.fit(x_train)

# Dense tf-idf document matrices for the three splits
x_train_t=tvec.transform(x_train).toarray()
x_dev_t=tvec.transform(x_dev).toarray()
x_test_t=tvec.transform(x_test).toarray()

# Grid search: every feature count combined with every scoring function
for i, n_features in enumerate(klist):
  for j, score_func in enumerate(fs):

    # Fit the selector on the training matrix; k is capped at the vocabulary size
    t_selector = SelectKBest(score_func, k=min(n_features, x_train_t.shape[1]))
    t_selector.fit(x_train_t, y_train)

    # Keep only the selected columns in the train and development matrices
    x_train_tvec = t_selector.transform(x_train_t)
    x_dev_tvec = t_selector.transform(x_dev_t)

    # Train a logistic regression on the reduced training matrix
    t_model = LogisticRegression(random_state=42)
    t_model.fit(x_train_tvec,y_train)

    # Score it on the development set (accuracy in percent)
    y_t_dev_pred=t_model.predict(x_dev_tvec)
    accuracy=accuracy_score(y_dev,y_t_dev_pred)*100

    print('Number of Features = {} '.format(n_features),end=' ')
    print('Feature Selection = {} '.format(fslist[j]),end=' ')
    print('Accuracy = {} '.format(accuracy))

    # A configuration replaces the current best only when it is at least one
    # percentage point better; smaller gains are ignored
    if b3<accuracy and abs(b3-accuracy)>=1:
      b3=accuracy
      k3=n_features
      fs3=j
      opt_t_model=t_model #best model (Model-3)
      opt_t_selector=t_selector #best k features selector

  print()

print('For tf-idf feature entity:')
print('   Optimum Number of Features = {}'.format(k3))
print('   Optimum Feature Selection  = {}'.format(fslist[fs3])) 
# From here on fs3 holds the scoring function itself rather than its index
fs3=fs[fs3]

Number of Features = 1000  Feature Selection = chi2  Accuracy = 95.4954954954955 
Number of Features = 1000  Feature Selection = f_classif  Accuracy = 95.04504504504504 

Number of Features = 2000  Feature Selection = chi2  Accuracy = 95.94594594594594 
Number of Features = 2000  Feature Selection = f_classif  Accuracy = 95.94594594594594 

Number of Features = 3000  Feature Selection = chi2  Accuracy = 95.94594594594594 
Number of Features = 3000  Feature Selection = f_classif  Accuracy = 95.94594594594594 

Number of Features = 4000  Feature Selection = chi2  Accuracy = 96.3963963963964 
Number of Features = 4000  Feature Selection = f_classif  Accuracy = 95.4954954954955 

For tf-idf feature entity:
   Optimum Number of Features = 1000
   Optimum Feature Selection  = chi2


# Making predictions using test Set

Prediction from Model 1


In [22]:
# Model 1: keep only the count features picked by the best selector,
# then classify the test articles
x_test_cvec = opt_c_selector.transform(x_test_c)
y_c_test_pred = opt_c_model.predict(x_test_cvec)

# Test accuracy of model 1, in percent
accuracy1 = accuracy_score(y_test, y_c_test_pred) * 100

Prediction from Model 2

In [23]:
# Model 2: keep only the one-hot features picked by the best selector,
# then classify the test articles
x_test_ovec = opt_o_selector.transform(x_test_o)
y_o_test_pred = opt_o_model.predict(x_test_ovec)

# Test accuracy of model 2, in percent
accuracy2 = accuracy_score(y_test, y_o_test_pred) * 100

Prediction from Model 3

In [24]:
# Model 3: keep only the tf-idf features picked by the best selector,
# then classify the test articles
x_test_tvec = opt_t_selector.transform(x_test_t)
y_t_test_pred = opt_t_model.predict(x_test_tvec)

# Test accuracy of model 3, in percent
accuracy3 = accuracy_score(y_test, y_t_test_pred) * 100

# Final Prediction: Combining the results of Model 1, 2 and 3 (Majority Voting)

In [25]:
pred_size=(y_test.shape)[0]

# One row per model, one column per test article
votes=np.vstack([y_c_test_pred,y_t_test_pred,y_o_test_pred])

# Majority vote per article. np.bincount counts the votes each label received,
# so the set of labels no longer has to be hard-coded. argmax returns the most
# voted label; when all three models disagree it returns the smallest label,
# which is the same tie-break the earlier dictionary-based version applied.
final_pred=np.array([np.bincount(votes[:,i]).argmax() for i in range(pred_size)])

#calculates the accuracy of the combined model (in percent)
final_accuracy=accuracy_score(y_test, final_pred)*100

# Accuracy of Model 1, 2, 3 and Accuracy of model combiner

In [26]:
# Test-set accuracy (in percent, rounded to two decimals) of each single model
# and of the majority-vote combination.
print('Accuracy of Model 1        : {} %'.format(round(accuracy1,2)))
print('Accuracy of Model 2        : {} %'.format(round(accuracy2,2)))
print('Accuracy of Model 3        : {} %'.format(round(accuracy3,2)))
print('Accuracy of Model Combiner : {} %'.format(round(final_accuracy,2)))

Accuracy of Model 1        : 96.41 %
Accuracy of Model 2        : 96.41 %
Accuracy of Model 3        : 95.96 %
Accuracy of Model Combiner : 96.86 %


# Macro-Averaged Precision, Macro-Averaged Recall, Macro-Averaged F1_Score

In [27]:
# Macro-averaged scores of the combined (majority-vote) predictions on the test set.
precision=precision_score(y_test, final_pred, average='macro')
recall=recall_score(y_test, final_pred, average='macro')
f1=f1_score(y_test, final_pred, average='macro')


# The scores are fractions; they are printed as percentages rounded to two decimals.
print ("Macro-Averaged Precision : {} %".format(round(precision*100,2)))
print ("Macro-Averaged Recall    : {} %".format(round(recall*100,2)))
print ("Macro-Averaged F1-Score  : {} %".format(round(f1*100,2)))

Macro-Averaged Precision : 97.11 %
Macro-Averaged Recall    : 96.78 %
Macro-Averaged F1-Score  : 96.93 %
