## Classification of News Articles

In [1]:
#Importing Libraries
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

## Using spaCy's small language model

In [4]:
!pip install spacy



In [5]:
import spacy
# Load the small English pipeline once at module level; preprocess() reuses
# this `nlp` object for every article instead of reloading the model.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Read the BBC news dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('BBC News.csv')
# Preview the first rows to confirm the ArticleId / Text / Category layout.
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [7]:
# Inspect column dtypes and non-null counts to spot missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [8]:
# ArticleId is only a row identifier and is not used by the models, so drop it.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError.
df = df.drop(columns='ArticleId', errors='ignore')

In [10]:
def is_whitespace(data):
    """Return the index labels of rows whose text cell is blank.

    The text is taken from the first column of ``data``; any further columns
    are ignored, so the frame may carry one or many extra columns.

    A cell counts as blank when it is

    * an empty string or a string made up only of whitespace, or
    * not a string at all (e.g. a missing value such as ``None``/``NaN``),
      because such a cell holds no text to process.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column holds the article text.

    Returns
    -------
    list
        Index labels of the blank rows, in row order. Empty when none are blank.
    """
    blank = []
    # itertuples() yields (index label, first column, ...); the starred target
    # swallows the remaining columns so the column count does not matter.
    for idx, text, *_ in data.itertuples():
        # "".isspace() is False, so test the stripped string instead: this
        # catches both empty and whitespace-only text.
        if not isinstance(text, str) or not text.strip():
            blank.append(idx)

    return blank

In [11]:
# List the index labels of blank articles; an empty list means no article body is whitespace-only.
is_whitespace(df)

[]

In [12]:
# Show the first raw article to see what the text looks like before cleaning.
df['Text'][0]



## Preprocessing using spaCy

In [13]:
def preprocess(text):
    """Return ``text`` reduced to the lemmas of its content tokens.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace tokens are discarded; the lemma of every
    remaining token is kept, and the lemmas are joined with single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    ]
    return " ".join(lemmas)

Remove stop words and punctuation from the text, and lemmatize the remaining tokens.

In [14]:
# Clean every article with preprocess(). Each call runs the spaCy pipeline on
# one article, so this cell may be slow on the full dataset.
df['processed_text'] = df['Text'].apply(preprocess)

In [15]:
# Compare the raw Text column with the new processed_text column side by side.
df.head()

Unnamed: 0,Text,Category,processed_text
0,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex boss launch defence lawyer defend ...
1,german business confidence slides german busin...,business,german business confidence slide german busine...
2,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicate economic gloom citizen major...
3,lifestyle governs mobile choice faster bett...,tech,lifestyle govern mobile choice fast well funky...
4,enron bosses in $168m payout eighteen former e...,business,enron boss $ 168 m payout eighteen enron direc...


In [16]:
# First article as loaded, for comparison with its processed version.
df['Text'][0]



In [17]:
# The same article after stop-word removal and lemmatization.
df['processed_text'][0]



## Encoding and Splitting

In [18]:
# Encode the category names as integer class ids for the classifiers.
le = LabelEncoder()
le.fit(df['Category'])
# fit() returns the encoder itself, so `cat_fit` is the same fitted object as `le`.
cat_fit = le
y = le.transform(df['Category'])

In [19]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Sanity-check the feature split sizes (test_size=0.2 sends a fifth of the rows to the test set).
X_train.shape, X_test.shape

((1192,), (298,))

In [21]:
# The label splits should have the same lengths as the corresponding feature splits.
y_train.shape, y_test.shape

((1192,), (298,))

## Modelling and evaluation

Each model is a scikit-learn `Pipeline` that runs text vectorization and classification in sequence.

## Model 1: CountVectorizer + Naive Bayes

- `CountVectorizer` over unigrams and bigrams
- Multinomial Naive Bayes classifier

In [22]:
# Raw token counts over unigrams and bigrams, fed to a multinomial Naive Bayes classifier.
model_1 = Pipeline(
    steps=[
        ('c_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('bayes_model', MultinomialNB()),
    ]
)

In [23]:
# Train the whole pipeline (vectorizer, then classifier) on the training split.
model_1.fit(X_train, y_train)

Pipeline(steps=[('c_vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('bayes_model', MultinomialNB())])

In [24]:
# Predict encoded category labels for the held-out test articles.
model_1_pred = model_1.predict(X_test)

In [25]:
# Report accuracy on the test split, formatted to two decimals.
print(f'\nAccuracy score of count vectorizer based model: {accuracy_score(y_test, model_1_pred):.2f}')



Accuracy score of count vectorizer based model: 0.98


In [26]:
# Same classifier as model 1, but on TF-IDF weighted features with the vectorizer's default settings.
model_2 = Pipeline(
    steps=[
        ('t_vector', TfidfVectorizer()),
        ('bayes_model_2', MultinomialNB()),
    ]
)

In [27]:
# Train the TF-IDF pipeline on the same training split used for model 1.
model_2.fit(X_train, y_train)

Pipeline(steps=[('t_vector', TfidfVectorizer()),
                ('bayes_model_2', MultinomialNB())])

In [28]:
# Predict encoded category labels for the held-out test articles.
model_2_pred = model_2.predict(X_test)


In [29]:
# Report accuracy on the test split, formatted to two decimals, for comparison with model 1.
print(f'\nAccuracy score of tfidf based model: {accuracy_score(y_test, model_2_pred):.2f}')


Accuracy score of tfidf based model: 0.96


## In the case of this data:

It appears that `CountVectorizer` with unigram and bigram features gives the better result on this dataset: 98% accuracy.
Compared to this the tfidf vectorizer based method only provided