# Economic news article Text Classification

In [38]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd # to work with csv files

# matplotlib imports are used to plot confusion matrices for the classifiers
import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt 

# import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# pre-processing of text
import string
import re

# import classifiers from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# import different metrics to evaluate the classifiers
from sklearn.metrics import accuracy_score

# from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn import metrics

# import time function from time module to track the training duration
from time import time

In [None]:
!wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv
!ls -lah DATAPATH


--2025-06-03 16:07:54--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12383529 (12M) [application/octet-stream]
Saving to: ‘DATAPATH/Full-Economic-News-DFE-839861.csv’


2025-06-03 16:08:06 (1.09 MB/s) - ‘DATAPATH/Full-Economic-News-DFE-839861.csv’ saved [12383529/12383529]

total 12M
drwxr-xr-x 2 fahad fahad 4.0K Jun  3 16:07 .
drwxr-xr-x 3 fahad fahad 4.0K Jun  3 16:07 ..
-rw-r--r-- 1 fahad fahad  12M Jun  3 16:08 Full-Economic-News-DFE-839861.csv


In [26]:
our_data = pd.read_csv("DATAPATH/Full-Economic-News-DFE-839861.csv" , encoding = "ISO-8859-1" )
shape = our_data.shape
shape

(8000, 15)

In [27]:
# Class distribution in dataset
def class_distribution():
    return display(our_data["relevance"].value_counts() / our_data.shape[0] * 100)

class_distribution()

relevance
no          82.1375
yes         17.7500
not sure     0.1125
Name: count, dtype: float64

There is an imbalance in the data with not relevant being 82% in the dataset. That is, most of the articles are not relevant to US Economy, which makes sense in a real-world scenario, as news articles discuss various topics. We should keep this class imbalance mind when interpreting the classifier performance later. Let us first convert the class labels into binary outcome variables for convenience. 1 for Yes (relevant), and 0 for No (not relevant), and ignore "Not sure"

In [29]:
# removing the row where relavance is 'not sure'
# our_data.head()
our_data = our_data[our_data.relevance != "not sure"]
our_data.shape

(7991, 15)

In [19]:
# Convert label to a numerical variable
class_distribution()
our_data.shape

relevance
no     82.230009
yes    17.769991
Name: count, dtype: float64

(7991, 15)

In [30]:
# converting label to numeric data
# relavent is 1, non-relavent is 0
our_data['relevance'] = our_data.relevance.map({ 'yes': 1, 'no': 0 })

# Take only two column we need
our_data = our_data[["text", "relevance"]]
our_data.head()

Unnamed: 0,text,relevance
0,NEW YORK -- Yields on most certificates of dep...,1
1,The Wall Street Journal Online</br></br>The Mo...,0
2,WASHINGTON -- In an effort to achieve banking ...,0
3,The statistics on the enormous costs of employ...,0
4,NEW YORK -- Indecision marked the dollar's ton...,1


## 2. Text Preprocessing

Typical steps involve tokenization, lower casing, removing, stop words, punctuation markers etc, and vectorization. Other processes such as stemming/lemmatization can also be performed. Here, we are performing the following steps: removing br tags, punctuation, numbers, and stopwords. While we are using sklearn's list of stopwords, there are several other stop word lists (e.g., from NLTK) or sometimes, custom stopword lists are needed depending on the task.



In [22]:
our_data.head()

Unnamed: 0,text,relevance
0,NEW YORK -- Yields on most certificates of dep...,
1,The Wall Street Journal Online</br></br>The Mo...,
2,WASHINGTON -- In an effort to achieve banking ...,
3,The statistics on the enormous costs of employ...,
4,NEW YORK -- Indecision marked the dollar's ton...,


In [47]:
stopwords = ENGLISH_STOP_WORDS

def clean(doc): # doc is a string of text
    doc = doc.replace("</br>", " ") # This text contains a lot of <br/> tags.
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])
    doc = " ".join([token for token in doc.split() if token not in stopwords])
    # remove punctuation and numbers
    return doc

In [48]:
our_data['text'] = our_data['text'].apply(clean)
our_data.head()

Unnamed: 0,text,relevance
0,NEW YORK Yields certificates deposit offered m...,1
1,The Wall Street Journal Online The Morning Bri...,0
2,WASHINGTON In effort achieve banking reform Se...,0
3,The statistics enormous costs employee drug ab...,0
4,NEW YORK Indecision marked dollars tone trader...,1


'textrelevance'