In [32]:
from functools import reduce
import nltk
import warnings
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer 
import string
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\andry\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andry\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


# Read data

In [70]:
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [71]:
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


In [72]:
df['category'].value_counts()

POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [73]:
drop_categories = ['LATINO VOICES', 'WORLDPOST', 'WORLD NEWS', 'THE WORLDPOST', 'QUEER VOICES', 'BLACK VOICES']

In [74]:
df = df.drop(index=df[df['category'].isin(drop_categories)].index)

In [75]:
df.describe()

Unnamed: 0,link,headline,category,short_description,authors,date
count,187925,187925,187925,187925.0,187925.0,187925
unique,187887,186432,36,168282.0,25111.0,3878
top,https://www.huffingtonpost.comhttp://www.nytim...,Sunday Roundup,POLITICS,,,2013-06-03 00:00:00
freq,2,90,35602,17115.0,32332.0,100
first,,,,,,2012-01-28 00:00:00
last,,,,,,2022-09-23 00:00:00


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187925 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               187925 non-null  object        
 1   headline           187925 non-null  object        
 2   category           187925 non-null  object        
 3   short_description  187925 non-null  object        
 4   authors            187925 non-null  object        
 5   date               187925 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 10.0+ MB


# Train test split

In [77]:
X_train, X_test, y_train, y_test = train_test_split(df[['headline', 'short_description']], df['category'], test_size=0.3, stratify=df['category'])

In [78]:
y_train.value_counts()

POLITICS          24921
WELLNESS          12561
ENTERTAINMENT     12153
TRAVEL             6930
STYLE & BEAUTY     6870
PARENTING          6154
HEALTHY LIVING     4686
FOOD & DRINK       4438
BUSINESS           4194
COMEDY             3780
SPORTS             3554
HOME & LIVING      3024
PARENTS            2769
WEDDINGS           2557
WOMEN              2500
CRIME              2493
IMPACT             2439
DIVORCE            2398
MEDIA              2061
WEIRD NEWS         1944
GREEN              1835
RELIGION           1804
STYLE              1578
SCIENCE            1544
TECH               1473
TASTE              1467
MONEY              1229
ARTS               1056
ENVIRONMENT        1011
FIFTY               981
GOOD NEWS           979
U.S. NEWS           964
ARTS & CULTURE      937
COLLEGE             801
CULTURE & ARTS      752
EDUCATION           710
Name: category, dtype: int64

In [79]:
y_test.value_counts()

POLITICS          10681
WELLNESS           5384
ENTERTAINMENT      5209
TRAVEL             2970
STYLE & BEAUTY     2944
PARENTING          2637
HEALTHY LIVING     2008
FOOD & DRINK       1902
BUSINESS           1798
COMEDY             1620
SPORTS             1523
HOME & LIVING      1296
PARENTS            1186
WEDDINGS           1096
WOMEN              1072
CRIME              1069
IMPACT             1045
DIVORCE            1028
MEDIA               883
WEIRD NEWS          833
GREEN               787
RELIGION            773
STYLE               676
SCIENCE             662
TECH                631
TASTE               629
MONEY               527
ARTS                453
ENVIRONMENT         433
FIFTY               420
GOOD NEWS           419
U.S. NEWS           413
ARTS & CULTURE      402
COLLEGE             343
CULTURE & ARTS      322
EDUCATION           304
Name: category, dtype: int64

# Create pre-processing class

In [80]:
class Transformer():
    def __init__(self):
        pass
    
    def transform(self, x):
        x['summary'] = x['headline'] + ' ' + x['headline']
        x['summary'] = x['summary'].map(Transformer.__remove_punctuation)
        x['summary'] = x['summary'].map(Transformer.__clean_text)
        x['summary'] = x['summary'].map(Transformer.__lemmatizer)
        x['summary'] = x['summary'].map(Transformer.__remove_stopwords)

        return x['summary']

    def fit(self, x, y):
        return self
    
    @staticmethod
    def __clean_text(text):
        text= text.lower()
        text= ' '.join(re.sub("(@[A-Za-z0-9]+)", " ", text).split()) #tags
        text= ' '.join(re.sub("^@?(\w){1,15}$", " ", text).split())

        text= ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())   #Links
        text= ' '.join(re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"," ", text).split()) 
        text= ' '.join(re.sub(r'http\S+', '',text).split())
        text= ' '.join(re.sub(r'www\S+', '',text).split())
        text= ' '.join(re.sub("\s+", " ",text).split()) #Extrem white Space
        text= ' '.join(re.sub("[^-9A-Za-z ]", "" ,text).split()) #digits 
        text= ' '.join(re.sub('-', ' ', text).split()) 
        text= ' '.join(re.sub('_', ' ', text).split()) #underscore     
        return text
    
    @staticmethod
    def __remove_stopwords(text):
        """The function to removing stopwords"""
        stop = stopwords.words('english')
        text = [word.lower() for word in text.split() if word.lower() not in stop]
        return " ".join(text)
    
    @staticmethod
    def __lemmatizer(text):
        """The function to apply lemmatizing"""
        word_list = nltk.word_tokenize(text)
        lemmatized_text = ' '.join([WordNetLemmatizer().lemmatize(w) for w in word_list])
        return lemmatized_text
    
    @staticmethod
    def __remove_punctuation(text):
        """The function to remove punctuation"""
        text = str(text)
        table = str.maketrans('', '', string.punctuation)
        return text.translate(table)    

# Preprocess data

In [81]:
tfidf = TfidfTransformer()
cv = CountVectorizer()
X_train = tfidf.fit_transform(cv.fit_transform(Transformer().transform(X_train)))

In [82]:
joblib.dump(tfidf, 'models/tfidf.joblib')
joblib.dump(cv, 'models/cv.joblib')

['models/cv.joblib']

# Fit models

## Naive Bayes

In [83]:
# fit new model
model_NB = MultinomialNB()
model_NB = GridSearchCV(model_NB, {'alpha': [0.05, 0.1, 0.5, 1, 2]}, cv=5, n_jobs=4).fit(X_train, y_train).best_estimator_
model_NB

In [84]:
# Dump model
joblib.dump(model_NB, 'models/NB.joblib')

['models/NB.joblib']

In [85]:
# Load fitted model
model_NB = joblib.load('models/NB.joblib')
model_NB

## Linear SVC

In [86]:
# fit new model
model_SVM = LinearSVC()
model_SVM = GridSearchCV(model_SVM, {'C': [1, 7, 20, 35, 50]}, cv=5, verbose=100).fit(X_train, y_train).best_estimator_
model_SVM

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5; 1/5] START C=1.........................................................
[CV 1/5; 1/5] END ..........................C=1;, score=0.598 total time=   7.4s
[CV 2/5; 1/5] START C=1.........................................................
[CV 2/5; 1/5] END ..........................C=1;, score=0.595 total time=   7.3s
[CV 3/5; 1/5] START C=1.........................................................
[CV 3/5; 1/5] END ..........................C=1;, score=0.597 total time=   7.4s
[CV 4/5; 1/5] START C=1.........................................................
[CV 4/5; 1/5] END ..........................C=1;, score=0.601 total time=   7.3s
[CV 5/5; 1/5] START C=1.........................................................
[CV 5/5; 1/5] END ..........................C=1;, score=0.597 total time=   7.3s
[CV 1/5; 2/5] START C=7.........................................................
[CV 1/5; 2/5] END ..........................C=7;,

In [87]:
# Dump model
joblib.dump(model_SVM, 'models/SVC.joblib')

['models/SVC.joblib']

In [88]:
# Load fitted model
model_SVC = joblib.load('models/SVC.joblib')
model_SVC

## Logistic regression

In [89]:
# fit new model
model_LR = LogisticRegression()
model_LR = GridSearchCV(model_LR, {'C': [0.1, 1, 7, 15, 35, 50]}, verbose=100).fit(X_train, y_train).best_estimator_
model_LR

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5; 1/6] START C=0.1.......................................................
[CV 1/5; 1/6] END ........................C=0.1;, score=0.467 total time=  31.9s
[CV 2/5; 1/6] START C=0.1.......................................................
[CV 2/5; 1/6] END ........................C=0.1;, score=0.464 total time=  32.1s
[CV 3/5; 1/6] START C=0.1.......................................................
[CV 3/5; 1/6] END ........................C=0.1;, score=0.463 total time=  32.3s
[CV 4/5; 1/6] START C=0.1.......................................................
[CV 4/5; 1/6] END ........................C=0.1;, score=0.464 total time=  31.7s
[CV 5/5; 1/6] START C=0.1.......................................................
[CV 5/5; 1/6] END ........................C=0.1;, score=0.467 total time=  32.7s
[CV 1/5; 2/6] START C=1.........................................................
[CV 1/5; 2/6] END ..........................C=1;,

In [90]:
# Dump model
joblib.dump(model_LR, 'models/LR.joblib')

['models/LR.joblib']

In [91]:
# Load fitted model
model_LR = joblib.load('models/LR.joblib')
model_LR

# Models testing

In [100]:
tfidf = joblib.load('models/tfidf.joblib')
cv = joblib.load('models/cv.joblib')

In [101]:
X_test = tfidf.transform(cv.transform(Transformer().transform(X_test)))

IndexError: Index dimension must be 1 or 2

## Naive Bayes

In [93]:
y_pred_NB = model_NB.predict(X_test)
accuracy_score(y_test, y_pred_NB)

0.5553584731632907

In [94]:
print(classification_report(y_test, y_pred_NB))

                precision    recall  f1-score   support

          ARTS       0.40      0.09      0.14       453
ARTS & CULTURE       0.44      0.07      0.13       402
      BUSINESS       0.48      0.33      0.39      1798
       COLLEGE       0.69      0.11      0.19       343
        COMEDY       0.53      0.30      0.38      1620
         CRIME       0.60      0.52      0.56      1069
CULTURE & ARTS       0.60      0.13      0.21       322
       DIVORCE       0.77      0.48      0.59      1028
     EDUCATION       0.70      0.05      0.09       304
 ENTERTAINMENT       0.54      0.77      0.63      5209
   ENVIRONMENT       0.66      0.14      0.23       433
         FIFTY       0.67      0.05      0.10       420
  FOOD & DRINK       0.61      0.70      0.65      1902
     GOOD NEWS       0.48      0.13      0.20       419
         GREEN       0.43      0.19      0.27       787
HEALTHY LIVING       0.29      0.11      0.15      2008
 HOME & LIVING       0.71      0.60      0.65  

## Linear SVC

In [95]:
y_pred_SVC = model_SVC.predict(X_test)
accuracy_score(y_test, y_pred_SVC)

0.6026109475327255

In [96]:
print(classification_report(y_test, y_pred_SVC))

                precision    recall  f1-score   support

          ARTS       0.28      0.18      0.22       453
ARTS & CULTURE       0.36      0.21      0.26       402
      BUSINESS       0.47      0.43      0.45      1798
       COLLEGE       0.50      0.39      0.44       343
        COMEDY       0.56      0.43      0.49      1620
         CRIME       0.57      0.59      0.58      1069
CULTURE & ARTS       0.40      0.25      0.31       322
       DIVORCE       0.74      0.66      0.70      1028
     EDUCATION       0.36      0.25      0.29       304
 ENTERTAINMENT       0.65      0.74      0.69      5209
   ENVIRONMENT       0.41      0.26      0.32       433
         FIFTY       0.35      0.18      0.24       420
  FOOD & DRINK       0.62      0.71      0.66      1902
     GOOD NEWS       0.40      0.24      0.30       419
         GREEN       0.39      0.34      0.36       787
HEALTHY LIVING       0.29      0.17      0.21      2008
 HOME & LIVING       0.69      0.70      0.70  

## Logistic regression

In [98]:
y_pred_LR = model_LR.predict(X_test)
accuracy_score(y_test, y_pred_LR)

0.5951967079357197

In [99]:
print(classification_report(y_test, y_pred_LR))

                precision    recall  f1-score   support

          ARTS       0.40      0.16      0.22       453
ARTS & CULTURE       0.39      0.08      0.13       402
      BUSINESS       0.48      0.44      0.46      1798
       COLLEGE       0.55      0.39      0.46       343
        COMEDY       0.52      0.43      0.47      1620
         CRIME       0.57      0.58      0.58      1069
CULTURE & ARTS       0.28      0.41      0.33       322
       DIVORCE       0.79      0.62      0.69      1028
     EDUCATION       0.43      0.24      0.31       304
 ENTERTAINMENT       0.63      0.73      0.67      5209
   ENVIRONMENT       0.52      0.23      0.32       433
         FIFTY       0.35      0.18      0.23       420
  FOOD & DRINK       0.63      0.68      0.65      1902
     GOOD NEWS       0.34      0.26      0.30       419
         GREEN       0.42      0.36      0.39       787
HEALTHY LIVING       0.32      0.20      0.24      2008
 HOME & LIVING       0.71      0.68      0.69  