# 📥 Importing Libraries

In [1]:
import re
import json
import string
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# 🗃️ Load Dataset 

In [2]:
path ='Dataset/news.json'

In [3]:
list_ = []
with open(path) as files:
    for file in files:
        list_.append(json.loads(file))

# 📝 Meta information of Dataframe

In [4]:
data = pd.DataFrame(list_)
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   category           200853 non-null  object
 1   headline           200853 non-null  object
 2   authors            200853 non-null  object
 3   link               200853 non-null  object
 4   short_description  200853 non-null  object
 5   date               200853 non-null  object
dtypes: object(6)
memory usage: 9.2+ MB


# 🔎 Checking for NaN values

In [6]:
data.isnull().sum()

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

# 🧹 Cleaning Data

In [7]:
def clean_text(text):
    text = text.lower()                                  # lower-case all characters
    text =  re.sub(r'@\S+', '',text)                     # remove twitter handles
    text =  re.sub(r'http\S+', '',text)                  # remove urls
    text =  re.sub(r'pic.\S+', '',text) 
    text =  re.sub(r"[^a-zA-Z+']", ' ',text)             # only keeps characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text+' ')      # keep words with length>1 only
    text = "".join([i for i in text if i not in string.punctuation])
    words = nltk.tokenize.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')   # remove stopwords
    text = " ".join([i for i in words if i not in stopwords and len(i)>2])
    text= re.sub("\s[\s]+", " ",text).strip()            # remove repeated/leading/trailing spaces
    return text

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
data['Text_cleaning'] = data.headline.apply(clean_text)
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date,Text_cleaning
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,mass shootings texas last week
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,smith joins diplo nicky jam world cups officia...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,hugh grant marries first time age
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,jim carrey blasts castrato adam schiff democra...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,julianna margulies uses donald trump poop bags...


# 🔢 The process of converting words into numbers

In [10]:
vectorizer = CountVectorizer()
data_vectorizer = vectorizer.fit_transform(data['Text_cleaning'])

In [11]:
labels = data['category']

# ✂️ Train test split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data_vectorizer, labels, test_size=0.2, random_state=42)

# 📚 Training model

In [13]:
nb = MultinomialNB()
nb.fit(X_train,y_train)

In [14]:
y_pred = nb.predict(X_test)

# 🧪 Test & Train Accuracy

In [15]:
Acc_train = nb.score(X_train, y_train)
acc_test = nb.score(X_test, y_test)
print('Train Accuracy : {:.2f}%'.format(Acc_train*100))
print('Test Accuracy  : {:.2f}%'.format(acc_test*100))

Train Accuracy : 60.84%
Test Accuracy  : 52.05%


# ✔️ Classification report

In [16]:
print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

          ARTS       0.41      0.05      0.08       298
ARTS & CULTURE       0.46      0.02      0.04       283
  BLACK VOICES       0.60      0.23      0.33       882
      BUSINESS       0.51      0.31      0.39      1121
       COLLEGE       0.62      0.04      0.08       243
        COMEDY       0.67      0.27      0.38      1068
         CRIME       0.52      0.51      0.52       650
CULTURE & ARTS       0.85      0.10      0.18       222
       DIVORCE       0.82      0.44      0.57       696
     EDUCATION       0.88      0.04      0.07       188
 ENTERTAINMENT       0.45      0.78      0.57      3139
   ENVIRONMENT       1.00      0.03      0.06       275
         FIFTY       0.00      0.00      0.00       269
  FOOD & DRINK       0.64      0.70      0.67      1265
     GOOD NEWS       0.62      0.05      0.09       276
         GREEN       0.48      0.14      0.22       515
HEALTHY LIVING       0.40      0.06      0.10  

In [17]:
from joblib import dump,load

In [18]:
clf_dict = {'model':nb, 'vectorizer':vectorizer}
dump(clf_dict, 'news_classifier.joblib')

['news_classifier.joblib']

In [19]:
def make_predict(news_data):
    xinp = clean_text(xinp)
    loaded_model = load('news_classifier.joblib')
    vectorizer = loaded_model['vectorizer']
    nb = loaded_model['model']
    xinpv = vectorizer.transform([xinp])
    y_pred = nb.predict(xinpv)
    return y_pred

# ✨ Thanks