In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize 
    # nltk.download('stopwords')
# nltk.download('punkt')
STOPWORDS = set(stopwords.words('english'))

In [3]:
data = pd.read_json('dataset/news.json', lines=True,)
data.drop(['authors','link','date'],axis=1,inplace=True)

In [4]:
data['Text'] = data['headline'] + data['short_description']
data.drop(['headline','short_description'],axis=1,inplace=True)

In [5]:
data.head()

Unnamed: 0,category,Text
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 5...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [6]:
from sklearn.utils import shuffle

#Handle Multiclass Imbalance Datasets
def Imbalance_to_balance(df,No_of_sample):
    
    # Dict used to store data class wise
    df_dic = {}
    
    # Create dataframe to store data after sampling
    df_new = pd.DataFrame(columns=['category','Text'])
    
    #cal use to iterate over each class in target col
    for cal in df['category'].unique():
        
        # Filtering class from target which has more than No_of_sample row
        if df[df['category'] == cal].shape[0] > No_of_sample:
            
            #Extracting 4000 sample class wise from dataframe
            df_dic[cal] = df[df['category'] == cal].sample(No_of_sample,random_state=42,ignore_index=True)
    
    # store all class which has more than 4000 row
    cal = list(df_dic.keys())
    for classs in cal:
        
        #concate data of each class into new data frame
        df_new = pd.concat([df_new,df_dic[f"{classs}"]],axis=0)
    
    return shuffle(df_new)

In [7]:
df= Imbalance_to_balance(data,8000)

In [8]:
# df = df.head(500).copy()

In [9]:
df['Text'].head()

6056    Extended Breastfeeding -- Time to Come Out of ...
309     What Not to Say to a Woman After a Miscarriage...
6885    Interview With Nick Frost on Cuban Fury, Wrapp...
3698    The Best Cartoon Character Baby NamesToday is ...
4349    12 Lessons My Twins Taught Me In Their First 1...
Name: Text, dtype: object

In [10]:
df['category'].value_counts()

PARENTING         8000
ENTERTAINMENT     8000
WELLNESS          8000
STYLE & BEAUTY    8000
TRAVEL            8000
POLITICS          8000
Name: category, dtype: int64

In [11]:
max_len = df['Text'].apply(lambda x:len(x.split())).max()
max_len

209

In [12]:
import re
def process_text(text):
    text = text.lower().replace('\n',' ').replace('\r','').strip()
    text = re.sub(' +', ' ', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'[0-9]','',text)
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    
    text = " ".join(filtered_sentence)
    return text

In [13]:
df['Text'] = df['Text'].apply(lambda x:process_text(x))

In [14]:
max_len = df['Text'].apply(lambda x:len(x.split())).max()
max_len

128

In [15]:
label_dic = {}
for key, label in enumerate(df['category'].unique()):
    label_dic[label] = key

In [16]:
print(label_dic)

{'PARENTING': 0, 'ENTERTAINMENT': 1, 'WELLNESS': 2, 'STYLE & BEAUTY': 3, 'TRAVEL': 4, 'POLITICS': 5}


`Building Model`

In [17]:
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomTreesEmbedding
from xgboost import XGBRFClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn import svm


models = []
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
models.append(("ExtraTreeClassifier",ExtraTreeClassifier()))
models.append(("XGBRFClassifier",XGBRFClassifier()))
models.append(("XGBClassifier",XGBClassifier()))
models.append(("ExtraTreeClassifier",ExtraTreeClassifier()))
models.append(("LinearSVC",LinearSVC()))
models.append(("KNeighbors",KNeighborsClassifier()))


# results = []
# namwarnings[]
# for name,model in models:
#     result = cross_val_score(model, X, y,  cv=5)
#     names.append(name)
#     results.append(result)

# for i in range(len(names)):
#     print(names[i],results[i].mean())


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [19]:
max_len = df['Text'].apply(lambda x:len(x.split())).max()
max_len

128

TF-IDF

In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tokenizer = Tokenizer()

In [21]:
TRAIN_SIZE = 0.8
MAX_NB_WORDS = 1000
max_len = int(round(df['Text'].apply(lambda x: len(str(x).split())).max()))

In [22]:
def TF_IDF_ML(X,y):
    tokenizer.fit_on_texts(X)
    word_index = tokenizer.word_index
    vocab_size = len(tokenizer.word_index) + 1
    
    #print("Vocabulary Size :", vocab_size)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30, random_state=42)
    
    
    X_train = pad_sequences(tokenizer.texts_to_sequences(X_train),
                        maxlen = max_len)
    X_test = pad_sequences(tokenizer.texts_to_sequences(X_test),
                       maxlen = max_len)
    return X_train, X_test, y_train, y_test

In [23]:
X_train, X_test, y_train, y_test = TF_IDF_ML(df['Text'],df['category'])

In [24]:
from sklearn.metrics import classification_report
def Train_Model(model):
    print("training:",model)
    print("xtrain",X_train.dtype)
    print("xtest",X_test.dtype)
    print("ytrain",y_train.dtype)
    print("ytest",y_test.dtype)
    model.fit(X_train,y_train)

    return classification_report(y_test,model.predict(X_test))

In [25]:
from sklearn.preprocessing import LabelEncoder

In [26]:
enc = LabelEncoder()
y_train = enc.fit_transform(y_train)
y_test = enc.transform(y_test)

In [27]:
for model in models:
    print(model[0])
    print(Train_Model(model[1]))
    print("\n\n")

DecisionTree
training: DecisionTreeClassifier()
xtrain int32
xtest int32
ytrain int32
ytest int32
              precision    recall  f1-score   support

           0       0.35      0.35      0.35      2396
           1       0.25      0.26      0.25      2444
           2       0.30      0.29      0.29      2458
           3       0.30      0.30      0.30      2336
           4       0.21      0.21      0.21      2379
           5       0.25      0.25      0.25      2387

    accuracy                           0.28     14400
   macro avg       0.28      0.28      0.28     14400
weighted avg       0.28      0.28      0.28     14400




RandomForest
training: RandomForestClassifier()
xtrain int32
xtest int32
ytrain int32
ytest int32
              precision    recall  f1-score   support

           0       0.50      0.52      0.51      2396
           1       0.34      0.36      0.35      2444
           2       0.42      0.34      0.37      2458
           3       0.35      0.49      0.



              precision    recall  f1-score   support

           0       0.23      0.26      0.24      2396
           1       0.24      0.14      0.18      2444
           2       0.21      0.29      0.24      2458
           3       0.19      0.27      0.22      2336
           4       0.16      0.17      0.16      2379
           5       0.22      0.10      0.14      2387

    accuracy                           0.20     14400
   macro avg       0.21      0.20      0.20     14400
weighted avg       0.21      0.20      0.20     14400




KNeighbors
training: KNeighborsClassifier()
xtrain int32
xtest int32
ytrain int32
ytest int32
              precision    recall  f1-score   support

           0       0.34      0.48      0.40      2396
           1       0.26      0.35      0.30      2444
           2       0.29      0.27      0.28      2458
           3       0.30      0.24      0.27      2336
           4       0.22      0.14      0.17      2379
           5       0.25      0.21  

In [28]:
y_train

array([2, 2, 3, ..., 1, 5, 0])

In [29]:
model_XGBC = XGBClassifier()
model_XGBC.fit(X_train,y_train)
print(classification_report(y_test,model_XGBC.predict(X_test)))

              precision    recall  f1-score   support

           0       0.50      0.54      0.52      2396
           1       0.45      0.46      0.45      2444
           2       0.51      0.45      0.48      2458
           3       0.50      0.57      0.53      2336
           4       0.39      0.32      0.35      2379
           5       0.36      0.39      0.38      2387

    accuracy                           0.45     14400
   macro avg       0.45      0.45      0.45     14400
weighted avg       0.45      0.45      0.45     14400



In [30]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re

In [31]:
from gensim.models import doc2vec

In [32]:
labels = df['category'].map(label_dic)
labels[0]

0    4
0    1
0    3
0    2
0    0
0    5
Name: category, dtype: int64

In [33]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled
X_train, X_test, y_train, y_test = train_test_split(df['Text'],labels, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [34]:
all_data[:2]

[TaggedDocument(words=['completely', 'normal', 'living', 'mental', 'illnessno', 'one', 'deserves', 'live', 'stigma', 'associated', 'mental', 'illness', 'important', 'thing', 'need', 'society', 'compassionate', 'informed', 'caring', 'supportive', 'mental', 'illness'], tags=['Train_0']),
 TaggedDocument(words=['teen', 'wolf', 'iedthis', 'week', 'teen', 'wolf', 'eyes', 'new', 'deadly', 'assassins', 'invaded', 'beacon', 'hills', 'high', 'school', 'hunt', 'names', 'benefactors', 'list', 'ill', 'admit', 'tvtag'], tags=['Train_1'])]

In [35]:
all_data[:1]

[TaggedDocument(words=['completely', 'normal', 'living', 'mental', 'illnessno', 'one', 'deserves', 'live', 'stigma', 'associated', 'mental', 'illness', 'important', 'thing', 'need', 'society', 'compassionate', 'informed', 'caring', 'supportive', 'mental', 'illness'], tags=['Train_0'])]

In [36]:
len(all_data)

48000

In [37]:
df.head()

Unnamed: 0,category,Text
6056,PARENTING,extended breastfeeding time come closetnot eve...
309,PARENTING,say woman miscarriagei recently miscarriage go...
6885,ENTERTAINMENT,interview nick frost cuban fury wrapping corne...
3698,PARENTING,best cartoon character baby namestoday donald ...
4349,PARENTING,lessons twins taught first months time preciou...


In [38]:
max_lengh = df['Text'].apply(lambda x:len(x.split())).max()
max_lengh

128

In [39]:
model_dbow = Doc2Vec(dm=0, vector_size=max_lengh, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 48000/48000 [00:00<00:00, 976886.76it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1430496.11it/s]
100%|██████████| 48000/48000 [00:00<00:00, 911363.57it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1597601.87it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1447039.40it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1651463.33it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1443160.00it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1144772.68it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1500891.56it/s]
100%|██████████| 48000/48000 [00:00<00:00, 745469.06it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1426613.80it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1402474.33it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1531315.11it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1442818.69it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1544105.04it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1534337.74it/s]
100%|██████████| 48000/48000 [00:00<00:00, 944809.36it/s]
1

In [40]:
model_dbow

<gensim.models.doc2vec.Doc2Vec at 0x17a95fd45e0>

In [41]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), max_lengh, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), max_lengh, 'Test')

  vectors[i] = model.docvecs[prefix]


In [42]:
from sklearn.metrics import classification_report
def Train_Model(model):
    model.fit(train_vectors_dbow,y_train)
    return print(classification_report(y_test,model.predict(test_vectors_dbow)))

In [43]:
for model in models:
    print(model[0])
    print(Train_Model(model[1]))
    print("\n\n")

DecisionTree
              precision    recall  f1-score   support

           0       0.39      0.38      0.39      2431
           1       0.42      0.44      0.43      2367
           2       0.41      0.41      0.41      2403
           3       0.50      0.48      0.49      2448
           4       0.46      0.46      0.46      2384
           5       0.56      0.60      0.58      2367

    accuracy                           0.46     14400
   macro avg       0.46      0.46      0.46     14400
weighted avg       0.46      0.46      0.46     14400

None



RandomForest
              precision    recall  f1-score   support

           0       0.71      0.70      0.70      2431
           1       0.73      0.71      0.72      2367
           2       0.71      0.71      0.71      2403
           3       0.81      0.78      0.79      2448
           4       0.77      0.77      0.77      2384
           5       0.80      0.85      0.82      2367

    accuracy                           0.75



              precision    recall  f1-score   support

           0       0.77      0.75      0.76      2431
           1       0.77      0.78      0.77      2367
           2       0.77      0.74      0.75      2403
           3       0.84      0.83      0.84      2448
           4       0.80      0.83      0.82      2384
           5       0.84      0.87      0.85      2367

    accuracy                           0.80     14400
   macro avg       0.80      0.80      0.80     14400
weighted avg       0.80      0.80      0.80     14400

None



KNeighbors
              precision    recall  f1-score   support

           0       0.72      0.28      0.40      2431
           1       0.37      0.85      0.52      2367
           2       0.86      0.33      0.48      2403
           3       0.95      0.34      0.50      2448
           4       0.63      0.84      0.72      2384
           5       0.69      0.91      0.79      2367

    accuracy                           0.59     14400
   m

In [44]:
models

[('DecisionTree', DecisionTreeClassifier()),
 ('RandomForest', RandomForestClassifier()),
 ('ExtraTreeClassifier', ExtraTreeClassifier()),
 ('XGBRFClassifier',
  XGBRFClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                  colsample_bylevel=1, colsample_bytree=1,
                  early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
                  importance_type=None, interaction_constraints='', max_bin=256,
                  max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
                  max_leaves=0, min_child_weight=1, missing=nan,
                  monotone_constraints='()', n_estimators=100, n_jobs=0,
                  num_parallel_tree=100, objective='multi:softprob',
                  predictor='auto', random_state=0, reg_alpha=0,
                  sampling_method='uniform', scale_pos_weight=None, ...)),
 ('XGBClassifier',
  XGBClassifier(base_score=0.5, booster='gb

In [45]:
from joblib import dump

In [46]:
model_dict = {
    'max_len':max_len,
    'tokenizer':tokenizer,
    'encoder':enc,
    'vectorzier':vectorizer,
    'models':models,
}

In [47]:
dump(model_dict,"models.pk")

['models.pk']

In [48]:
from joblib import load

In [49]:
models = load('models.pk')

In [50]:
models

{'max_len': 128,
 'tokenizer': <keras_preprocessing.text.Tokenizer at 0x17abb13c520>,
 'encoder': LabelEncoder(),
 'vectorzier': TfidfVectorizer(),
 'models': [('DecisionTree', DecisionTreeClassifier()),
  ('RandomForest', RandomForestClassifier()),
  ('ExtraTreeClassifier', ExtraTreeClassifier()),
  ('XGBRFClassifier',
   XGBRFClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                   colsample_bylevel=1, colsample_bytree=1,
                   early_stopping_rounds=None, enable_categorical=False,
                   eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
                   importance_type=None, interaction_constraints='', max_bin=256,
                   max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
                   max_leaves=0, min_child_weight=1, missing=nan,
                   monotone_constraints='()', n_estimators=100, n_jobs=0,
                   num_parallel_tree=100, objective='multi:softprob',
                   predictor='a