In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize 
    # nltk.download('stopwords')
# nltk.download('punkt')
STOPWORDS = set(stopwords.words('english'))

In [11]:
data = pd.read_json('dataset/news.json', lines=True,)
data.drop(['authors','link','date'],axis=1,inplace=True)

In [12]:
data['Text'] = data['headline'] + data['short_description']
data.drop(['headline','short_description'],axis=1,inplace=True)

In [13]:
data.head()

Unnamed: 0,category,Text
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 5...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [14]:
from sklearn.utils import shuffle

#Handle Multiclass Imbalance Datasets
def Imbalance_to_balance(df,No_of_sample):
    
    # Dict used to store data class wise
    df_dic = {}
    
    # Create dataframe to store data after sampling
    df_new = pd.DataFrame(columns=['category','Text'])
    
    #cal use to iterate over each class in target col
    for cal in df['category'].unique():
        
        # Filtering class from target which has more than No_of_sample row
        if df[df['category'] == cal].shape[0] > No_of_sample:
            
            #Extracting 4000 sample class wise from dataframe
            df_dic[cal] = df[df['category'] == cal].sample(No_of_sample,random_state=42,ignore_index=True)
    
    # store all class which has more than 4000 row
    cal = list(df_dic.keys())
    for classs in cal:
        
        #concate data of each class into new data frame
        df_new = pd.concat([df_new,df_dic[f"{classs}"]],axis=0)
    
    return shuffle(df_new)

In [15]:
df= Imbalance_to_balance(data,8000)

In [16]:
# df = df.head(500).copy()

In [17]:
df['Text'].head()

4548    Romney, Rubio And Many Others Have Called Trum...
2591    My Personal Playlist for Spring 2013When I am ...
6613    Obama Family Attends Church In Matching Monoch...
1308    Blac Chyna And Rob Kardashian Are 'Taking Thin...
7775    Jennifer Aniston Gets Dolled Up By Ellen DeGen...
Name: Text, dtype: object

In [18]:
df['category'].value_counts()

POLITICS          8000
WELLNESS          8000
STYLE & BEAUTY    8000
ENTERTAINMENT     8000
PARENTING         8000
TRAVEL            8000
Name: category, dtype: int64

In [19]:
max_len = df['Text'].apply(lambda x:len(x.split())).max()
max_len

209

In [20]:
import re
def process_text(text):
    text = text.lower().replace('\n',' ').replace('\r','').strip()
    text = re.sub(' +', ' ', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'[0-9]','',text)
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    
    text = " ".join(filtered_sentence)
    return text

In [21]:
df['Text'] = df['Text'].apply(lambda x:process_text(x))

In [22]:
max_len = df['Text'].apply(lambda x:len(x.split())).max()
max_len

128

In [23]:
label_dic = {}
for key, label in enumerate(df['category'].unique()):
    label_dic[label] = key

In [24]:
print(label_dic)

{'POLITICS': 0, 'WELLNESS': 1, 'STYLE & BEAUTY': 2, 'ENTERTAINMENT': 3, 'PARENTING': 4, 'TRAVEL': 5}


`Building Model`

In [25]:
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomTreesEmbedding
from xgboost import XGBRFClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn import svm


models = []
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
models.append(("ExtraTreeClassifier",ExtraTreeClassifier()))
models.append(("XGBRFClassifier",XGBRFClassifier()))
models.append(("XGBClassifier",XGBClassifier()))
models.append(("ExtraTreeClassifier",ExtraTreeClassifier()))
models.append(("LinearSVC",LinearSVC()))
models.append(("KNeighbors",KNeighborsClassifier()))


# results = []
# namwarnings[]
# for name,model in models:
#     result = cross_val_score(model, X, y,  cv=5)
#     names.append(name)
#     results.append(result)

# for i in range(len(names)):
#     print(names[i],results[i].mean())


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [27]:
max_len = df['Text'].apply(lambda x:len(x.split())).max()
max_len

128

TF-IDF

In [28]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tokenizer = Tokenizer()

In [29]:
TRAIN_SIZE = 0.8
MAX_NB_WORDS = 1000
max_len = int(round(df['Text'].apply(lambda x: len(str(x).split())).max()))

In [30]:
def TF_IDF_ML(X,y):
    tokenizer.fit_on_texts(X)
    word_index = tokenizer.word_index
    vocab_size = len(tokenizer.word_index) + 1
    
    #print("Vocabulary Size :", vocab_size)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30, random_state=42)
    
    
    X_train = pad_sequences(tokenizer.texts_to_sequences(X_train),
                        maxlen = max_len)
    X_test = pad_sequences(tokenizer.texts_to_sequences(X_test),
                       maxlen = max_len)
    return X_train, X_test, y_train, y_test

In [31]:
X_train, X_test, y_train, y_test = TF_IDF_ML(df['Text'],df['category'])

In [32]:
from sklearn.metrics import classification_report
def Train_Model(model):
    print("training:",model)
    print("xtrain",X_train.dtype)
    print("xtest",X_test.dtype)
    print("ytrain",y_train.dtype)
    print("ytest",y_test.dtype)
    model.fit(X_train,y_train)

    return classification_report(y_test,model.predict(X_test))

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
enc = LabelEncoder()
y_train = enc.fit_transform(y_train)
y_test = enc.transform(y_test)

In [35]:
for model in models:
    print(model[0])
    print(Train_Model(model[1]))
    print("\n\n")

DecisionTree
training: DecisionTreeClassifier()
xtrain int32
xtest int32
ytrain int32
ytest int32
              precision    recall  f1-score   support

           0       0.36      0.35      0.35      2453
           1       0.24      0.25      0.25      2357
           2       0.28      0.28      0.28      2453
           3       0.30      0.31      0.30      2364
           4       0.21      0.20      0.21      2421
           5       0.24      0.23      0.23      2352

    accuracy                           0.27     14400
   macro avg       0.27      0.27      0.27     14400
weighted avg       0.27      0.27      0.27     14400




RandomForest
training: RandomForestClassifier()
xtrain int32
xtest int32
ytrain int32
ytest int32
              precision    recall  f1-score   support

           0       0.51      0.51      0.51      2453
           1       0.31      0.35      0.33      2357
           2       0.39      0.33      0.36      2453
           3       0.37      0.49      0.



              precision    recall  f1-score   support

           0       0.28      0.48      0.35      2453
           1       0.21      0.27      0.23      2357
           2       0.23      0.25      0.24      2453
           3       0.17      0.10      0.13      2364
           4       0.21      0.10      0.13      2421
           5       0.19      0.16      0.17      2352

    accuracy                           0.23     14400
   macro avg       0.22      0.23      0.21     14400
weighted avg       0.22      0.23      0.21     14400




KNeighbors
training: KNeighborsClassifier()
xtrain int32
xtest int32
ytrain int32
ytest int32
              precision    recall  f1-score   support

           0       0.34      0.46      0.39      2453
           1       0.24      0.36      0.29      2357
           2       0.28      0.26      0.27      2453
           3       0.30      0.23      0.26      2364
           4       0.23      0.14      0.17      2421
           5       0.24      0.20  

In [36]:
y_train

array([2, 5, 4, ..., 1, 2, 4])

In [37]:
model_XGBC = XGBClassifier()
model_XGBC.fit(X_train,y_train)
print(classification_report(y_test,model_XGBC.predict(X_test)))

              precision    recall  f1-score   support

           0       0.53      0.55      0.54      2453
           1       0.43      0.46      0.45      2357
           2       0.51      0.45      0.48      2453
           3       0.50      0.55      0.53      2364
           4       0.39      0.33      0.36      2421
           5       0.36      0.39      0.37      2352

    accuracy                           0.46     14400
   macro avg       0.45      0.46      0.45     14400
weighted avg       0.45      0.46      0.45     14400



In [38]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re

In [39]:
from gensim.models import doc2vec

In [40]:
labels = df['category'].map(label_dic)
labels[0]

0    3
0    5
0    2
0    1
0    0
0    4
Name: category, dtype: int64

In [41]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled
X_train, X_test, y_train, y_test = train_test_split(df['Text'],labels, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [42]:
all_data[:2]

[TaggedDocument(words=['malaysia', 'airlines', 'flight', 'mh', 'tragedy', 'keep', 'travelingits', 'never', 'easy', 'cope', 'death', 'innocents', 'let', 'alone', 'thousands', 'upon', 'millions', 'already', 'lost', 'politicallycharged', 'conflict', 'travelers', 'best', 'travel'], tags=['Train_0']),
 TaggedDocument(words=['facebook', 'suspends', 'political', 'research', 'firm', 'linked', 'trump', 'violating', 'user', 'privacythe', 'group', 'reportedly', 'obtained', 'personal', 'information', 'million', 'users'], tags=['Train_1'])]

In [43]:
all_data[:1]

[TaggedDocument(words=['malaysia', 'airlines', 'flight', 'mh', 'tragedy', 'keep', 'travelingits', 'never', 'easy', 'cope', 'death', 'innocents', 'let', 'alone', 'thousands', 'upon', 'millions', 'already', 'lost', 'politicallycharged', 'conflict', 'travelers', 'best', 'travel'], tags=['Train_0'])]

In [44]:
len(all_data)

48000

In [45]:
df.head()

Unnamed: 0,category,Text
4548,POLITICS,romney rubio many others called trump con man ...
2591,WELLNESS,personal playlist spring devising new playlist...
6613,STYLE & BEAUTY,obama family attends church matching monochrom...
1308,ENTERTAINMENT,blac chyna rob kardashian taking things slow w...
7775,STYLE & BEAUTY,jennifer aniston gets dolled ellen degeneres v...


In [46]:
max_lengh = df['Text'].apply(lambda x:len(x.split())).max()
max_lengh

128

In [47]:
model_dbow = Doc2Vec(dm=0, vector_size=max_lengh, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 48000/48000 [00:00<00:00, 4352913.28it/s]
100%|██████████| 48000/48000 [00:00<00:00, 4343053.59it/s]
100%|██████████| 48000/48000 [00:00<00:00, 3681434.20it/s]
100%|██████████| 48000/48000 [00:00<00:00, 3418863.11it/s]
100%|██████████| 48000/48000 [00:00<00:00, 3436545.68it/s]
100%|██████████| 48000/48000 [00:00<00:00, 3691153.62it/s]
100%|██████████| 48000/48000 [00:00<00:00, 3071297.04it/s]
100%|██████████| 48000/48000 [00:00<00:00, 2991657.63it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1493709.09it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1631244.72it/s]
100%|██████████| 48000/48000 [00:00<00:00, 2938559.55it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1530348.99it/s]
100%|██████████| 48000/48000 [00:00<00:00, 2029113.29it/s]
100%|██████████| 48000/48000 [00:00<00:00, 2870721.82it/s]
100%|██████████| 48000/48000 [00:00<00:00, 4801950.87it/s]
100%|██████████| 48000/48000 [00:00<00:00, 4559748.87it/s]
100%|██████████| 48000/48000 [00:00<00:00, 4237473.26it/

In [48]:
model_dbow

<gensim.models.doc2vec.Doc2Vec at 0x1b00f2f5bb0>

In [49]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), max_lengh, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), max_lengh, 'Test')

  vectors[i] = model.docvecs[prefix]


In [50]:
from sklearn.metrics import classification_report
def Train_Model(model):
    model.fit(train_vectors_dbow,y_train)
    return print(classification_report(y_test,model.predict(test_vectors_dbow)))

In [51]:
for model in models:
    print(model[0])
    print(Train_Model(model[1]))
    print("\n\n")

DecisionTree
              precision    recall  f1-score   support

           0       0.56      0.55      0.56      2424
           1       0.42      0.40      0.41      2399
           2       0.49      0.49      0.49      2359
           3       0.46      0.49      0.47      2377
           4       0.41      0.40      0.41      2432
           5       0.50      0.49      0.50      2409

    accuracy                           0.47     14400
   macro avg       0.47      0.47      0.47     14400
weighted avg       0.47      0.47      0.47     14400

None



RandomForest
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      2424
           1       0.67      0.70      0.69      2399
           2       0.79      0.77      0.78      2359
           3       0.74      0.71      0.72      2377
           4       0.69      0.63      0.66      2432
           5       0.76      0.79      0.77      2409

    accuracy                           0.74



              precision    recall  f1-score   support

           0       0.84      0.86      0.85      2424
           1       0.77      0.73      0.75      2399
           2       0.84      0.82      0.83      2359
           3       0.78      0.79      0.78      2377
           4       0.75      0.75      0.75      2432
           5       0.81      0.85      0.83      2409

    accuracy                           0.80     14400
   macro avg       0.80      0.80      0.80     14400
weighted avg       0.80      0.80      0.80     14400

None



KNeighbors
              precision    recall  f1-score   support

           0       0.61      0.92      0.73      2424
           1       0.83      0.33      0.47      2399
           2       0.95      0.39      0.56      2359
           3       0.40      0.84      0.54      2377
           4       0.79      0.25      0.38      2432
           5       0.64      0.82      0.72      2409

    accuracy                           0.59     14400
   m

In [52]:
models

[('DecisionTree', DecisionTreeClassifier()),
 ('RandomForest', RandomForestClassifier()),
 ('ExtraTreeClassifier', ExtraTreeClassifier()),
 ('XGBRFClassifier',
  XGBRFClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                  colsample_bylevel=1, colsample_bytree=1,
                  early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
                  importance_type=None, interaction_constraints='', max_bin=256,
                  max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
                  max_leaves=0, min_child_weight=1, missing=nan,
                  monotone_constraints='()', n_estimators=100, n_jobs=0,
                  num_parallel_tree=100, objective='multi:softprob',
                  predictor='auto', random_state=0, reg_alpha=0,
                  sampling_method='uniform', scale_pos_weight=None, ...)),
 ('XGBClassifier',
  XGBClassifier(base_score=0.5, booster='gb

In [53]:
from joblib import dump

In [54]:
model_dict = {
    'max_len':max_len,
    'tokenizer':tokenizer,
    'encoder':enc,
    'vectorzier':vectorizer,
    'models':models,
}

In [55]:
dump(model_dict,"models.pk")

['models.pk']

In [56]:
from joblib import load

In [57]:
models = load('models.pk')

In [58]:
models

{'max_len': 128,
 'tokenizer': <keras_preprocessing.text.Tokenizer at 0x1b06b8de5e0>,
 'encoder': LabelEncoder(),
 'vectorzier': TfidfVectorizer(),
 'models': [('DecisionTree', DecisionTreeClassifier()),
  ('RandomForest', RandomForestClassifier()),
  ('ExtraTreeClassifier', ExtraTreeClassifier()),
  ('XGBRFClassifier',
   XGBRFClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                   colsample_bylevel=1, colsample_bytree=1,
                   early_stopping_rounds=None, enable_categorical=False,
                   eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
                   importance_type=None, interaction_constraints='', max_bin=256,
                   max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
                   max_leaves=0, min_child_weight=1, missing=nan,
                   monotone_constraints='()', n_estimators=100, n_jobs=0,
                   num_parallel_tree=100, objective='multi:softprob',
                   predictor='a