In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import re
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column — presumably an index
# that was written out with the CSV; confirm and consider reading it with index_col=0.
df = pd.read_csv('final_data_for_training.csv')

In [3]:
# Preview the first rows to confirm which columns were read.
df.head()

Unnamed: 0,description,label
0,"""Until you have a dog you don't understand wha...",entertainment
1,More than half a million people remained witho...,world_news
2,White House officials say the crux of the pres...,world_news
3,FIFA has come under pressure from several Euro...,world_news
4,The incident underscores a growing wave of pro...,world_news


In [4]:
# Samples per class. The recorded output is strongly imbalanced
# (32441 'politics' rows vs 2832 'crime' rows).
df['label'].value_counts()

politics                  32441
health                    23208
entertainment             19416
fashion                   11369
food_and_beverages         8271
world_news                 6961
business                   6887
sports                     4414
science_and_technology     3906
enviornment                3488
crime                      2832
Name: label, dtype: int64

In [5]:
# Distinct label names in alphabetical order.
sorted(df['label'].unique())

['business',
 'crime',
 'entertainment',
 'enviornment',
 'fashion',
 'food_and_beverages',
 'health',
 'politics',
 'science_and_technology',
 'sports',
 'world_news']

In [6]:
# Number of distinct classes (11 in the recorded output); the width of the
# model's softmax output layer has to match this count.
len(sorted(df['label'].unique()))

11

In [7]:
# Pull the raw text and label columns out of the frame as plain Python lists.
x_train = df['description'].tolist()
y_train = df['label'].tolist()

In [8]:
def standardization(input_text):
    """Normalize a raw description before tokenization.

    Every character that is neither an ASCII letter nor whitespace is
    deleted (not replaced by a space, so "don't" becomes "dont"), and the
    remaining text is lower-cased. Whitespace is left untouched.

    Args:
        input_text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    letters_and_spaces = re.compile(r'[^a-zA-Z\s]').sub('', input_text)
    return letters_and_spaces.lower()

In [9]:
# Clean every description once up front; tqdm only wraps the iterable to show progress.
training_text = [standardization(sent) for sent in tqdm(x_train)]

100%|██████████████████████████████████████████████████████████████████████| 123193/123193 [00:00<00:00, 210537.32it/s]


In [10]:
# training_text

In [11]:
# Tokenizer / padding configuration used by the preprocessing and model cells.
VOCAB_SIZE = 10_000      # passed to the Tokenizer as num_words and to Embedding as input_dim
SEQUENCE_LENGTH = 100    # every sequence is padded/truncated to this many tokens

oov_tok = "<OOV>"        # token the Tokenizer substitutes for out-of-vocabulary words
padding_type = 'post'    # pad short sequences at the end
trunc_type = 'post'      # cut long sequences at the end

In [12]:
# Word-level tokenizer: vocabulary limited via num_words=VOCAB_SIZE, and words
# outside that vocabulary are mapped to the oov_tok placeholder.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token=oov_tok,
)

In [13]:
# Build the tokenizer's vocabulary from the cleaned training text.
tokenizer.fit_on_texts(training_text)

In [14]:
# tokenizer.word_index

In [15]:
# Persist the fitted tokenizer so inference can reuse exactly the same vocabulary.
# NOTE(review): unpickling runs arbitrary code — only load this file from a trusted source.

with open('trained_tokeinzer.pickle', 'wb') as pkl_file:
    pickle.dump(tokenizer, pkl_file)

In [16]:
# Convert each cleaned description into a list of integer word indices.
x_train_seq = tokenizer.texts_to_sequences(training_text)

In [17]:
# x_train_seq
# Sanity check: the sequences come back as a plain Python list (see recorded output).
type(x_train_seq)

list

In [18]:
# Force every sequence to exactly SEQUENCE_LENGTH tokens; with both settings at
# 'post', padding and truncation happen at the end of the sequence.
x_train_padded_seq = tf.keras.preprocessing.sequence.pad_sequences(
    x_train_seq,
    maxlen=SEQUENCE_LENGTH,
    padding=padding_type,
    truncating=trunc_type,
)

In [19]:
# x_train_padded_seq
# Sanity check: pad_sequences returned a NumPy array (see recorded output).
type(x_train_padded_seq)

numpy.ndarray

In [20]:
# Expect (number of samples, SEQUENCE_LENGTH); the recorded output is (123193, 100).
x_train_padded_seq.shape

(123193, 100)

In [21]:
# Encoder that maps the string class labels to integer ids.
encoder = LabelEncoder()

In [22]:
# Rebind y_train from the Python list built earlier to a NumPy array.
y_train = np.array(y_train)

In [23]:
# Learn the label -> integer mapping from the training labels.
encoder.fit(y_train)

LabelEncoder()

In [24]:
# Sanity check on the encoder object's type.
type(encoder)

sklearn.preprocessing._label.LabelEncoder

In [25]:
# Class names in id order: the label at position i is the one encoded as integer i.
encoder.classes_

array(['business', 'crime', 'entertainment', 'enviornment', 'fashion',
       'food_and_beverages', 'health', 'politics',
       'science_and_technology', 'sports', 'world_news'], dtype='<U22')

In [26]:
# Persist the fitted label encoder so inference can map predicted ids back to label names.
# NOTE(review): unpickling runs arbitrary code — only load this file from a trusted source.

with open('encoder.pickle', 'wb') as pkl_file:
    pickle.dump(encoder, pkl_file)

In [27]:
# Integer class id for every training label (sparse targets, not one-hot).
y_encoded = encoder.transform(y_train)

In [38]:
# y_encoded

In [39]:
# y_train_encoded = tf.keras.utils.to_categorical(y_encoded)

In [40]:
# y_train_encoded

In [28]:
# Wrap the integer targets in a TensorFlow tensor for training.
y_encoded_tensor = tf.convert_to_tensor(y_encoded)

In [29]:
# Wrap the padded index matrix in a TensorFlow tensor for training.
x_train_tensor = tf.convert_to_tensor(x_train_padded_seq)

In [30]:
# MODEL ARCHITECTURE
# Embedding -> two stacked bidirectional LSTMs -> global max pooling over time
# -> batch-normalised dense head -> softmax over the news categories.
EMBEDDING_DIM = 64

# Derive the output width from the fitted label encoder instead of hard-coding 11,
# so the classifier head always matches the labels actually present in the data.
NUM_CLASSES = len(encoder.classes_)

input_layer = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,))

# input_dim must exceed the largest token id; the tokenizer is capped with
# num_words=VOCAB_SIZE, so VOCAB_SIZE is the matching table size.
emb_layer = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM)(input_layer)

# NOTE(review): activation='relu' is kept so the architecture is unchanged, but per
# the Keras LSTM documentation any activation other than the default 'tanh' disables
# the fused cuDNN kernel, and an unbounded activation inside the recurrence is a
# common cause of exploding/NaN losses. Consider the default activation.
# return_sequences=True keeps the per-timestep outputs for the next layer / pooling.
lstm_1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, activation='relu', return_sequences=True))(emb_layer)

lstm_2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, activation='relu', return_sequences=True))(lstm_1)

# Collapse the time axis by taking the max of each feature over all timesteps.
pooling_layer = tf.keras.layers.GlobalMaxPooling1D()(lstm_2)

hidden = tf.keras.layers.BatchNormalization()(pooling_layer)
hidden = tf.keras.layers.Dense(256, activation='relu')(hidden)
hidden = tf.keras.layers.BatchNormalization()(hidden)
hidden = tf.keras.layers.Dense(128, activation='relu')(hidden)

# One softmax unit per class; pairs with the sparse (integer) targets used for training.
output_layer = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
model = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 bidirectional (Bidirectiona  (None, 100, 1024)        2363392   
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 512)         2623488   
 nal)                                                            
                                                                 
 global_max_pooling1d (Globa  (None, 512)              0         
 lMaxPooling1D)                                                  
                                                             

In [31]:
# Sparse categorical cross-entropy takes the integer class ids directly,
# so the targets do not need to be one-hot encoded.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [47]:
# Train on the full dataset for 12 epochs with batches of 1024 samples.
# NOTE(review): no validation_data / validation_split is passed, so `history` records
# training metrics only and overfitting cannot be detected from it. No random seed is
# set anywhere in the visible cells either, so runs are not reproducible.
history = model.fit(x_train_tensor, y_encoded_tensor, epochs=12, batch_size=1024)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [48]:
# Save the trained model to a single .h5 (HDF5) file for later inference.
model.save('smart_news_crafter_model_softtmax_12_epochs.h5')