**Import Libraries**

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import string, nltk
import re
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.metrics import accuracy_score, precision_score,recall_score, classification_report


In [None]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Dense,Input,Embedding,LSTM, Dropout,SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
from bs4 import BeautifulSoup

# Fetch each NLTK resource exactly once ('stopwords' used to be downloaded twice in this cell).
punct = nltk.download('punkt')
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

lemma = WordNetLemmatizer()
stemm = PorterStemmer()

# English stop words only.
stop_words = set(stopwords.words("english"))
# `stop` = stop words plus every ASCII punctuation character; remove_stopwords() filters against it.
punctuation = list(string.punctuation)
stop = stop_words | set(punctuation)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Training split; engine="python" selects pandas' Python CSV parser.
train_csv_path = "/content/news_topic_train.csv"
train_data = pd.read_csv(train_csv_path, engine="python")


In [None]:
# Preview the first rows of the freshly loaded training frame.
train_data.head()

Unnamed: 0,Id,title,text,label
0,1490,Oscars steer clear of controversy,The Oscars nominations list has left out som...,entertainment
1,2001,Charvis set to lose fitness bid,Flanker Colin Charvis is unlikely to play an...,sport
2,1572,Corbett attacks 'dumbed-down TV',Ronnie Corbett has joined fellow comedy star...,entertainment
3,1840,Sociedad set to rescue Mladenovic,Rangers are set to loan out-of-favour midfie...,sport
4,610,Doors open at biggest gadget fair,Thousands of technology lovers and industry ...,tech


In [None]:
# Column dtypes and non-null counts for the raw training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1780 entries, 0 to 1779
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Id      1780 non-null   int64 
 1   title   1780 non-null   object
 2   text    1780 non-null   object
 3   label   1780 non-null   object
dtypes: int64(1), object(3)
memory usage: 55.8+ KB


In [None]:
# Number of missing values in each column.
train_data.isna().sum()

Id       0
title    0
text     0
label    0
dtype: int64

In [None]:
# Drop the row identifier; it is not used as a model input.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# the in-place version raised KeyError on a second execution because 'Id' was already gone.
train_data = train_data.drop(columns=['Id'], errors='ignore')


In [None]:
# Append the headline to the article body, then discard the separate title column.
combined_text = train_data['text'] + " " + train_data['title']
train_data = train_data.drop(columns=['title']).assign(text=combined_text)

**Text preprocessing**

In [None]:
# Rebuild the filter set used by remove_stopwords(): English stop words plus ASCII punctuation.
nltk.download('stopwords')
punctuation = list(string.punctuation)
stop = {*stopwords.words('english'), *punctuation}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Strip HTML markup, keeping only the text content
def strip_html(text):
    """Return the text content of ``text`` as extracted by BeautifulSoup's ``html.parser``."""
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
#Removing the square brackets and urls
def remove_between_square_brackets(text):
    """Delete ``[...]`` spans and ``http...`` URLs from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with every bracketed span (brackets included) and every
        run of non-whitespace characters starting with ``http`` removed.
        Surrounding whitespace is left untouched.
    """
    # Raw string: the previous non-raw pattern relied on the invalid escape
    # sequences "\[" and "\]", which newer Python versions warn about.
    text = re.sub(r'\[[^]]*\]', '', text)
    return re.sub(r'http\S+', '', text)

In [None]:
#Drop stop words and standalone punctuation tokens
def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens whose lowercase form is in ``stop``.

    Surviving tokens keep their original casing and are re-joined with single spaces.
    """
    kept_tokens = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in stop
    ]
    return " ".join(kept_tokens)

In [None]:
#Full cleaning pipeline for a single document
def denoise_text(text):
    """Apply, in order, HTML stripping, bracket/URL removal and stop-word removal to ``text``."""
    for cleaning_step in (strip_html, remove_between_square_brackets, remove_stopwords):
        text = cleaning_step(text)
    return text

In [None]:
#Clean every document in the text column
train_data['text'] = train_data['text'].map(denoise_text)

- Create a dictionary that maps each **`label`** name to an integer id, and the inverse dictionary that maps each id back to its label name.

In [None]:
# Label name -> integer class id.
category_to_id = {
    'business': 0,
    'tech': 1,
    'politics': 2,
    'sport': 3,
    'entertainment': 4,
}
# Inverse lookup, derived so the two mappings cannot drift apart.
id_to_category = {class_id: name for name, class_id in category_to_id.items()}

In [None]:
# Inspect the cleaned frame (text with title merged in, plus the string label).
train_data

Unnamed: 0,text,label
0,Oscars nominations list left controversial fil...,entertainment
1,Flanker Colin Charvis unlikely play part Wales...,sport
2,Ronnie Corbett joined fellow comedy stars Vict...,entertainment
3,Rangers set loan out-of-favour midfielder Drag...,sport
4,Thousands technology lovers industry experts g...,tech
...,...,...
1775,Charlie Simpson took new band Fightstar stage ...,entertainment
1776,India's defence minister opened country's Aero...,business
1777,Fiat General Motors (GM) midnight 1 February s...,business
1778,Shares Google fallen 6.7% employees early inve...,business


In [None]:
# Encode only the label column. A frame-wide replace() would also rewrite any
# cell of the text column that happens to equal a category name.
# astype('int64') pins the dtype explicitly instead of relying on replace()'s
# dtype inference, and raises if a label is missing from category_to_id.
# Re-running the cell is a no-op because already-encoded integers match no key.
train_data = train_data.assign(
    label=train_data['label'].replace(category_to_id).astype('int64')
)
train_data.head()

Unnamed: 0,text,label
0,Oscars nominations list left controversial fil...,4
1,Flanker Colin Charvis unlikely play part Wales...,3
2,Ronnie Corbett joined fellow comedy stars Vict...,4
3,Rangers set loan out-of-favour midfielder Drag...,3
4,Thousands technology lovers industry experts g...,1


In [None]:
# Re-check column dtypes after label encoding.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1780 entries, 0 to 1779
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1780 non-null   object
 1   label   1780 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.9+ KB


In [None]:
# Vocabulary cap: num_words=5000 limits which word indices are kept when texts are converted to sequences.
tokenizer = Tokenizer(num_words=5000)
# build the word -> integer index from the training texts
tokenizer.fit_on_texts(train_data['text'])
# apply the mapping: every document becomes a list of word indices
vector = tokenizer.texts_to_sequences(train_data['text'])

In [None]:
# Vocabulary size passed to test_model's Embedding layer.
# NOTE(review): the Tokenizer is built with num_words=5000, so this is larger than needed — confirm intent.
max_features = 10000
# Used as test_model's embedding width AND as the padded sequence length (pad_sequences maxlen).
embed_dim = 100
# Disabled hold-out size; the fit cell relies on validation_split instead.
#train_set = 1500

In [None]:
# Inspect the frame after label encoding (label is now an integer class id).
train_data

Unnamed: 0,text,label
0,Oscars nominations list left controversial fil...,4
1,Flanker Colin Charvis unlikely play part Wales...,3
2,Ronnie Corbett joined fellow comedy stars Vict...,4
3,Rangers set loan out-of-favour midfielder Drag...,3
4,Thousands technology lovers industry experts g...,1
...,...,...
1775,Charlie Simpson took new band Fightstar stage ...,4
1776,India's defence minister opened country's Aero...,0
1777,Fiat General Motors (GM) midnight 1 February s...,0
1778,Shares Google fallen 6.7% employees early inve...,0


In [256]:
# Feature text and integer labels as separate Series.
x, y = train_data["text"], train_data["label"]

In [None]:
# Positional hold-out split of the tokenised documents.
# The previous version could not run: the assignments were indented at top level
# (IndentationError) and `train_set` was only defined in a commented-out line.
# Note: the fit cell trains on X_train_pad / y_train with validation_split, so
# x_train / x_test are not consumed there.
train_set = 1500  # number of leading rows kept for training

# Documents differ in token count, so the array must be built with dtype=object;
# recent NumPy versions refuse to build a ragged array implicitly.
sequences = np.array(vector, dtype=object)
x_train = sequences[:train_set]
x_test = sequences[train_set:]


In [257]:
# Display the integer training labels.
y


0       4
1       3
2       4
3       3
4       1
       ..
1775    4
1776    0
1777    0
1778    0
1779    0
Name: label, Length: 1780, dtype: int64

In [252]:
# Force every tokenised document to the same length so they stack into one 2-D array.
# NOTE(review): maxlen reuses embed_dim (100) as the sequence length — confirm this coupling is intended.
X_train_pad = pad_sequences(vector, maxlen=embed_dim)


**One-hot encode the labels (y)**

In [258]:
# One-hot encode straight from the label column rather than from `y`:
# `y` is rebound to the predicted class ids further down, so re-running this
# cell afterwards would silently encode predictions instead of training labels.
# num_classes is fixed by the label mapping so the width never depends on
# which classes happen to be present.
# to_categorical is imported with the other tensorflow.keras imports at the top.
y_train = to_categorical(
    train_data["label"],
    num_classes=len(category_to_id),
    dtype="uint8",
)


In [270]:
# Inspect the one-hot encoded label matrix.
y_train

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]], dtype=uint8)

In [None]:
# Sanity check: decode a few word indices back into their words.
tokenizer.sequences_to_texts([[ 512,  154,   56, 1434,   14]])

['comes software good serve two']

In [None]:
# Baseline recurrent classifier: embedding -> SimpleRNN -> 5-way softmax.
model_rnn = Sequential([
    Embedding(10000, 32, input_length=embed_dim),
    SimpleRNN(64),
    Dense(5, activation='softmax'),
])
model_rnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training schedule consumed by the fit cell: number of epochs and batch size.
EPOCHS, BATCHES = 50, 30

In [None]:
# LSTM classifier: embedding -> LSTM(60, relu) -> dropout -> 5-way softmax.
test_model = Sequential([
    Embedding(max_features, embed_dim),
    LSTM(units=60, activation='relu', return_sequences=False),
    Dropout(0.2),
    Dense(5, activation='softmax'),
])
test_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Halve the learning rate (down to min_lr) when validation accuracy has not
# improved for `patience` epochs. ReduceLROnPlateau is imported with the other
# tensorflow.keras imports at the top of the notebook; it was previously never
# imported, so this cell raised NameError.
# NOTE: the callback only has an effect when passed to
# fit(..., callbacks=[learning_rate_reduction]); the fit cell does not do so.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=2,
    min_lr=0.00001,
    verbose=1,
)

In [260]:
# Train the LSTM classifier; the last 20% of the rows are held out for validation.
his1 = test_model.fit(
    X_train_pad,
    y_train,
    batch_size=BATCHES,
    epochs=EPOCHS,
    validation_split=0.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [261]:
# Unlabelled test split to predict on.
test_csv_path = "/content/news_topic_test.csv"
news_test = pd.read_csv(test_csv_path)

In [262]:
# Same preparation as the training frame: append the headline to the body, drop the title column.
test_combined_text = news_test['text'] + " " + news_test['title']
news_test = news_test.drop(columns=['title']).assign(text=test_combined_text)


In [263]:
# Clean the test documents with the same pipeline used for training.
news_test['text'] = news_test['text'].map(denoise_text)

In [264]:
# Reuse the tokenizer fitted on the training texts, then pad exactly as X_train_pad was padded.
tokenized_test = tokenizer.texts_to_sequences(news_test['text'])
test = pad_sequences(tokenized_test, maxlen=embed_dim)

In [267]:
# Softmax class probabilities for every test document (one row per document, one column per class).
pred_y=test_model.predict(test)



In [268]:
# Inspect the raw predicted probabilities.
pred_y

array([[2.31164298e-03, 5.04124016e-02, 9.20701027e-01, 2.17509195e-02,
        4.82399203e-03],
       [1.41248347e-06, 9.99914467e-01, 3.65354463e-05, 3.11776712e-05,
        1.62624765e-05],
       [1.56482533e-02, 9.75613892e-02, 7.03051835e-02, 1.80249259e-01,
        6.36235952e-01],
       ...,
       [5.80752552e-01, 3.38563509e-02, 1.10383846e-01, 6.68935701e-02,
        2.08113655e-01],
       [4.10604440e-02, 5.89646399e-01, 1.60212114e-01, 1.35658905e-01,
        7.34221116e-02],
       [1.79414892e-05, 9.99768794e-01, 1.51001450e-06, 1.63248318e-04,
        4.83429794e-05]], dtype=float32)

In [None]:
# Integer class id -> label name (position in the tuple is the class id).
id_to_category = dict(enumerate(('business', 'tech', 'politics', 'sport', 'entertainment')))

In [273]:
# One-hot matrix marking the most probable class of each row.
# Rounding the probabilities (the previous approach) produced an all-zero row
# whenever no class reached 0.5, and argmax of such a row silently returns
# class 0. Selecting the argmax first always marks exactly one class per row.
final_pred = np.eye(pred_y.shape[1], dtype=int)[np.argmax(pred_y, axis=-1)]
final_pred

array([[0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])

In [274]:
# Predicted class id per test document, taken directly from the probabilities.
# Taking argmax of rounded predictions is wrong for low-confidence rows: a row
# with no probability above 0.5 rounds to all zeros and would be reported as class 0.
# NOTE: this rebinds `y`, which earlier held the training labels.
y = np.argmax(pred_y, axis=-1)


In [275]:
# Inspect the predicted class ids.
y

array([2, 1, 4, 2, 0, 4, 3, 0, 1, 0, 2, 1, 4, 4, 1, 1, 3, 1, 2, 1, 2, 3,
       2, 1, 4, 0, 4, 3, 2, 4, 4, 0, 3, 1, 2, 1, 2, 3, 3, 0, 0, 0, 2, 4,
       3, 2, 3, 1, 2, 3, 1, 1, 0, 2, 1, 0, 0, 4, 2, 2, 4, 3, 1, 2, 4, 1,
       2, 2, 0, 4, 1, 0, 3, 1, 2, 3, 0, 0, 2, 3, 3, 0, 3, 1, 1, 3, 3, 0,
       1, 4, 4, 2, 0, 2, 4, 2, 1, 0, 2, 0, 0, 0, 4, 2, 4, 0, 4, 3, 1, 0,
       0, 3, 1, 2, 0, 4, 1, 1, 1, 0, 1, 0, 3, 0, 3, 0, 3, 1, 2, 1, 4, 1,
       0, 4, 4, 2, 2, 2, 1, 3, 3, 2, 0, 0, 3, 0, 0, 4, 2, 0, 4, 1, 2, 0,
       2, 4, 2, 2, 2, 1, 3, 1, 4, 0, 3, 1, 2, 2, 3, 1, 0, 3, 3, 1, 0, 3,
       0, 0, 0, 4, 0, 3, 2, 0, 3, 0, 3, 4, 3, 2, 0, 4, 2, 0, 0, 1, 3, 0,
       0, 3, 0, 0, 2, 3, 2, 0, 2, 4, 4, 2, 0, 0, 0, 0, 0, 2, 3, 3, 0, 3,
       2, 2, 2, 0, 2, 0, 0, 0, 0, 1, 4, 4, 1, 0, 3, 3, 1, 4, 3, 2, 3, 2,
       0, 4, 1, 3, 2, 2, 3, 2, 0, 4, 0, 2, 1, 3, 1, 3, 4, 0, 0, 0, 2, 1,
       4, 1, 0, 4, 1, 1, 1, 2, 2, 1, 2, 4, 1, 0, 1, 0, 2, 2, 2, 2, 1, 3,
       0, 1, 0, 3, 2, 3, 2, 0, 0, 3, 2, 2, 1, 0, 1,

In [None]:
# Integer class id -> label name, used to decode the predictions for the submission file.
_category_names = ['business', 'tech', 'politics', 'sport', 'entertainment']
id_to_category = dict(zip(range(len(_category_names)), _category_names))

In [278]:
# Single-column frame holding the predicted class ids.
df = pd.DataFrame({"label": y})


In [279]:
# Inspect the predicted-label frame.
df

Unnamed: 0,label
0,2
1,1
2,4
3,2
4,0
...,...
440,2
441,2
442,0
443,1


In [280]:
# Pair each test document Id with its predicted class id (rows are matched on the index).
submission = pd.concat([news_test['Id'], df['label']], axis=1)


In [281]:
# Decode class ids back to label names in the label column ONLY.
# A frame-wide replace() would also rewrite any document whose Id is 0-4
# into a category name, corrupting the Id column of the submission.
submission = submission.assign(
    label=submission['label'].replace(id_to_category)
)
# Written without header or index: each line is "<Id>,<label name>".
submission.to_csv('Submission.csv',header=False,index=False)
