In [24]:
import pandas as pd
import numpy as np
import csv
import string 
import bs4
from nltk.tokenize import sent_tokenize
import tensorflow as tf

In [25]:
df=pd.read_csv('news.csv', encoding='latin1')
df.head()

Unnamed: 0,News,Category
0,The IMF now expects China's economy to grow by...,Economy
1,Manufacturing activity in the Eurozone has dec...,Economy
2,Continued disruptions in the global supply cha...,Economy
3,Concerns about food security remain high due t...,Economy
4,"While some central banks, like the US Federal ...",Economy


In [26]:
df.size

504

In [27]:
df.Category.unique()

array(['Economy', 'Health', 'Sports', 'Entertainment', 'Politics',
       'International relations', 'Food', 'Artificial Intelligence '],
      dtype=object)

In [28]:
df.isnull().sum()

News        0
Category    0
dtype: int64

In [29]:
def text_preprocess(q):
    
    q = str(q).lower().strip()

    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')    
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]
        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

In [30]:
for i in df['News']:
    translator = str.maketrans('', '', string.punctuation)
    clean_sentence = i.translate(translator)
    df=df.replace(i,clean_sentence)

In [31]:
unique_categories = df['Category'].unique()
unique_categories

array(['Economy', 'Health', 'Sports', 'Entertainment', 'Politics',
       'International relations', 'Food', 'Artificial Intelligence '],
      dtype=object)

In [32]:
#column values to numeral:
replace_dict = {'Economy': 0, 'Health': 1, 'Sports': 2, 'Entertainment': 3,
               'Politics': 4, 'International relations':5, 'Food': 6,
               'Artificial Intelligence ':7}
df['Category'].replace(replace_dict, inplace=True)

In [33]:
df['News length'] = df['News'].apply(lambda x: len(x.split()))

In [34]:
df.head()

Unnamed: 0,News,Category,News length
0,The IMF now expects Chinas economy to grow by ...,0,29
1,Manufacturing activity in the Eurozone has dec...,0,22
2,Continued disruptions in the global supply cha...,0,24
3,Concerns about food security remain high due t...,0,25
4,While some central banks like the US Federal R...,0,28


In [35]:
df['News length'].nlargest(n=5)

65     121
136     96
68      86
96      86
42      82
Name: News length, dtype: int64

In [36]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences=df['News']
categories=df['Category']

vocab_size = 10000
embedding_dim = 16
max_length = 70

tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded=pad_sequences(sequences, padding='post')

print(padded[0])

[   2 1127  409 1128  294   95    6  295   20 1129   24 1130   81  624
 1131 1132  294 1133  296   32   37 1134  625    3  221  148 1135    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0]


In [42]:
################################################################
training_size = int(0.18 * len(categories))
training_sentences=sentences[:training_size]
testing_sentences=sentences[training_size:]

training_labels=categories[:training_size]
testing_labels=categories[training_size:]

In [43]:
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(training_sentences)
word_index=tokenizer.word_index

training_sequences=tokenizer.texts_to_sequences(training_sentences)
training_padded=pad_sequences(training_sequences, maxlen=max_length, padding='post', truncating='post')

testing_sequences=tokenizer.texts_to_sequences(testing_sentences)
testing_padded=pad_sequences(testing_sequences, max_length, padding='post', truncating='post')

In [44]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [None]:
history=model.fit(training_padded, training_labels, epochs=800, validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/800
2/2 - 1s - loss: 0.6932 - accuracy: 0.2667 - val_loss: 0.6536 - val_accuracy: 0.0628 - 550ms/epoch - 275ms/step
Epoch 2/800
2/2 - 0s - loss: 0.6891 - accuracy: 0.3111 - val_loss: 0.6387 - val_accuracy: 0.0628 - 34ms/epoch - 17ms/step
Epoch 3/800
2/2 - 0s - loss: 0.6849 - accuracy: 0.3111 - val_loss: 0.6232 - val_accuracy: 0.0628 - 39ms/epoch - 20ms/step
Epoch 4/800
2/2 - 0s - loss: 0.6806 - accuracy: 0.3111 - val_loss: 0.6023 - val_accuracy: 0.0628 - 36ms/epoch - 18ms/step
Epoch 5/800
2/2 - 0s - loss: 0.6755 - accuracy: 0.3111 - val_loss: 0.5788 - val_accuracy: 0.0628 - 37ms/epoch - 18ms/step
Epoch 6/800
2/2 - 0s - loss: 0.6704 - accuracy: 0.3111 - val_loss: 0.5517 - val_accuracy: 0.0628 - 43ms/epoch - 21ms/step
Epoch 7/800
2/2 - 0s - loss: 0.6648 - accuracy: 0.3111 - val_loss: 0.5204 - val_accuracy: 0.0628 - 37ms/epoch - 19ms/step
Epoch 8/800
2/2 - 0s - loss: 0.6590 - accuracy: 0.3111 - val_loss: 0.4880 - val_accuracy: 0.0628 - 37ms/epoch - 19ms/step
Epoch 9/800
2/2 - 0s -

In [41]:
random_sport_article="Afcon runs from 13 January to 11 February in Ivory Coast and the Asian Cup is in Qatar from 12 January to 10 February. Although the tournaments coincide with winter breaks across some European leagues, Premier League clubs will face a number of weeks without first-team regulars. Liverpool boss Jurgen Klopp, who will lose top scorer and Egypt captain Mohamed Salah, said it is something clubs have to plan for. We have [Wataru] Endo as a participant of the Asian Cup as well, so it is like it is. We have to deal with it and we will."
new_seq=tokenizer.texts_to_sequences(random_sport_article)
new_padded=pad_sequences(new_seq, max_length, padding='post')
predictions = model.predict(new_padded)



[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 