In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/intelligence-sig-NLP-Task/sample_submission.csv
/kaggle/input/intelligence-sig-NLP-Task/news_train.csv
/kaggle/input/intelligence-sig-NLP-Task/test.csv
/kaggle/input/glove6b100dtxt/glove.6B.100d.txt
/kaggle/input/glove6b50dtxt/glove.6B.50d.txt


In [2]:
from sklearn.model_selection import train_test_split
data = pd.read_csv("/kaggle/input/intelligence-sig-NLP-Task/news_train.csv")
x = data.drop('Category',axis=1)
y = data['Category']

# Splitting data into training and testing pools with 80 20 split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) 

## **First Approach**
Using Bag of Words and Naive Bayes determine the category. This helps us set a baseline scores for all metrics.


#### Why I decided to use Bag of Words?
* Bag of words is useful when the text pool is really small, here the the fields consisted of mainly short sentences which were really decriptive and concise.
* It is really simple to implement and works well with Naive Bayes.
* Since use of pre trained models were not allowed, the inofrmation the model learn only depended on the information available in the tarining data.

#### Why i decided to include stop words.
* The only category which was concerning was the humour category which includes "keywords"  which if stopwords were ommited could consist of embedding mainly composed of these "keywords " which were prevelant in other categories and confise the model. In certain examples I observed that the data in this category were in terms of questions, so I decided to keep in stopwords, in hopes that it will be able to capture this.

#### How Bag of Words works
For each category there exists a dictionary such that each dictionry consists of frequencies of words that occur in that category independent of all the occurances of said word in other categories. To start of tokenisation, I have joined the head and the title into one sentence and used that to iterate over every word in in that sentence and added and/or updated the frequency of the occured word. Also removed any special characters that occur for simplification.

Now there is the use of lemmitisation or stemming that can be used here for further improving results during tokensation, since this is a benchmark trial I made decision to skip this and include it further improvements.

In [3]:
print(data)

          ID                                         News_title  \
0          1  Do men enjoy sex more, or women? The Mahabhara...   
1          2       Why you should eat the Demonetisation laddoo   
2          3            Is the world headed for a new Cold War?   
3          4  Demonetisation is all about Modi, either you'r...   
4          5  Why electoral bonds won't clean up political f...   
...      ...                                                ...   
15571  15572  [Watch] Hansal Mehta beautifully captures Rohi...   
15572  15573  How Pakistan weakened Taliban by revealing Mul...   
15573  15574          2G 'scam': What the verdict means for DMK   
15574  15575  Kargil to Pathankot: Why is India never prepar...   
15575  15576      On Sadhvis and sex workers: Listen up, trolls   

                                           News_headline  Category  
0      [Book Extract] From Anushasana Parva, translat...      Arts  
1      One laddoo equals to one lakh in your Jan Dhan... 

In [4]:
import re

# Dictionary to store bag of words for each category
bag_of_words = {
    'tech': {},
    'politics': {},
    'Arts': {},
    'humour': {},
    'sports': {},
    'business': {}
}
pattern = re.compile(r'[^A-Za-z0-9\s]')
for r in range(y_train.shape[0]):
    category = y_train.iloc[r]
    title = x_train.iloc[r, 1]
    head = x_train.iloc[r, 2]
    words = f"{head} {title}" # Combining both the title and the head into one string
    for word in words.split():
        key = pattern.sub('', word).lower() # Remove special characters and convert to lowercase
        bag_of_words[category][key] = bag_of_words[category].get(key, 0) + 1

# print(bag_of_words_sports)



#### Naive Bayes
Now that we have the bag of words for every class we can calculate the probability of class using Bayes Rules,

<center> $P(Class \mid Words) = \frac{P(Words \mid Class) \cdot P(Class)}{P(Words)}$ <center>



We omit $P(Words)$ as the value which maximises $\frac{P(Words \mid Class) \cdot P(Class)}{P(Words)}$ also maximises, $P(Words \mid Class) \cdot P(Class)$

We assume the words occur independent of each other therefore $P(Words \mid Class) = \prod_{i} P(Word_i \mid Class)$ 

Product of probabilities can get very small so we add the logs of the proabilities, 
<center> $\log P(Words \mid Class) = \sum_{i} \log P(Word_i \mid Class)$ <center>

Probability of class is given by,

$P(C) = \frac{Number\space of\space training\space entires\space for\space a\space given\space class}{Total\space number\space of\space training\space entries}$

In [5]:
counts = y_train.value_counts() # Library of class occurances in training entry
total_entries = y_train.shape[0] # Total number of entires

print(counts)
print("Total entries = ",total_entries)

Category
politics    9225
sports       733
humour       731
Arts         672
tech         574
business     525
Name: count, dtype: int64
Total entries =  12460


In [6]:
y_pred = []
pattern = re.compile(r'[^A-Za-z0-9\s]')
preds = { 
    'tech': 0,
    'politics': 0,
    'Arts': 0,
    'humour': 0,
    'sports': 0,
    'business': 0
}

for r in range(y_test.shape[0]):
    correct_category = y_test.iloc[r]
    title = x_test.iloc[r, 1]
    head = x_test.iloc[r, 2]
    words = f"{head} {title}"
    for category in bag_of_words:
        p_c = counts[category] / total_entries
        log_sum = np.log(p_c)
        bag = bag_of_words[category]
        for word in words.split():
            key = pattern.sub('', word).lower()
            if key in bag:
                n_word = bag[key]
                n_total = sum(bag.values())
                p_word = n_word / n_total
            else:
                p_word = 1e-6  # Assigning a very small value for unknown words since we take log

            log_sum += np.log(p_word)

        preds[category] = log_sum
    max_key = max(preds, key=preds.get)
    y_pred.append(max_key)

# print(y_pred)

In [8]:
from sklearn.metrics import f1_score

f1_score = f1_score(y_test, y_pred, average = 'weighted')
print(f'F1 score: {f1_score}')

F1 score: 0.8448133709909017


In [9]:
test = pd.read_csv("/kaggle/input/intelligence-sig-NLP-Task/news_train.csv")

In [11]:
y_pred = []
pattern = re.compile(r'[^A-Za-z0-9\s]')
preds = { 
    'tech': 0,
    'politics': 0,
    'Arts': 0,
    'humour': 0,
    'sports': 0,
    'business': 0
}

for r in range(test.shape[0]):
    title = test.iloc[r, 1]
    head = test.iloc[r, 2]
    words = f"{head} {title}"
    for category in bag_of_words:
        p_c = counts[category] / total_entries
        log_sum = np.log(p_c)
        bag = bag_of_words[category]
        for word in words.split():
            key = pattern.sub('', word).lower()
            if key in bag:
                n_word = bag[key]
                n_total = sum(bag.values())
                p_word = n_word / n_total
            else:
                p_word = 1e-6  # Assigning a small value for unknown words since we take log

            log_sum += np.log(p_word)

        preds[category] = log_sum
    max_key = max(preds, key=preds.get)
    y_pred.append(max_key)

# print(y_pred)

In [None]:
# Making the submission
idx = test.iloc[:,0]
mapping = {
    "Arts" : 0,
    "business" : 1,
    "humour" : 2,
    "politics" : 3,
    "sports" : 4,
    "tech" : 5
}
y_pred = [mapping.get(category, category) for category in y_pred]
out = pd.DataFrame({'ID':idx, 'Category': y_pred})
out.to_csv('pred1.csv', index=False)

#### ****Submission gave an F1 Score of 0.84574 which is going to be the baseline to beat****

#### Pros
1. Model is efficient and fast.
2. Ignore irrelevant features.
3. Handles proper nouns or class specific words well.

#### Cons
1. Does not have any kind of contextual understanding or attention applied.

#### Things I can use to improve my score,
1. Stemming or Lemmitization.
2. Avoiding stop words.
3. Better classifying techniques.
4. Handling context better.

#### Concerns
My main concern is handling proper nouns, like for exmaple,

*	Sentence 1, “Cristiano Ronaldo is making headlines for his recent performance.”
*	Sentence 2, “Sensex is making headlines for its recent performance.”

If we choose to replace all proper nouns with a masking vector, both the sentences will be meaning the same, and the context only chnges when the sentences have the specifc names in them.
Sentence 1, can be categorised under sports, as the main subject is Cristiano Ronaldo and Sentence 2, can be categorised under business, the main subject being Sensex

## Second Approach
#### Moving from Bag of Wrods to TF-IDF
Using TF-IDF (Term Frequency Inverse Document Frequency), this method is an upgrade in most cases as TF-IDF adjusts for word importance by considering how often a word appears across different categories. It emphasizes on words that are unique to a particular category, making it more suitable for capturing the features of each category.

#### Moving from Naive Bayes to Logistic Regression
Naive Bayes assumes independence of occurance of terms pertaining to features unique to each category, Logistic Regression directly predcits the probability of categories using a logistic function, which may provide better probability estimates.

In [12]:
from sklearn.model_selection import train_test_split
data = pd.read_csv("/kaggle/input/intelligence-sig-NLP-Task/news_train.csv")
x = data.drop('Category',axis=1)
y = data['Category']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#### **Formula for traditional TF-IDF**

$ TF(t, d) = \frac{\text{Number of times term } t \text{ appears in document } d}{\text{Total number of terms in document } d} $

$ IDF(t) = \log \left(\frac{\text{Total number of documents}}{\text{Number of documents containing term } t} \right) $

The final TF-IDF score for a term in a document is the product of TF and IDF:

$ TF\text{-}IDF(t, d) = TF(t, d) \times IDF(t) $

#### Vectorization in TF-IDF
Once the TF-IDF is obtained score for every word, the sentence can be represented as a vector according to the TF-IDF score of words in it. Each sentence becomes a vector in a multi-dimensional space.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

tfidf = TfidfVectorizer(max_features=50000, stop_words='english')

# print(x_train)
x_train['News_title'] = x_train['News_title'].fillna(' ')
x_train['News_headline'] = x_train['News_headline'].fillna(' ')
x_test['News_title'] = x_test['News_title'].fillna(' ')
x_test['News_headline'] = x_test['News_headline'].fillna(' ')
x_train_tfidf = tfidf.fit_transform(x_train["News_title"]+" "+x_train["News_headline"])
x_test_tfidf = tfidf.transform(x_test["News_title"]+" "+x_test["News_headline"])

# Using Logistic Regression
model = LogisticRegression(max_iter=1000)

# print(x_train_tfidf)
model.fit(x_train_tfidf, y_train)

y_pred = model.predict(x_test_tfidf)

f1_score = f1_score(y_test, y_pred, average = 'weighted')
print(f'F1 score: {f1_Score}')

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer
import nltk

nltk.download('punkt') 
stemmer = PorterStemmer() # Initializing stemmer for stemming

def stem_text(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

x_train['News_title'] = x_train['News_title'].fillna(' ')
x_train['News_headline'] = x_train['News_headline'].fillna(' ')
x_test['News_title'] = x_test['News_title'].fillna(' ')
x_test['News_headline'] = x_test['News_headline'].fillna(' ')

x_train['Processed_Text'] = x_train["News_title"] + " " + x_train["News_headline"]
x_train['Processed_Text'] = x_train['Processed_Text'].apply(stem_text)

x_test['Processed_Text'] = x_test["News_title"] + " " + x_test["News_headline"]
x_test['Processed_Text'] = x_test['Processed_Text'].apply(stem_text)

tfidf = TfidfVectorizer(max_features=50000, stop_words='english')

x_train_tfidf = tfidf.fit_transform(x_train['Processed_Text'])
x_test_tfidf = tfidf.transform(x_test['Processed_Text'])

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train_tfidf, y_train)

y_pred = model.predict(x_test_tfidf)

print(classification_report(y_test, y_pred))

### Swicthing from Logistic Regression to SVMs

Usually SVMs are used for Binary classification, sometimes they work well for mutliclass problems as well. SVMS are higly customziable depending on the kernel, regularisation, class imbalance and other tunable parameters that it offers.

In [None]:
from sklearn.svm import SVC

svm_model = SVC()

svm_model.fit(x_train_tfidf, y_train)

y_pred_svm = svm_model.predict(x_test_tfidf)
print(classification_report(y_test, y_pred_svm))

## Experimenting with pre trained tokenizers and vectorization.

In [None]:
pip install keras

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalMaxPool1D
from tensorflow.keras.utils import to_categorical

nltk.download('punkt')

df = pd.read_csv('/kaggle/input/intelligence-sig-NLP-Task/news_train.csv')
df['News_title'] = df['News_title'].fillna(' ')
df['News_headline'] = df['News_headline'].fillna(' ')

df['Text'] = df['News_title'] + " " + df['News_headline']

le = LabelEncoder()
df['target'] = le.fit_transform(df['Category'])
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['target'], test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=50000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=100, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=100, padding='post')
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

In [None]:
embedding_index = {}
glove_path = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'

with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100  # GloVe 100d
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, index in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
from keras.layers import Bidirectional

model = Sequential()
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim, 
                    weights=[embedding_matrix], 
                    input_length=100, 
                    trainable=False))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_pad, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test_cat))

In [None]:
loss, accuracy = model.evaluate(X_test_pad, y_test_cat)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

y_pred = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)

y_test_labels = le.inverse_transform(np.argmax(y_test_cat, axis=1))
y_pred_labels = le.inverse_transform(y_pred_classes)

from sklearn.metrics import classification_report
print(classification_report(y_test_labels, y_pred_labels))

In [None]:
df_test = pd.read_csv("/kaggle/input/intelligence-sig-NLP-Task/test.csv")

In [None]:
df_test['News_title'] = df['News_title'].fillna(' ')
df_test['News_headline'] = df['News_headline'].fillna(' ')

df_test['Text'] = df_test['News_title'] + " " + df_test['News_headline']

tokenizer = Tokenizer(num_words=50000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
test_seq = tokenizer.texts_to_sequences(df_test['Text'])

test_pad = pad_sequences(test_seq, maxlen=100, padding='post')

y_pred = model.predict(test_pad)
print(f'Test Accuracy: {accuracy * 100:.2f}%')
y_pred_classes = np.argmax(y_pred, axis=1)

y_pred_labels = le.inverse_transform(y_pred_classes)

In [None]:
print(f"Length of df_test: {len(df_test)}")
print(f"Length of test_seq: {len(test_pad)}")
print(f"Length of y_pred_labels: {len(y_pred_labels)}")
idx = df_test['ID']

mapping = {
    "Arts": 0,
    "business": 1,
    "humour": 2,
    "politics": 3,
    "sports": 4,
    "tech": 5
}

print(f"Number of predictions: {len(y_pred_labels)}")

y_pred_mapped = [mapping.get(category, category) for category in y_pred_labels]

out = pd.DataFrame({'ID': idx, 'Category': y_pred_mapped})

out.to_csv('pred2.1.csv', index=False)