In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import tensorflow
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [2]:
df=pd.read_csv('train.csv')

In [3]:
df.head()


Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [4]:
df.shape

(120000, 3)

In [5]:
df.isnull().sum()

Class Index    0
Title          0
Description    0
dtype: int64

In [6]:
df['Class Index'].value_counts()

Class Index
3    30000
4    30000
2    30000
1    30000
Name: count, dtype: int64

In [7]:
df['text'] = df['Title'].astype(str) + " " + df['Description'].astype(str)


In [8]:
import re

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [10]:
import string
df['clean_text'] = df['text'].apply(clean_text)

In [11]:
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['Class Index'])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], 
    df['label'], 
    test_size=0.2, 
    random_state=42,
    stratify=df['label']
)

In [13]:
vectorizer = CountVectorizer(
    max_features=5000,       
    ngram_range=(1,2),
    stop_words='english'
)

In [14]:
x_train_cv = vectorizer.fit_transform(X_train)
x_test_cv = vectorizer.transform(X_test)

In [15]:
x_train_cv.shape

(96000, 5000)

In [16]:
y_train.shape

(96000,)

In [17]:
x_test_cv.shape

(24000, 5000)

In [18]:
y_test.shape

(24000,)

In [19]:
num_classes = len(encoder.classes_)
num_classes

4

In [20]:

y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

In [21]:
from tensorflow.keras.models import Sequential
model = Sequential([
    Dense(512, activation='relu', input_dim=x_train_cv.shape[1]),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [23]:
x_train_cv = x_train_cv.toarray().astype('float32')
x_test_cv = x_test_cv.toarray().astype('float32')

In [27]:



model.fit(
    x_train_cv, 
    y_train_cat, 
    epochs=10, 
    batch_size=64, 
    validation_split=0.2
)


Epoch 1/10
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 48ms/step - accuracy: 0.8846 - loss: 0.3458 - val_accuracy: 0.9080 - val_loss: 0.2817
Epoch 2/10
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 48ms/step - accuracy: 0.9283 - loss: 0.2061 - val_accuracy: 0.9079 - val_loss: 0.2912
Epoch 3/10
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 49ms/step - accuracy: 0.9554 - loss: 0.1254 - val_accuracy: 0.9044 - val_loss: 0.3333
Epoch 4/10
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 48ms/step - accuracy: 0.9766 - loss: 0.0687 - val_accuracy: 0.9051 - val_loss: 0.4184
Epoch 5/10
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 50ms/step - accuracy: 0.9871 - loss: 0.0391 - val_accuracy: 0.9031 - val_loss: 0.5113
Epoch 6/10
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 48ms/step - accuracy: 0.9903 - loss: 0.0291 - val_accuracy: 0.9012 - val_loss: 0.5723
Epoc

<keras.src.callbacks.history.History at 0x1f968ee12b0>

In [28]:
 model.evaluate(x_test_cv, y_test_cat)

[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.9056 - loss: 0.6935


[0.6934736371040344, 0.9055833220481873]

In [29]:

y_pred = model.predict(x_test_cv)
y_pred_classes = y_pred.argmax(axis=1)  


print("\nClassification Report:\n", 
      classification_report(y_test, y_pred_classes, 
                            target_names=[str(cls) for cls in encoder.classes_]))


[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step

Classification Report:
               precision    recall  f1-score   support

           1       0.91      0.90      0.91      6000
           2       0.96      0.95      0.96      6000
           3       0.87      0.88      0.88      6000
           4       0.89      0.88      0.88      6000

    accuracy                           0.91     24000
   macro avg       0.91      0.91      0.91     24000
weighted avg       0.91      0.91      0.91     24000

