In [None]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re
import keras
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import spacy
nlp = spacy.load("en_core_web_lg")

from google.colab import files


**1. Load the data from the given csv file**

In [None]:
up = files.upload()

df = pd.read_csv('train.csv')
df

Saving train.csv to train (1).csv


Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...


**2. Identify the text column and the target label(s) that the model will learn to predict.**

In [None]:
df.columns.tolist()

['Class Index', 'Title', 'Description']

In [None]:
df.dtypes

Unnamed: 0,0
Class Index,int64
Title,object
Description,object


In [None]:
# Find text column.
text_columns = df.select_dtypes(include=['object', 'string']).columns.tolist()
print("Possible text columns:", text_columns)

Possible text columns: ['Title', 'Description']


In [None]:

# Let's find target-like columns (often contain 'target', 'label', 'class', or 'y')
possible_targets = [col for col in df.columns if col.lower() in ['target', 'label', 'class', 'y', 'sentiment', 'category']]
print("Possible target columns (by name):", possible_targets)

Possible target columns (by name): []


In [None]:

# If not obvious, inspect columns with few unique values (good for classification targets)
nunique = df.nunique().sort_values()
print("Columns with few unique values (potential target candidates):")
print(nunique.head(10))

Columns with few unique values (potential target candidates):
Class Index         4
Title          114350
Description    118723
dtype: int64


Here 'Class Index' got only 4 unique vlaues, so, it might be the target label.

**3. Perform the necessary preprocessing steps to prepare the text data for training — clean, transform, and convert it into a numerical format suitable for an ANN.**

In [None]:
df.isnull().sum()

Unnamed: 0,0
Class Index,0
Title,0
Description,0


In [None]:
df['Text'] = df[text_columns].agg(' '.join, axis = 1)
df

Unnamed: 0,Class Index,Title,Description,Text
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."
...,...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,Pakistan's Musharraf Says Won't Quit as Army C...
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,Renteria signing a top-shelf deal Red Sox gene...
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,Saban not going to Dolphins yet The Miami Dolp...
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,Today's NFL games PITTSBURGH at NY GIANTS Time...


In [None]:

df["Text"].iloc[0]

"Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [None]:
# Define cleaning function that uses spaCy for tokenization + lemmatization
def clean_text_spacy(doc_text):
    """
    - Lowercase
    - Remove URLs and HTML tags
    - Tokenize with spaCy
    - Keep alphabetic tokens only (token.is_alpha)
    - Remove stopwords and pronouns, lemmatize
    - Return cleaned string
    """
    if not isinstance(doc_text, str) or not doc_text:
        return ""

    # basic cleaning before spaCy (optional)
    txt = re.sub(r"http\S+|www\.\S+", " ", doc_text)    # remove URLs
    txt = re.sub(r"<.*?>", " ", txt)                    # remove HTML tags
    txt = txt.strip().lower()

    doc = nlp(txt)

    tokens = []
    for token in doc:
        # token.is_stop filters spaCy stopwords; token.is_alpha keeps alphabetic tokens
        # We also exclude pronouns (token.pos_ == "PRON") to reduce noise
        if token.is_alpha and not token.is_stop and token.pos_ != "PRON":
            lemma = token.lemma_.strip()
            if lemma:
                tokens.append(lemma)
    return " ".join(tokens)

In [None]:

# Apply (this step can be slow for large datasets)
print("Cleaning texts with spaCy — this may take a while ...")
df["cleaned_text"] = df["Text"].apply(clean_text_spacy)

Cleaning texts with spaCy — this may take a while ...


In [None]:

# Show a few examples
df[["Text", "cleaned_text"]].head(5)


Unnamed: 0,Text,cleaned_text
0,Wall St. Bears Claw Back Into the Black (Reute...,wall st bears claw black reuters reuters short...
1,Carlyle Looks Toward Commercial Aerospace (Reu...,carlyle look commercial aerospace reuters reut...
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,oil economy cloud stock outlook reuters reuter...
3,Iraq Halts Oil Exports from Main Southern Pipe...,iraq halt oil export main southern pipeline re...
4,"Oil prices soar to all-time record, posing new...",oil price soar time record pose new menace eco...


In [None]:
df["cleaned_text"].iloc[0]

In [None]:
def remove_duplicates_in_sentence(text):
    words = text.split()
    # Keep only first occurrence of each word (order preserved)
    seen = set()
    return " ".join([w for w in words if not (w in seen or seen.add(w))])

In [None]:
df["cleaned_text"] = df["cleaned_text"].apply(remove_duplicates_in_sentence)

In [None]:
df["cleaned_text"].iloc[0]

'wall st bears claw black reuters short seller street ultra cynic see green'

In [None]:
cleaned_text_nu = np.array([doc.vector for doc in nlp.pipe(df['cleaned_text'], batch_size=1000,
                                                           n_process=2)])

In [None]:
cleaned_text_nu

array([[ 0.048359  ,  0.04825231, -0.10745777, ...,  0.00281985,
         0.064946  ,  0.13077646],
       [-0.09257842,  0.1665946 ,  0.04497312, ..., -0.15562254,
         0.08899876,  0.1595554 ],
       [-0.27481782,  0.2040888 ,  0.18295597, ..., -0.16699007,
        -0.02018282, -0.02704386],
       ...,
       [-0.03949511,  0.03725689, -0.01641958, ..., -0.00190511,
         0.05533959, -0.00608957],
       [-0.14488088,  0.07613474, -0.13919978, ..., -0.03235732,
        -0.02865209,  0.04906403],
       [-0.02943656,  0.14860702, -0.12821665, ..., -0.17522237,
        -0.12732348,  0.1410166 ]], dtype=float32)

**4. Build and train a simple Artificial Neural Network that can classify or categorize the text data.**

In [None]:
print("cleaned_text_nu type:", type(cleaned_text_nu))
print("cleaned_text_nu shape:", getattr(cleaned_text_nu, "shape", None))
print("df shape:", df.shape)

cleaned_text_nu type: <class 'numpy.ndarray'>
cleaned_text_nu shape: (120000, 300)
df shape: (120000, 6)


In [None]:
x = cleaned_text_nu
y = df['Class Index']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
X_train

array([[-0.18485707,  0.14445584, -0.00579284, ..., -0.21959771,
        -0.0065635 ,  0.01172568],
       [-0.2036254 ,  0.15775652, -0.05030218, ...,  0.0338351 ,
        -0.16030052,  0.0018293 ],
       [-0.02761224,  0.2095897 ,  0.20966496, ..., -0.08897186,
        -0.001202  ,  0.06551421],
       ...,
       [-0.13817556,  0.13312167,  0.033114  , ..., -0.25999948,
         0.10807889, -0.00421937],
       [-0.07771461,  0.11029606, -0.00941379, ...,  0.00219233,
        -0.00894529,  0.00282074],
       [ 0.05873767,  0.1778224 ,  0.00563027, ...,  0.017945  ,
         0.02631246,  0.15474667]], dtype=float32)

In [None]:
X_train.ndim

2

In [None]:
X_train.shape

(96000, 300)

In [None]:
y_train

Unnamed: 0,Class Index
53541,1
41728,2
74819,2
106459,4
63062,3
...,...
110268,4
119879,4
103694,2
860,4


In [None]:
df['Class Index'].nunique()

4

In [None]:
X_train.shape

(96000, 300)

In [None]:
y_train_zeroed = y_train - y_train.min()
y_test_zeroed = y_test - y_test.min()

In [None]:
model = keras.Sequential([keras.layers.Dense(4, input_shape=(300,), activation='sigmoid')])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics =['accuracy'])

In [None]:
model.fit(X_train, y_train_zeroed, epochs =5)

Epoch 1/5
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8291 - loss: 0.6053
Epoch 2/5
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8833 - loss: 0.3602
Epoch 3/5
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8887 - loss: 0.3448
Epoch 4/5
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8914 - loss: 0.3377
Epoch 5/5
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8903 - loss: 0.3365


<keras.src.callbacks.history.History at 0x78beb67b92b0>

**5. Evaluate the model’s performance and analyze the factors that may have affected its accuracy.**

In [None]:
model.evaluate(X_test, y_test_zeroed)

[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8875 - loss: 0.3451


[0.3465745151042938, 0.8864583373069763]

In [None]:
temp = model.predict(X_test[[0]])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step


In [None]:
temp

array([[0.4903087 , 0.26843202, 0.7765077 , 0.22765271]], dtype=float32)

In [None]:
np.argmax(temp) + y_train.min()

np.int64(3)

In [None]:
df.head()

Unnamed: 0,Class Index,Title,Description,Text,cleaned_text,cleaned_text_nu
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...,wall st bears claw black reuters short seller ...,"[0.048358995, 0.048252307, -0.10745777, -0.029..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...,carlyle look commercial aerospace reuters priv...,"[-0.09257842, 0.1665946, 0.044973116, -0.18282..."
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...,oil economy cloud stock outlook reuters soar c...,"[-0.27481782, 0.2040888, 0.18295597, 0.042788,..."
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...,iraq halt oil export main southern pipeline re...,"[-0.16582888, -0.021393804, 0.19694346, 0.1861..."
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new...",oil price soar time record pose new menace eco...,"[-0.19514248, 0.076723285, 0.22925636, 0.06456..."


## factors affecting performance

1.   Simple ANN - Adding headen layer may improve accuracy
2.   epochs - More epochs may improve accuracy.
3. Data quality & noise -If the dataset contains mis-labeled or ambiguous articles, even the best model can’t exceed the inherent noise ceiling.
4. Preprocessing decisions - stopwords, pronouns, and duplicates are removed. his reduces noise but may also remove useful context.
5. Feature scaling - Embeddings are roughly normalized, but standardizing features sometimes helps ANN convergence.

<br>

*   TF-IDF and COunVectorizer can used also.
