# Preprocess

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./US-Economic-News.csv", delimiter=',', encoding= 'ISO-8859-1')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               8000 non-null   int64  
 1   _golden                8000 non-null   bool   
 2   _unit_state            8000 non-null   object 
 3   _trusted_judgments     8000 non-null   int64  
 4   _last_judgment_at      8000 non-null   object 
 5   positivity             1420 non-null   float64
 6   positivity:confidence  3775 non-null   float64
 7   relevance              8000 non-null   object 
 8   relevance:confidence   8000 non-null   float64
 9   articleid              8000 non-null   object 
 10  date                   8000 non-null   object 
 11  headline               8000 non-null   object 
 12  positivity_gold        0 non-null      float64
 13  relevance_gold         0 non-null      float64
 14  text                   8000 non-null   object 
dtypes: b

In [3]:
df.head(5)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,positivity,positivity:confidence,relevance,relevance:confidence,articleid,date,headline,positivity_gold,relevance_gold,text
0,842613455,False,finalized,3,12/5/15 17:48,3.0,0.64,yes,0.64,wsj_398217788,8/14/91,Yields on CDs Fell in the Latest Week,,,NEW YORK -- Yields on most certificates of dep...
1,842613456,False,finalized,3,12/5/15 16:54,,,no,1.0,wsj_399019502,8/21/07,The Morning Brief: White House Seeks to Limit ...,,,The Wall Street Journal Online</br></br>The Mo...
2,842613457,False,finalized,3,12/5/15 1:59,,,no,1.0,wsj_398284048,11/14/91,Banking Bill Negotiators Set Compromise --- Pl...,,,WASHINGTON -- In an effort to achieve banking ...
3,842613458,False,finalized,3,12/5/15 2:19,,0.0,no,0.675,wsj_397959018,6/16/86,Manager's Journal: Sniffing Out Drug Abusers I...,,,The statistics on the enormous costs of employ...
4,842613459,False,finalized,3,12/5/15 17:48,3.0,0.3257,yes,0.64,wsj_398838054,10/4/02,Currency Trading: Dollar Remains in Tight Rang...,,,NEW YORK -- Indecision marked the dollar's ton...


In [4]:
df = df[['headline', 'text', 'relevance']]

# We drop all irrelavant features to only keep headline and text for 2 reasons: 
# The other features seem either irrelevant or we lack documentation
# With headline and text only, our final model will be more generalizable. We could in theory apply it to any article.

In [5]:
df.head(5)

Unnamed: 0,headline,text,relevance
0,Yields on CDs Fell in the Latest Week,NEW YORK -- Yields on most certificates of dep...,yes
1,The Morning Brief: White House Seeks to Limit ...,The Wall Street Journal Online</br></br>The Mo...,no
2,Banking Bill Negotiators Set Compromise --- Pl...,WASHINGTON -- In an effort to achieve banking ...,no
3,Manager's Journal: Sniffing Out Drug Abusers I...,The statistics on the enormous costs of employ...,no
4,Currency Trading: Dollar Remains in Tight Rang...,NEW YORK -- Indecision marked the dollar's ton...,yes


Cleaning Strings

In [6]:
#!pip install nltk

In [7]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [8]:
# #Ensure you have downloaded the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\majon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\majon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\majon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
df['whole_txt'] = df['headline']+ ' ' + df['text']

In [10]:
wtxt_train = np.array(df['whole_txt'])

In [11]:
#print(wtxt_train)

In [12]:
for i in range(len(wtxt_train)):
    # Taking out '<br>' in the 'whole_text' column
    wtxt_train[i] = re.sub(r'</?br>', ' ', wtxt_train[i])
    # Deletion of non-latin alfabet signs, also numbers
    wtxt_train[i] = re.sub(r'[^a-zA-Z]', ' ', wtxt_train[i])
    # Removing single letter works like 'a'.
    wtxt_train[i] = re.sub(r"\s+[a-zA-Z]\s+", ' ', wtxt_train[i])
    # Removing double spaces
    wtxt_train[i] = re.sub(r'\s+', ' ', wtxt_train[i])
    # Lower case
    wtxt_train[i] = wtxt_train[i].lower()

Split the words.

In [13]:
for i in range(len(wtxt_train)):
    wtxt_train[i] = word_tokenize(wtxt_train[i])

Removing stop words

In [14]:
stop_words = set(stopwords.words('english'))

for i in range(len(wtxt_train)):
    wtxt_train[i] = [word for word in wtxt_train[i] if word not in stop_words]

In [15]:
wtxt_train[0]
# stop_words

['yields',
 'cds',
 'fell',
 'latest',
 'week',
 'new',
 'york',
 'yields',
 'certificates',
 'deposit',
 'offered',
 'major',
 'banks',
 'dropped',
 'tenth',
 'percentage',
 'point',
 'latest',
 'week',
 'reflecting',
 'overall',
 'decline',
 'short',
 'term',
 'interest',
 'rates',
 'small',
 'denomination',
 'consumer',
 'cds',
 'sold',
 'directly',
 'banks',
 'average',
 'yield',
 'six',
 'month',
 'deposits',
 'fell',
 'week',
 'ended',
 'yesterday',
 'according',
 'bank',
 'survey',
 'banxquote',
 'money',
 'markets',
 'wilmington',
 'del',
 'information',
 'service',
 'three',
 'month',
 'consumer',
 'deposits',
 'average',
 'yield',
 'sank',
 'week',
 'according',
 'banxquote',
 'two',
 'banks',
 'banxquote',
 'survey',
 'citibank',
 'new',
 'york',
 'corestates',
 'pennsylvania',
 'paying',
 'less',
 'threemonth',
 'small',
 'denomination',
 'cds',
 'declines',
 'somewhat',
 'smaller',
 'five',
 'year',
 'consumer',
 'cds',
 'eased',
 'banxquote',
 'said',
 'yields',
 'three',

Lemmatization

In [16]:
lemmatizer = WordNetLemmatizer()
for i in range(len(wtxt_train)):
    wtxt_train[i] = [lemmatizer.lemmatize(word) for word in wtxt_train[i]]

In [17]:
df['whole_txt'] = wtxt_train
df = df.drop(['headline', 'text'], axis = 1)

In [18]:
df.head(5)

Unnamed: 0,relevance,whole_txt
0,yes,"[yield, cd, fell, latest, week, new, york, yie..."
1,no,"[morning, brief, white, house, seek, limit, ch..."
2,no,"[banking, bill, negotiator, set, compromise, p..."
3,no,"[manager, journal, sniffing, drug, abuser, qui..."
4,yes,"[currency, trading, dollar, remains, tight, ra..."


In [105]:
## Importing Libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

### Data preparation
* Initial Data Processing: Our first step is to encode the relevance label into both the Relevant (1) and non-Relevant labels (0). Then, we make it into a np.array to feed into the model.
* Then, we begin to clean text data into pad sequences.

In [20]:
df.update(df["relevance"].apply(lambda x: 0 if x == "no" else 1))

In [21]:
df.head(5)

Unnamed: 0,relevance,whole_txt
0,1,"[yield, cd, fell, latest, week, new, york, yie..."
1,0,"[morning, brief, white, house, seek, limit, ch..."
2,0,"[banking, bill, negotiator, set, compromise, p..."
3,0,"[manager, journal, sniffing, drug, abuser, qui..."
4,1,"[currency, trading, dollar, remains, tight, ra..."


### Tokenization
First, we need to "tokenize" our sentences, i.e., convert them to sequences of numbers. For this task, we are going to use the `Tokenizer` from Tensorflow (documentation [here](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer))

In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(wtxt_train)   # fit our tokenizer on the dataset (i.e., assign a number to each word and keep a
                                    # dictionary with the correspondence of each word to a number)

# see the language dictionary and the total number of words (please note that number 0 is reserved for the padding task)
word_index = tokenizer.word_index
total_words = len(word_index) + 1

In [23]:
word_index

{'year': 1,
 'market': 2,
 'said': 3,
 'rate': 4,
 'stock': 5,
 'new': 6,
 'price': 7,
 'would': 8,
 'economy': 9,
 'bank': 10,
 'economic': 11,
 'percent': 12,
 'interest': 13,
 'federal': 14,
 'last': 15,
 'month': 16,
 'company': 17,
 'billion': 18,
 'week': 19,
 'one': 20,
 'inflation': 21,
 'million': 22,
 'investor': 23,
 'fed': 24,
 'point': 25,
 'dollar': 26,
 'time': 27,
 'bond': 28,
 'tax': 29,
 'government': 30,
 'president': 31,
 'first': 32,
 'york': 33,
 'say': 34,
 'growth': 35,
 'fund': 36,
 'day': 37,
 'increase': 38,
 'yesterday': 39,
 'reserve': 40,
 'share': 41,
 'also': 42,
 'business': 43,
 'average': 44,
 'since': 45,
 'two': 46,
 'may': 47,
 'many': 48,
 'state': 49,
 'index': 50,
 'could': 51,
 'high': 52,
 'quarter': 53,
 'money': 54,
 'report': 55,
 'mr': 56,
 'job': 57,
 'trading': 58,
 'cut': 59,
 'financial': 60,
 'rose': 61,
 'deficit': 62,
 'consumer': 63,
 'sale': 64,
 'gain': 65,
 'higher': 66,
 'dow': 67,
 'policy': 68,
 'even': 69,
 'budget': 70,
 'e

In [24]:
total_words

36554

### Padding Sequences
Sentences and sequences tend to have different lengths, however our model is expecting equally sized observations.
Here we want to convert our texts to sequences and make them of the same length (in general, the lenght of the longest of our sequences). We are going to use here `pad_sequences` from Tensorflow (documentation [here](https://www.tensorflow.org/api_docs/python/tf/keras/utils/pad_sequences)), to add zeroes to the tokenized sentences until they all reach the same length.

In [25]:
sequences = tokenizer.texts_to_sequences(wtxt_train)
padded_sequences = pad_sequences(sequences)

In [26]:
sequences[0]

[186,
 2146,
 88,
 346,
 19,
 6,
 33,
 186,
 2559,
 754,
 886,
 147,
 10,
 373,
 2943,
 370,
 25,
 346,
 19,
 1464,
 508,
 86,
 156,
 75,
 13,
 4,
 193,
 7949,
 63,
 2146,
 565,
 1687,
 10,
 44,
 186,
 293,
 16,
 754,
 88,
 19,
 321,
 39,
 133,
 10,
 349,
 9914,
 54,
 2,
 7950,
 3269,
 631,
 128,
 94,
 16,
 63,
 754,
 44,
 186,
 1935,
 19,
 133,
 9914,
 46,
 10,
 9914,
 349,
 3807,
 6,
 33,
 9114,
 2470,
 879,
 139,
 21385,
 193,
 7949,
 2146,
 86,
 1234,
 800,
 247,
 1,
 63,
 2146,
 1379,
 9914,
 3,
 186,
 94,
 16,
 293,
 16,
 77,
 146,
 565,
 289,
 1110,
 2442,
 1335,
 370,
 25,
 381,
 19,
 2615]

In [27]:
padded_sequences

array([[   0,    0,    0, ...,  381,   19, 2615],
       [   0,    0,    0, ...,  157,   49,  178],
       [   0,    0,    0, ...,   10,   83,   43],
       ...,
       [   0,    0,    0, ...,  514,  527,  120],
       [   0,    0,    0, ...,  278,  103,   59],
       [   0,    0,    0, ...,   41,  184,   22]])

In [28]:
df['pad_seq'] = padded_sequences.tolist()

In [29]:
df.drop(['whole_txt'], axis = 1)

Unnamed: 0,relevance,pad_seq
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
7995,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7996,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7997,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7998,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [30]:
df.head(5)

Unnamed: 0,relevance,whole_txt,pad_seq
0,1,"[yield, cd, fell, latest, week, new, york, yie...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,"[morning, brief, white, house, seek, limit, ch...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,"[banking, bill, negotiator, set, compromise, p...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,0,"[manager, journal, sniffing, drug, abuser, qui...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1,"[currency, trading, dollar, remains, tight, ra...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Train-Test Split

In [31]:
X = padded_sequences
y = df['relevance']

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.2, random_state=42, stratify=y)

In [33]:
X_train

array([[    0,     0,     0, ...,    71,   111,    38],
       [    0,     0,     0, ...,  2256,   278,  1288],
       [    0,     0,     0, ...,   206,    22,    57],
       ...,
       [    0,     0,     0, ...,   190,    84,   426],
       [    0,     0,     0, ..., 11382,  5063,   149],
       [    0,     0,     0, ...,    27,  2800,  2736]])

In [34]:
X_train.shape

(6400, 475)

In [52]:
y_train

array([0, 0, 1, ..., 1, 0, 0])

In [36]:
y_train.shape

(6400,)

In [59]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [60]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')

### Building the model

We are going to build a simple model that includes:
- `Embedding` layer with an output representation of each word as a vector of dim 16
- `LSTM` (see class slides for more detail or RNNs example notebook for more details) with an intermediate state of 100
- An output layer `Dense` that connects the output of the LSTM and creates an output of 3 positions (one per class) as output of the network

That is model nr.1 

In [82]:
# We are going to build our model with the Sequential API
model = Sequential()
model.add(Embedding(total_words,      # number of words to process as input
                    100,    # output representation
                    input_length=len(padded_sequences[0])))    # total length of each observation
model.add(LSTM(100, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))  # Change activation based on the number of classes

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [83]:
model(padded_sequences)

<tf.Tensor: shape=(8000, 1), dtype=float32, numpy=
array([[0.49995384],
       [0.49540925],
       [0.50199515],
       ...,
       [0.502827  ],
       [0.50224346],
       [0.5016381 ]], dtype=float32)>

In [84]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 475, 100)          3655400   
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_5 (Dense)             (None, 1)                 101       
                                                                 
Total params: 3735901 (14.25 MB)
Trainable params: 3735901 (14.25 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Training the models

### MODEL 1 (The base model)

In [55]:
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x16f9b57f880>

In [56]:
print(model(padded_sequences).numpy().argmax(axis = 1))

[0 0 0 ... 0 0 0]


Model 1 Testing

In [61]:
loss, accuracy = model.evaluate(X_test, y_test)

print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Loss: 1.4123
Test Accuracy: 75.44%


In [64]:
#Prection and Confusion Matrix
y_pred = model.predict(X_test)
bin_y_pred = (y_pred > 0.5).astype(int)



In [70]:
bin_y_pred = np.squeeze(bin_y_pred)

In [73]:
bin_y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [74]:
from sklearn.metrics import confusion_matrix

In [76]:
confusion_matrix(y_test, bin_y_pred)

array([[1137,  177],
       [ 216,   70]], dtype=int64)

### MODEL 2

In [108]:
# We are going to build our model with the Sequential API
model2 = Sequential()

model2.add(Embedding(total_words,      # number of words to process as input
                    50,    # output representation
                    input_length=len(padded_sequences[0])))    # total length of each observation

model2.add(LSTM(50, return_sequences=False))

model2.add(Dropout(0.2))

model2.add(Dense(1, activation='sigmoid')) 

model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [109]:
model2.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 475, 50)           1827700   
                                                                 
 lstm_11 (LSTM)              (None, 50)                20200     
                                                                 
 dropout_3 (Dropout)         (None, 50)                0         
                                                                 
 dense_10 (Dense)            (None, 1)                 51        
                                                                 
Total params: 1847951 (7.05 MB)
Trainable params: 1847951 (7.05 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [110]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [111]:
model2.fit(X_train, y_train, epochs=5, validation_data = (X_test, y_test), callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


<keras.src.callbacks.History at 0x16fa0208040>

In [113]:
loss, accuracy = model2.evaluate(X_test, y_test)



Adjust the model architecture and hyperparameters based on your specific problem and data.

This is a basic example to get you started with implementing an RNN for text classification using TensorFlow. Fine-tune and expand upon this foundation according to your project requirements.

## Implementing an RNN for Text Classification with TensorFlow

In this example, we'll build a simple Recurrent Neural Network (RNN) using TensorFlow for text classification. The dataset consists of one or two sentences as input data.

This is a simplified and applied version with focus on the usage of Tensorflow. For a more detailed and extensive process description please refer to the "RNNs example" notebook or to the class presentations
