# Preprocess

In [None]:
import pandas as pd

In [None]:
# Load the raw news dataset from the notebook's working directory.
# The file is decoded as ISO-8859-1 (presumably because it is not valid UTF-8 -- confirm).
df = pd.read_csv("./US-Economic-News.csv", sep=',', encoding='ISO-8859-1')

# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
df.head(5)

In [None]:
# Keep only the two text inputs (headline, text) and the target label (relevance).
# All other columns are dropped for two reasons:
#   1. they seem either irrelevant or we lack documentation for them;
#   2. with headline and text only, the final model is more generalizable -- in theory it
#      could be applied to any article.
df = df.loc[:, ['headline', 'text', 'relevance']]

In [None]:
import numpy as np

# Separate the rows by label; rows whose label is neither 'yes' nor 'no' end up in neither frame.
df_yes = df.loc[df['relevance'] == 'yes']
df_no = df.loc[df['relevance'] == 'no']

# Downsample the 'no' rows to the number of 'yes' rows, with a fixed seed so the draw is repeatable.
# NOTE(review): this presumes there are at least as many 'no' rows as 'yes' rows -- confirm.
df_no_sampled = df_no.sample(n=df_yes.shape[0], random_state=42)

# Balanced frame: every 'yes' row followed by the sampled 'no' rows.
df_balanced = pd.concat((df_yes, df_no_sampled))

# Show the resulting class counts.
print(df_balanced['relevance'].value_counts())

In [None]:
# Continue with the balanced data: `df` now names the same object as `df_balanced` (no copy is made).
df = df_balanced

Cleaning Strings

In [None]:
#!pip install nltk

In [None]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources (tokenizer data, stop-word lists, WordNet) if they are not present yet.
# NOTE(review): the visible cells tokenize with a plain str.split() and never use these
# resources -- confirm they are still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Build one text field per article: the headline, a space, then the body text.
# Missing values are replaced by empty strings first; otherwise a NaN headline or text would
# turn the whole concatenation into NaN and break the string cleaning applied to this column later.
df['whole_txt'] = df['headline'].fillna('') + ' ' + df['text'].fillna('')

In [None]:
def preprocess_text(text):
    """Normalize a raw string for tokenization.

    The text is lower-cased, every non-word character (anything matching ``\\W``) is replaced
    by a space, and each run of whitespace is collapsed into a single space. Leading or
    trailing spaces are not stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # Punctuation and symbols become spaces so that neighbouring words stay separated.
    spaced = re.sub(r'\W', ' ', lowered)
    return re.sub(r'\s+', ' ', spaced)

# Clean every combined headline+text string element by element.
df['processed'] = df['whole_txt'].map(preprocess_text)


In [None]:
df['processed'].head()

In [None]:
 # Tokenize before word2vec
# A whitespace split is enough here: preprocess_text already replaced punctuation with spaces.
df['tokenized'] = df['processed'].apply(str.split)

In [None]:
# Vocabulary of the corpus: the union of the tokens of every article.
unique_words = set().union(*df['tokenized'])
# Number of distinct tokens.
total_words = len(unique_words)

In [None]:
# Train a Word2Vec model on the tokenized articles.
from gensim.models import Word2Vec

# One list of tokens per article.
sentences = list(df['tokenized'])

# min_count=1 keeps every token, so each word of the corpus gets a vector.
# NOTE(review): no explicit seed is passed and several workers are used, so the embeddings
# may vary between runs -- confirm whether exact reproducibility matters.
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
import numpy as np

def vectorize_text(text, model):
    """Represent a tokenized text as the average of its word vectors.

    Args:
        text: iterable of tokens.
        model: object exposing ``wv`` (supports ``token in wv`` and ``wv[token]``) and
            ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in ``model.wv``, or a zero
        vector of length ``model.vector_size`` when none of the tokens is known.
    """
    known_vectors = [model.wv[token] for token in text if token in model.wv]
    # Averaging an empty list is undefined, so unknown-only texts map to the zero vector.
    return np.mean(known_vectors, axis=0) if known_vectors else np.zeros(model.vector_size)

# One averaged Word2Vec vector per article; the trained model is passed as the extra argument.
df['vectorized'] = df['tokenized'].apply(vectorize_text, args=(word2vec_model,))


In [None]:
# Encode the target as integers: "no" -> 0, "yes" -> 1.
# The mapping also accepts labels that are already 0/1, so re-running this cell is a no-op
# (a plain `0 if x == "no" else 1` rule would turn every label into 1 on a second run).
# Any other label becomes NaN and makes astype() raise instead of silently counting as 1.
df["relevance"] = df["relevance"].map({"no": 0, "yes": 1, 0: 0, 1: 1}).astype("int64")

In [None]:
# Separate features and target.
# Feature matrix: the per-article averaged word vectors, stacked into one 2-D array.
X = np.array(list(df['vectorized']))
# Target: the relevance label column.
y = df['relevance']

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Carve a validation set (20% of the current training rows) out of the training split.
# NOTE(review): this overwrites X_train / y_train, so re-running the cell shrinks them again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=42, test_size=0.2
)

In [None]:
X_train

In [None]:
X_train.shape

In [None]:
y_train

In [None]:
y_train.shape

In [None]:
y_val.shape

In [None]:
y_test.shape

In [None]:
# Convert the three label splits to NumPy arrays.
y_train, y_val, y_test = map(np.array, (y_train, y_val, y_test))

In [None]:
# Cast every split to float32 -- including the test split, which model.evaluate() receives
# later -- so that training, validation and test data all share one dtype.
X_train = np.array(X_train).astype('float32')
X_val = np.array(X_val).astype('float32')
X_test = np.array(X_test).astype('float32')
y_train = np.array(y_train).astype('float32')
y_val = np.array(y_val).astype('float32')
y_test = np.array(y_test).astype('float32')

### Building the model

We are going to build a simple model that includes:
- Word2Vec vectors (trained above) as input: each word is represented by a vector of dimension `embedding_dim` (100 here), so no trainable `Embedding` layer is used
- Two stacked `LSTM` layers (see class slides or the RNNs example notebook for more details) with 128 and 64 units, with a `Dropout` of 0.2 between them
- An output layer `Dense` that connects to the output of the last LSTM and produces a single sigmoid value (the predicted probability of the positive class) as output of the network

This is Model 1.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [1]:
total_words

NameError: name 'df' is not defined

In [None]:
# Length of each word vector, read from the trained Word2Vec model; used below as the
# feature dimension of the LSTM input.
embedding_dim = word2vec_model.vector_size
print("Word2Vec Embedding Dimension:", embedding_dim)

In [None]:
max_length = 300  # number of word vectors kept per article


def convert_to_sequence(text_tokens, model, max_length):
    """Turn a list of tokens into a fixed-length list of word vectors.

    Tokens that are not in ``model.wv`` are skipped. The remaining vectors are padded with
    zero vectors of length ``model.vector_size`` when there are fewer than ``max_length`` of
    them, and cut off after ``max_length`` entries when there are more.

    Args:
        text_tokens: iterable of tokens.
        model: object exposing ``wv`` (supports ``token in wv`` and ``wv[token]``) and
            ``vector_size``.
        max_length: number of vectors in the returned list.

    Returns:
        A list of ``max_length`` vectors.
    """
    vectors = []
    for token in text_tokens:
        if token in model.wv:
            vectors.append(model.wv[token])

    # A non-positive count multiplies the padding list down to nothing, so long texts get no padding.
    missing = max_length - len(vectors)
    vectors += [np.zeros(model.vector_size)] * missing

    return vectors[:max_length]

# Rebuild the train/validation/test partitions on the token lists. The earlier splits were made
# on the averaged vectors, so the tokens are split again here with the same sizes and seed;
# train_test_split should then select the same rows, keeping the sequences aligned with
# y_train, y_val and y_test (NOTE(review): confirm -- the asserts below only check the sizes).
all_tokenized_texts = df['tokenized'].tolist()
train_val_tokenized_texts, test_tokenized_texts = train_test_split(
    all_tokenized_texts, test_size=0.2, random_state=42)
train_tokenized_texts, val_tokenized_texts = train_test_split(
    train_val_tokenized_texts, test_size=0.2, random_state=42)
assert len(train_tokenized_texts) == len(y_train)
assert len(val_tokenized_texts) == len(y_val)
assert len(test_tokenized_texts) == len(y_test)

# Each split is encoded from its own texts, so validation and test data never reuse the
# training tokens. float32 matches the dtype used for the other model inputs and keeps the
# (samples, max_length, vector_size) arrays smaller.
X_train = np.array([convert_to_sequence(tokens, word2vec_model, max_length) for tokens in train_tokenized_texts], dtype='float32')
X_val = np.array([convert_to_sequence(tokens, word2vec_model, max_length) for tokens in val_tokenized_texts], dtype='float32')
X_test = np.array([convert_to_sequence(tokens, word2vec_model, max_length) for tokens in test_tokenized_texts], dtype='float32')

### Model 1

In [None]:
# Model 1: two stacked LSTMs over the padded word-vector sequences, ending in one sigmoid unit.
model = Sequential([
    # The first recurrent layer returns the full sequence so the second LSTM gets one vector per step.
    LSTM(128, return_sequences=True, input_shape=(max_length, embedding_dim)),
    Dropout(0.2),
    LSTM(64),
    # Single sigmoid output for the binary label (use 'softmax' for multi-class classification).
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Training hyper-parameters.
batch_size = 32
epochs = 10

# Fit on the training split and report the validation metrics after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Score the trained model on the held-out test split; the result unpacks into the loss and the
# accuracy metric configured at compile time.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")