# CNN

In [1]:
!pip install wandb



In [2]:
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, LSTM
from keras.layers import Conv1D, Flatten, MaxPooling1D
from keras.datasets import imdb
import wandb
from wandb.keras import WandbCallback
import numpy as np
from keras.preprocessing import text
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv("/content/US-Economic-News.csv", delimiter=',', encoding= 'ISO-8859-1')

z = df['text']
o = df['headline']
X = o + z
y = df['relevance']
print(y.head())

y = pd.DataFrame(y, columns=['relevance'])

0    yes
1     no
2     no
3     no
4    yes
Name: relevance, dtype: object


In [4]:
categorical_columns = ['relevance']

# Define the transformation: OneHotEncoder for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'  # This will pass through other columns not specified in transformers
)

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the DataFrame
y_ = pipeline.fit_transform(y)

# transformed_df is a NumPy array. To convert it back to a DataFrame:
y_ = pd.DataFrame(y_, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())


In [5]:
y_ = y_.drop(columns=['cat__relevance_no', 'cat__relevance_not sure'])

In [6]:

# here stratify is used to balance dataset as much as possible
X_train, X_test, y_train, y_test = train_test_split(X, y_, test_size=0.2, random_state=42, stratify=y)


In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
import string

stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

In [8]:
#y_train =  pd.DataFrame.to_numpy(y_train)
y_train

Unnamed: 0,cat__relevance_yes
420,0.0
3350,0.0
6716,1.0
4224,0.0
6230,0.0
...,...
3923,0.0
876,0.0
2182,1.0
7333,0.0


In [9]:
X_train

420     trade deficit worry nearly halts activity euro...
3350    teaparty activists complicate republican comeb...
6716    elder shareone great feats social engineering ...
4224    article 1 ûó titlethe house banking coni mitt...
6230    trade shrinks selloffnew york sept 20 upûóthe...
                              ...                        
3923    wall street revives reward junkets top brokers...
876     uk inflation continues riselondonuk inflation ...
2182    gdp growth revised upward 3rd quarter slight c...
7333    remodeling longer adds reinvesting equity home...
2463    rich means staying waythe rich different f sco...
Length: 6400, dtype: object

In [10]:

wandb.init(
        project="my-awesome-project",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 0.02,
    "architecture": "CNN",
    "dataset": "US-Economic-News.csv",
    "epochs": 10,
    }
)
config = wandb.config

# set parameters:
config.vocab_size = 1000
config.maxlen = 5000
config.batch_size = 32
config.embedding_dims = 10
config.filters = 16
config.kernel_size = 3
config.hidden_dims = 250
config.epochs = 10

tokenizer = text.Tokenizer(num_words=config.vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_matrix(X_train)
X_test = tokenizer.texts_to_matrix(X_test)

X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)


[34m[1mwandb[0m: Currently logged in as: [33mefeperro[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [12]:

model = Sequential()
model.add(Embedding(config.vocab_size,
                    config.embedding_dims,
                    input_length=config.maxlen))
model.add(Dropout(0.5))
model.add(Conv1D(config.filters,
                 config.kernel_size,
                 padding='valid',
                 activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(config.filters,
                 config.kernel_size,
                 padding='valid',
                 activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(config.hidden_dims, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=config.batch_size,
          epochs=config.epochs,
          validation_data=(X_test, y_test), callbacks=[WandbCallback()])



Epoch 1/10

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20231112_194445-6uce6rjz/files/model-best)... Done. 0.3s


Epoch 2/10

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20231112_194445-6uce6rjz/files/model-best)... Done. 0.3s


Epoch 3/10

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20231112_194445-6uce6rjz/files/model-best)... Done. 0.3s


Epoch 4/10

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20231112_194445-6uce6rjz/files/model-best)... Done. 0.3s


Epoch 5/10
Epoch 6/10

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20231112_194445-6uce6rjz/files/model-best)... Done. 0.3s


Epoch 7/10

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20231112_194445-6uce6rjz/files/model-best)... Done. 0.3s


Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7db074322500>

0.839 accuracy