In [94]:
!pip install nltk
!pip install keras
!pip install tensorflow



In [115]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import set_random_seed

# Download the NLTK resources this notebook relies on (a no-op when they are
# already present locally).
for corpus_name in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(corpus_name)

# Shared helpers used by the text-preprocessing function further down.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/darylmurenzi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/darylmurenzi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/darylmurenzi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
# Read the raw headline dataset from the project-relative input folder.
dataset0 = pd.read_csv(filepath_or_buffer="input/clickbait_data.csv")

In [97]:
# Rename the source columns to the names the rest of the notebook uses
# ("text" for the headline, "label" for the clickbait flag).
dataset0 = dataset0.rename(columns=dict(headline="text", clickbait="label"))

# Quick sanity check: first rows followed by the (rows, columns) shape.
print(dataset0.head(), dataset0.shape, sep="\n")

                                                text  label
0                                 Should I Get Bings      1
1      Which TV Female Friend Group Do You Belong In      1
2  The New "Star Wars: The Force Awakens" Trailer...      1
3  This Vine Of New York On "Celebrity Big Brothe...      1
4  A Couple Did A Stunning Photo Shoot With Their...      1
(32000, 2)


In [98]:
# Preprocess text: lowercase, remove punctuation, drop stopwords, and lemmatize.


def preprocess_text(text):
    """Clean one headline and return it as a space-separated token string.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted, the result is split on whitespace, tokens found in the
    module-level ``stop_words`` set are dropped, and each remaining token is
    passed through the module-level ``lemmatizer``.

    Args:
        text: The raw headline. Any value that is not a ``str`` is treated
            as missing.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for non-string
        input or when no token survives.
    """
    if isinstance(text, str):
        # Lowercase, then strip ASCII punctuation in one regex pass.
        # NOTE(review): punctuation is removed before stopword filtering, so a
        # contraction such as "don't" is looked up as "dont" - confirm that the
        # stopword set matches such forms if that matters.
        stripped = re.sub(rf"[{re.escape(string.punctuation)}]", "", text.lower())

        # Filter stopwords and lemmatize the survivors, token by token.
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in stripped.split()
            if token not in stop_words
        )

    # Non-string input (anything that fails the isinstance check).
    return ""

# Build the working frame from a copy of `dataset0` (leaving the raw frame
# untouched): drop exact duplicate rows, then replace every headline with its
# preprocessed form.
df = (
    dataset0.copy()
    .drop_duplicates()
    .assign(text=lambda frame: frame["text"].apply(preprocess_text))
)
print(df.head())

                                                text  label
0                                          get bings      1
1                      tv female friend group belong      1
2      new star war force awakens trailer give chill      1
3  vine new york celebrity big brother fucking pe...      1
4  couple stunning photo shoot baby learning inop...      1


In [99]:
# Three-way split: hold out 20% for testing, then 20% of the remainder for validation.

# Step 1: separate the test set from the rest of the cleaned data.
train_data, test_data, train_labels, test_labels = train_test_split(
    df["text"],
    df["label"],
    random_state=42,
    test_size=0.2,
)

# Step 2: carve the validation set out of what is left (same seed, so the
# partition is repeatable).
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data,
    train_labels,
    random_state=42,
    test_size=0.2,
)

# Report the sizes of the final training and test splits.
print(train_data.shape, test_data.shape, sep="\n")

(20480,)
(6400,)


In [100]:
# TF-IDF (term frequency - inverse document frequency) features.

# Fit the vocabulary and IDF weights on the training split only (capped at
# 5000 features) so nothing from validation/test influences the features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data).toarray()

# Encode the other splits with the already-fitted vectorizer, as dense arrays.
X_val, X_test = (
    vectorizer.transform(split).toarray() for split in (val_data, test_data)
)

In [101]:
# Build, train, and evaluate a dense network (Adam optimizer, L2 + dropout).

# Seed the Python, NumPy and backend RNGs so weight initialisation, dropout
# and batch shuffling are as repeatable as the backend allows when this cell
# is re-run.
set_random_seed(42)

# Two L2-regularized hidden layers with dropout, ending in one sigmoid unit.
# The input width is declared with an explicit Input object instead of the
# `input_shape` argument on Dense, which Keras warns against for Sequential
# models.
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile with an explicit learning rate.
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Stop when val_loss has not improved for 3 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train, monitoring the validation split after every epoch.
model.fit(X_train, train_labels, validation_data=(X_val, val_labels), epochs=20, callbacks=[early_stop])

# Score the held-out test split. X_test already holds its TF-IDF matrix, so the
# test text is not vectorized a second time here.
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)  # threshold the sigmoid output at 0.5

accuracy = accuracy_score(test_labels, y_pred)
# zero_division=0: report 0 without a warning if a metric's denominator is
# empty (e.g. the positive class is never predicted).
f1 = f1_score(test_labels, y_pred, zero_division=0)
recall = recall_score(test_labels, y_pred, zero_division=0)
precision = precision_score(test_labels, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

# Optional: print predictions and true labels side by side for eyeballing.
print("Predicted labels:", y_pred.flatten())
print("True labels:", test_labels.values)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7690 - loss: 1.0679 - val_accuracy: 0.9240 - val_loss: 0.5396
Epoch 2/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9209 - loss: 0.5365 - val_accuracy: 0.9295 - val_loss: 0.5197
Epoch 3/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9241 - loss: 0.5102 - val_accuracy: 0.9318 - val_loss: 0.4986
Epoch 4/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9239 - loss: 0.4982 - val_accuracy: 0.9242 - val_loss: 0.4867
Epoch 5/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9291 - loss: 0.4769 - val_accuracy: 0.9246 - val_loss: 0.4814
Epoch 6/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9298 - loss: 0.4678 - val_accuracy: 0.9328 - val_loss: 0.4644
Epoch 7/20
[1m640/640[0m 

In [102]:
# Persist the trained network (file name ends in .h5) and confirm the location.
model.save(filepath="saved_models/nn_instance2.h5")
print("Model saved to saved_models/nn_instance2.h5")



Model saved to saved_models/nn_instance2.h5


In [117]:
# Build, train, and evaluate a second network: RMSprop, lighter regularization,
# and class-weighted loss.

# Seed the Python, NumPy and backend RNGs so this cell is as repeatable as the
# backend allows, independently of what was run before it.
set_random_seed(42)

# === Class weights ===
# Key each "balanced" weight by the class label it was computed for, rather
# than by its position in the array, so the mapping stays right even if the
# labels are not exactly 0..k-1. Plain int/float values keep the printout clean.
class_labels = np.unique(train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=train_labels
)
class_weight_dict = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights)
}
print("Class weights:", class_weight_dict)

# === Build the model ===
# The input width is declared with an explicit Input object instead of the
# `input_shape` argument on Dense, which Keras warns against for Sequential
# models.
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# === Compile the model using RMSprop ===
optimizer = RMSprop(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# === Early stopping ===
# Stop when val_loss has not improved for 3 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# === Train the model ===
history = model.fit(
    X_train, train_labels,
    validation_data=(X_val, val_labels),
    epochs=20,
    callbacks=[early_stop],
    class_weight=class_weight_dict
)

# === Evaluate on the held-out test split ===
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)  # threshold the sigmoid output at 0.5

accuracy = accuracy_score(test_labels, y_pred)
f1 = f1_score(test_labels, y_pred, zero_division=0)
recall = recall_score(test_labels, y_pred, zero_division=0)
precision = precision_score(test_labels, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")


Class weights: {0: np.float64(0.9940782448305989), 1: np.float64(1.0059927301306613)}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8277 - loss: 0.5867 - val_accuracy: 0.9398 - val_loss: 0.2348
Epoch 2/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9429 - loss: 0.2143 - val_accuracy: 0.9391 - val_loss: 0.2173
Epoch 3/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9467 - loss: 0.1958 - val_accuracy: 0.9350 - val_loss: 0.2124
Epoch 4/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9486 - loss: 0.1845 - val_accuracy: 0.9395 - val_loss: 0.1975
Epoch 5/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9479 - loss: 0.1813 - val_accuracy: 0.9395 - val_loss: 0.1990
Epoch 6/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9514 - loss: 0.1718 - val_accuracy: 0.9420 - val_loss: 0.1916
Epoch 7/20
[1m640/640[0m 

In [None]:
# Persist the class-weighted network (file name ends in .h5) and confirm.
model.save(filepath="saved_models/nn_instance3.h5")
print("Instance 3 saved successfully.")