In [2]:
!pip install nltk
!pip install keras
!pip install tensorflow



In [19]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import joblib
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils.class_weight import compute_class_weight

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/darylmurenzi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/darylmurenzi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/darylmurenzi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
dataset0 = pd.read_csv("input/clickbait_data.csv")

In [5]:
dataset0 = dataset0.rename(columns={"headline": "text", "clickbait": "label"})

print(dataset0.head())
print(dataset0.shape)

                                                text  label
0                                 Should I Get Bings      1
1      Which TV Female Friend Group Do You Belong In      1
2  The New "Star Wars: The Force Awakens" Trailer...      1
3  This Vine Of New York On "Celebrity Big Brothe...      1
4  A Couple Did A Stunning Photo Shoot With Their...      1
(32000, 2)


In [6]:
# pre process text ( remove punctuation, make characters lowercase and do lemmatization )


def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # 1. Lowercase the text
    text = text.lower()

    # 2. Remove punctuation using regex
    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)

    # 3. Tokenize the text
    words = text.split()

    # 4. Remove stopwords
    words = [word for word in words if word not in stop_words]

    # 5. Lemmatize each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # 6. Join back into a string
    return ' '.join(lemmatized_words)

df = dataset0.copy()

df = df.drop_duplicates()

df["text"] = df["text"].apply(preprocess_text)
print(df.head())

                                                text  label
0                                          get bings      1
1                      tv female friend group belong      1
2      new star war force awakens trailer give chill      1
3  vine new york celebrity big brother fucking pe...      1
4  couple stunning photo shoot baby learning inop...      1


In [7]:
# Split train and test data

# train_data, test_data, train_labels, test_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

# Train/Test split
train_data, test_data, train_labels, test_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42)

# Further split train_data into train/val
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data, train_labels, test_size=0.2, random_state=42)


print(train_data.shape)
print(test_data.shape)

(20480,)
(6400,)


In [8]:
# Terms Frequency -  Inverse Document Frequency


vectorizer = TfidfVectorizer(max_features=5000)

X_train = vectorizer.fit_transform(train_data).toarray()
X_val = vectorizer.transform(val_data).toarray()
X_test = vectorizer.transform(test_data).toarray()

In [9]:
# build neural network model

# Define model with regularization, dropout, deeper layers
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))  # Dropout added
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile with custom learning rate
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train with validation and early stopping
model.fit(X_train, train_labels, validation_data=(X_val, val_labels), epochs=20, callbacks=[early_stop])


test_tfidf=  vectorizer.transform(test_data).toarray()

y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

accuracy = accuracy_score(test_labels, y_pred)
f1 = f1_score(test_labels, y_pred)
recall = recall_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

# Optional: Print predictions and true labels
print("Predicted labels:", y_pred.flatten())
print("True labels:", test_labels.values)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7929 - loss: 1.0518 - val_accuracy: 0.9285 - val_loss: 0.5421
Epoch 2/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9254 - loss: 0.5346 - val_accuracy: 0.9252 - val_loss: 0.5051
Epoch 3/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9277 - loss: 0.5055 - val_accuracy: 0.9197 - val_loss: 0.4950
Epoch 4/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9262 - loss: 0.4882 - val_accuracy: 0.9297 - val_loss: 0.4820
Epoch 5/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9329 - loss: 0.4747 - val_accuracy: 0.9287 - val_loss: 0.4656
Epoch 6/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9325 - loss: 0.4681 - val_accuracy: 0.9191 - val_loss: 0.4957
Epoch 7/20
[1m640/640[0m 

In [10]:
model.save("saved_models/nn_instance2.h5")
print("Model saved to saved_models/nn_instance2.h5")



Model saved to saved_models/nn_instance2.h5


In [14]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weight_dict = dict(enumerate(class_weights))
print("Class weights:", class_weight_dict)

# === Build the model ===
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# === Compile the model using RMSprop ===
optimizer = RMSprop(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# === Early stopping ===
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# === Train the model ===
history = model.fit(
    X_train, train_labels,
    validation_data=(X_val, val_labels),
    epochs=20,
    callbacks=[early_stop],
    class_weight=class_weight_dict
)

y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

accuracy = accuracy_score(test_labels, y_pred)
f1 = f1_score(test_labels, y_pred, zero_division=0)
recall = recall_score(test_labels, y_pred, zero_division=0)
precision = precision_score(test_labels, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")


Class weights: {0: np.float64(0.9940782448305989), 1: np.float64(1.0059927301306613)}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8070 - loss: 0.5858 - val_accuracy: 0.9381 - val_loss: 0.2348
Epoch 2/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9463 - loss: 0.2133 - val_accuracy: 0.9406 - val_loss: 0.2182
Epoch 3/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9462 - loss: 0.1950 - val_accuracy: 0.9371 - val_loss: 0.2088
Epoch 4/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9456 - loss: 0.1900 - val_accuracy: 0.9244 - val_loss: 0.2303
Epoch 5/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9459 - loss: 0.1800 - val_accuracy: 0.9412 - val_loss: 0.2031
Epoch 6/20
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9495 - loss: 0.1775 - val_accuracy: 0.9354 - val_loss: 0.1958
Epoch 7/20
[1m640/640[0m 

In [12]:
model.save("saved_models/nn_instance3.h5")
print("Model saved to saved_models/nn_instance3.h5")



Model saved to saved_models/nn_instance3.h5


In [15]:


# === Build the model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# === Compile with SGD optimizer
optimizer = SGD(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# === Early stopping
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

# === Train the model
history = model.fit(
    X_train, train_labels,
    validation_data=(X_val, val_labels),
    epochs=30,
    callbacks=[early_stop],
    class_weight=class_weight_dict
)

y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

accuracy = accuracy_score(test_labels, y_pred)
f1 = f1_score(test_labels, y_pred, zero_division=0)
recall = recall_score(test_labels, y_pred, zero_division=0)
precision = precision_score(test_labels, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5187 - loss: 1.0666 - val_accuracy: 0.6045 - val_loss: 1.0580
Epoch 2/30
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5626 - loss: 1.0554 - val_accuracy: 0.5127 - val_loss: 1.0457
Epoch 3/30
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5941 - loss: 1.0429 - val_accuracy: 0.6170 - val_loss: 1.0310
Epoch 4/30
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6458 - loss: 1.0274 - val_accuracy: 0.8152 - val_loss: 1.0112
Epoch 5/30
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7071 - loss: 1.0064 - val_accuracy: 0.8238 - val_loss: 0.9802
Epoch 6/30
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7608 - loss: 0.9716 - val_accuracy: 0.8566 - val_loss: 0.9179
Epoch 7/30
[1m640/640[0m 

In [None]:
model.save("saved_models/nn_instance4.h5")
print("Model saved to saved_models/nn_instance4.h5")

In [20]:

# === Build the model
logreg_model = LogisticRegression(
    max_iter=1000,
    C=1.0,  # Lower values = stronger regularization
    solver='liblinear'  # Good for small datasets and binary classification
)

# === Fit the model
logreg_model.fit(X_train, train_labels)

# === Predict
y_pred = logreg_model.predict(X_test)

# === Evaluate
accuracy = accuracy_score(test_labels, y_pred)
f1 = f1_score(test_labels, y_pred, zero_division=0)
recall = recall_score(test_labels, y_pred, zero_division=0)
precision = precision_score(test_labels, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

Accuracy: 0.9505
F1-score: 0.9508
Recall: 0.9358
Precision: 0.9662


In [21]:
joblib.dump(logreg_model, "saved_models/logreg_model.pkl")
print("Model saved to saved_models/logreg_model.pkl'")

Model saved to saved_models/logreg_model.pkl'
