Project 3: Spam filter for Quora questions
Download data from here : https://www.dropbox.com/sh/kpf9z73woodfssv/AAAw1_JIzpuVvwteJCma0xMla?dl=0

Goal : Build a model for identifying if a question on Quora is spam

Suggested Guidelines :

1. To reduce the dimensionality of your model, you can use the GloVe embeddings shared with you (in the data).

2. Here is how you can use pretrained embeddings: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

3. You'll have to create and maintain your own train/validation splits for the full data shared with you.

4. Your solution needs to be uploaded to GitHub repo of your team

In [None]:
# Dataset columns: qid, question_text, target

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset is
# read from a path under that mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training CSV from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/Datasets/DL P3 Qura spam/train (1) (1).csv'
df = pd.read_csv(csv_path)

In [None]:
# Print a summary of the loaded frame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   qid            1048575 non-null  object
 1   question_text  1048575 non-null  object
 2   target         1048575 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 24.0+ MB


In [None]:
# Split data
from sklearn.model_selection import train_test_split

# Hold out 20% of the questions for validation.
# - stratify: the target is heavily imbalanced, so keep the same spam ratio in
#   both splits instead of leaving it to chance.
# - random_state: fix the shuffle so a re-run reproduces the same split (and
#   therefore comparable metrics).
text_train, text_val, labels_train, labels_val = train_test_split(
    df['question_text'],
    df['target'],
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the tokenizer with num_words capped at 10000 and fit its vocabulary
# on the training split only (the validation text is never seen here).
tokenizer = Tokenizer(num_words=10_000)
tokenizer.fit_on_texts(text_train)



In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout,Flatten



In [None]:
# Run both splits through the fitted tokenizer, training split first.
train_sequences, val_sequences = [
    tokenizer.texts_to_sequences(texts) for texts in (text_train, text_val)
]


In [None]:
# Pad sequences
# Every question is padded/truncated to the same length so the batches are
# rectangular and the Flatten layer downstream sees a fixed size.
max_length = 200
train_sequences_padded = pad_sequences(train_sequences, maxlen=max_length)
val_sequences_padded = pad_sequences(val_sequences, maxlen=max_length)



# One-hot encode labels
# num_classes is pinned to 2 so the label width always matches the 2-unit
# softmax head, rather than being inferred from whichever labels happen to be
# present in a given split.
labels_train_onehot = to_categorical(labels_train, num_classes=2)
labels_val_onehot = to_categorical(labels_val, num_classes=2)



In [None]:
# Build model
# Take the vocabulary size from the tokenizer instead of repeating the literal,
# so the Embedding table cannot drift out of sync with the token ids produced.
vocab_size = tokenizer.num_words
embedding_dim = 100
model = Sequential()
# Declare the input shape up front so the model is built immediately (e.g.
# model.summary() works before training) and Flatten has a known size.
model.add(tf.keras.Input(shape=(max_length,), dtype='int32'))
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(Dropout(0.2))
model.add(Flatten())  # one flat vector of max_length * embedding_dim values per question

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
# Two-way softmax: pairs with the one-hot labels and categorical cross-entropy.
model.add(Dense(2, activation='softmax'))




In [None]:
# Compile model: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train model
# Validation loss is the overfitting signal: stop once it fails to improve for
# one epoch and roll the model back to the best epoch's weights, rather than
# always running all 5 epochs and keeping whatever the last one produced.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True,
)

# Keep the History object so the per-epoch curves can be inspected afterwards
# (assigning it also stops its repr from being echoed as the cell output).
history = model.fit(
    train_sequences_padded,
    labels_train_onehot,
    validation_data=(val_sequences_padded, labels_val_onehot),
    epochs=5,
    callbacks=[early_stopping],
)


Epoch 1/5
[1m26215/26215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1420s[0m 54ms/step - accuracy: 0.9473 - loss: 0.1454 - val_accuracy: 0.9526 - val_loss: 0.1212
Epoch 2/5
[1m26215/26215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1438s[0m 53ms/step - accuracy: 0.9543 - loss: 0.1194 - val_accuracy: 0.9526 - val_loss: 0.1265
Epoch 3/5
[1m26215/26215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1398s[0m 53ms/step - accuracy: 0.9569 - loss: 0.1115 - val_accuracy: 0.9524 - val_loss: 0.1257
Epoch 4/5
[1m26215/26215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1447s[0m 55ms/step - accuracy: 0.9609 - loss: 0.1013 - val_accuracy: 0.9516 - val_loss: 0.1411
Epoch 5/5
[1m26215/26215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1493s[0m 56ms/step - accuracy: 0.9639 - loss: 0.0952 - val_accuracy: 0.9510 - val_loss: 0.1425


<keras.src.callbacks.history.History at 0x7c30cad6d6f0>

In [None]:
# Evaluate model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# evaluate() yields the loss followed by the compiled metric (accuracy).
val_metrics = model.evaluate(val_sequences_padded, labels_val_onehot)
loss, accuracy = val_metrics
print(f'Validation accuracy: {accuracy:.3f}')



[1m6554/6554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 7ms/step - accuracy: 0.9513 - loss: 0.1416
Validation accuracy: 0.951


In [None]:

# Make predictions
# Each row of `predictions` holds the two softmax scores; the index of the
# larger one is the predicted class.
predictions = model.predict(val_sequences_padded)
predicted_labels = predictions.argmax(axis=1)



[1m6554/6554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 7ms/step


In [None]:
# Convert one-hot encoded val_labels to class labels.
# The result is stored back under the same name, so guard on the rank: without
# it, running this cell a second time would call argmax(axis=1) on an already
# 1-D array and raise.
if np.ndim(labels_val_onehot) == 2:
    labels_val_onehot = np.argmax(labels_val_onehot, axis=1)


In [None]:
# Evaluate predictions
# Score against the integer targets from the train/validation split directly.
# They are the same 0/1 labels the one-hot array was built from, and using them
# means this cell does not depend on `labels_val_onehot` having been converted
# back to class ids by an earlier cell.
print('Classification Report:')
print(classification_report(labels_val, predicted_labels))
print('Confusion Matrix:')
print(confusion_matrix(labels_val, predicted_labels))


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97    196709
           1       0.66      0.44      0.53     13006

    accuracy                           0.95    209715
   macro avg       0.81      0.71      0.75    209715
weighted avg       0.94      0.95      0.95    209715

Confusion Matrix:
[[193745   2964]
 [  7309   5697]]


this is completed