In [None]:
# Mount Google Drive so that files stored under "MyDrive" are readable from
# this runtime. The argument is the directory to mount Drive at (not a file,
# and it may not contain spaces); the Drive paths used in this notebook are
# all rooted at /content/drive/MyDrive, so Drive has to be mounted at
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
from tensorflow.keras.models import Sequential  # Sequential model for neural network
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Embedding  # Neural network layers
from tensorflow.keras.preprocessing.text import Tokenizer  # Text tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Sequence padding
from nltk.tokenize import word_tokenize  # Text tokenization
from nltk.stem import WordNetLemmatizer  # Word lemmatization
from nltk.corpus import stopwords  # Stopwords removal
from string import punctuation  # Punctuation characters
import numpy as np  # Numerical operations
from tqdm import tqdm  # Progress bar
tqdm.pandas()  # Enable pandas progress_apply() with progress bar

"""
The dataset contains Quora questions with labels indicating whether they're sincere (0) or insincere (1)
Columns: qid (question id), question_text, target (0/1)
"""
# Read the Quora question CSV from the mounted Drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/Datasets/Quora Text Classification Data.csv')
# head() returns the first 5 rows; a notebook only renders that value when this
# is the last expression of its cell.
df.head()  # Display first 5 rows to inspect data

# Download the NLTK resources the preprocessing step depends on.
# Both tokenizer packages are requested: older NLTK releases load the pickled
# 'punkt' models, while recent releases (3.8.2 and later) make word_tokenize
# load 'punkt_tab' instead and raise LookupError when it is missing.
import nltk
nltk.download('stopwords')  # Stopword lists read by stopwords.words('english')
nltk.download('punkt')      # Tokenizer models for older NLTK releases
nltk.download('punkt_tab')  # Tokenizer tables for recent NLTK releases
nltk.download('wordnet')    # Lexical database used by WordNetLemmatizer

# Tokens to discard during cleaning: the English stopword list plus every
# character in string.punctuation.
# Kept as a set so each `w not in stop_words` test in cleaning() is a hash
# lookup instead of a linear scan of a list; the filter runs once per token
# over the whole dataset, so this is the hot path of preprocessing.
stop_words = set(stopwords.words('english')) | set(punctuation)
lem = WordNetLemmatizer()  # Shared lemmatizer (reduces words to their base form)

def cleaning(text):
    """
    Normalise one question into a cleaned, space-separated string.

    Steps:
    1. Convert to lowercase
    2. Tokenize into words with word_tokenize
    3. Drop every token found in stop_words (stopwords and punctuation)
    4. Lemmatize the remaining tokens with lem
    5. Rejoin into a single string

    Args:
        text: Raw question text. Any non-string value (for example the float
            NaN that pandas stores for an empty CSV cell) is treated as an
            empty question.

    Returns:
        The cleaned text, or '' when the input is not a string or no token
        survives the filter.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN, which has no .lower(); returning
        # an empty string keeps a single bad row from aborting the whole apply.
        return ''
    tokens = word_tokenize(text.lower())
    # Filter and lemmatize in a single pass over the tokens.
    return ' '.join(lem.lemmatize(tok) for tok in tokens if tok not in stop_words)

# Run cleaning() on every question and store the result in a new 'Clean Text'
# column. progress_apply is the tqdm counterpart of Series.apply (made
# available by the tqdm.pandas() call), so a progress bar is shown while the
# rows are processed.
df['Clean Text'] = df['question_text'].progress_apply(cleaning)

# Unzip GloVe word embeddings (pre-trained word vectors)
# GloVe: Global Vectors for Word Representation - captures semantic relationships
!unzip '/content/drive/MyDrive/Word Embeddings/glove.42B.300d.zip'

# Load the GloVe embeddings into a dictionary {word: float32 vector}.
# Each line of the file has the form "<word> <v1> <v2> ... <vN>", with fields
# separated by single spaces (42B-token corpus, 300-dimensional vectors).
embedding_values = {}
# The context manager guarantees the file handle is closed even if parsing
# fails part-way. The encoding is given explicitly so decoding does not depend
# on the platform default.
with open('/content/glove.42B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        value = line.rstrip('\n').split(' ')
        word = value[0]  # First field is the word itself
        coef = np.asarray(value[1:], dtype="float32")  # Remaining fields are the vector
        embedding_values[word] = coef

# Features are the cleaned questions; labels are the 0/1 targets.
x, y = df['Clean Text'], df['target']

# Fit a Keras tokenizer on every cleaned question to build the word -> integer
# vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

# Encode each question as a list of word indices, then bring every list to a
# common length of 300 (pad_sequences fills shorter ones with 0 and truncates
# longer ones).
seq = tokenizer.texts_to_sequences(x)
pad_seq = pad_sequences(seq, maxlen=300)

# Tokenizer indices start at 1 and index 0 is reserved (it is the padding
# value), so the embedding table needs one more row than there are words.
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary size: {vocab_size}")

# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows for words that GloVe does not contain keep their
# initial all-zero value.
# The matrix is allocated as float32 because the GloVe vectors are loaded as
# float32; NumPy's float64 default would double the memory footprint without
# adding any precision.
embedding_matrix = np.zeros((vocab_size, 300), dtype="float32")
for word, i in tqdm(tokenizer.word_index.items()):
    value = embedding_values.get(word)
    if value is not None:
        embedding_matrix[i] = value  # Copy the pre-trained vector into row i

"""
Model Architecture:
1. Embedding Layer: Uses pre-trained GloVe vectors (frozen during training)
2. LSTM Layer: Processes sequential information (50 units)
3. Dense Layer: 128 ReLU units for feature transformation
4. Output Layer: Single sigmoid unit for binary classification
"""
# Assemble the network as one ordered list of layers:
#   Embedding - initialised from embedding_matrix and frozen (trainable=False)
#   LSTM      - 50 units, emits only its final output (return_sequences=False)
#   Dense     - 128 ReLU units
#   Dense     - single sigmoid unit producing the binary prediction
model = Sequential([
    Embedding(vocab_size, 300, input_length=300,
              weights=[embedding_matrix], trainable=False),
    LSTM(50, return_sequences=False),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy reported during training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 5 epochs, holding out 20% of the samples for validation.
history = model.fit(pad_seq, y, epochs=5, validation_split=0.2)