In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Load the question-pair dataset and draw a fixed-seed random subsample of it.
SAMPLE_SIZE = 30000   # number of rows drawn from the full dataset
SAMPLE_SEED = 2       # fixed seed so the same rows are drawn on every run

df = pd.read_csv("questions.csv")
new_df = df.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Pool the ids from both question columns, then report how many distinct ids
# there are and how many of them occur more than once in the sample.
qid = pd.Series(new_df['qid1'].tolist() + new_df['qid2'].tolist())
print('Number of unique questions',np.unique(qid).shape[0])
x = qid.value_counts()>1   # boolean mask indexed by question id
print('Number of questions getting repeated',x[x].shape[0])

Number of unique questions 59795
Number of questions getting repeated 176


In [8]:
# Question-id distribution: stack both id columns into a single Series, then
# report the number of distinct ids and the number that appear more than once.
qid = pd.concat([new_df['qid1'], new_df['qid2']], ignore_index=True)
print('Number of unique questions', np.unique(qid).size)
x = qid.value_counts().gt(1)   # True for every id seen at least twice
print('Number of questions getting repeated', int(x.sum()))

# Clean and process text data
def clean_and_process_text(text):
    """Return ``text`` stripped of surrounding whitespace, or "" if it is not a string.

    Parameters
    ----------
    text : object
        A raw cell value. Anything that is not a ``str`` (None, NaN, numbers,
        lists, arrays, ...) is treated as missing.

    Returns
    -------
    str
        The stripped string, or the empty string for non-string input.
    """
    # The type check must come first: calling pd.isna() on a list or array
    # returns an array, which cannot be used in a boolean context and raises
    # ValueError. Every missing-value marker (None, NaN, pd.NA) is a non-str,
    # so this single check also covers the missing-value case.
    if not isinstance(text, str):
        return ""
    return text.strip()

# Normalise both question columns: every value becomes a stripped string
# (clean_and_process_text returns "" for anything that is not a string).
new_df['question1'] = new_df['question1'].apply(clean_and_process_text)
new_df['question2'] = new_df['question2'].apply(clean_and_process_text)

# Calculate basic features
# Character lengths. Both columns hold only strings at this point, so no
# missing-value fill is needed.
new_df['q1_len'] = new_df['question1'].str.len()
new_df['q2_len'] = new_df['question2'].str.len()

# Word counts. Splitting on arbitrary whitespace (no separator argument) makes
# an empty question count as 0 words and keeps runs of consecutive spaces from
# inflating the count; "".split(" ") would return [""] and report 1 word.
new_df['q1_num_words'] = new_df['question1'].str.split().str.len()
new_df['q2_num_words'] = new_df['question2'].str.split().str.len()

# Shared tokenizer for the word-overlap features below
def _word_set(text):
    """Return the set of distinct lower-cased, whitespace-separated tokens in ``text``.

    None and NaN are treated as an empty question. Any other non-string value
    is converted with ``str()`` before tokenizing.
    """
    # `text != text` is True only for NaN; the isinstance guard keeps the
    # comparison from being evaluated on array-like values.
    if text is None or (isinstance(text, float) and text != text):
        return set()
    # split() with no argument splits on any whitespace run and never yields
    # empty tokens, so blank or double-spaced questions contribute no "" token.
    return set(str(text).lower().split())


# Function to count common words
def common_words(row):
    """Count the distinct words that appear in both questions of ``row``.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['question1']`` and ``row['question2']``.

    Returns
    -------
    int
        Size of the intersection of the two word sets; 0 if either key is
        missing or ``row`` cannot be indexed.
    """
    try:
        w1 = _word_set(row['question1'])
        w2 = _word_set(row['question2'])
    except (KeyError, IndexError, TypeError):
        # Best-effort: a malformed row contributes no overlap rather than
        # aborting the whole apply(). Unrelated errors still propagate.
        return 0
    return len(w1 & w2)


# Function to count total words
def total_words(row):
    """Return the sum of the distinct-word counts of both questions of ``row``.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['question1']`` and ``row['question2']``.

    Returns
    -------
    int
        ``len(words(question1)) + len(words(question2))``; 0 if either key is
        missing or ``row`` cannot be indexed.
    """
    try:
        w1 = _word_set(row['question1'])
        w2 = _word_set(row['question2'])
    except (KeyError, IndexError, TypeError):
        return 0
    return len(w1) + len(w2)

# Calculate word-based features
new_df['word_common'] = new_df.apply(common_words, axis=1)
new_df['word_total'] = new_df.apply(total_words, axis=1)
# Share of overlapping words, rounded to 2 decimals; rows with no words get 0
# instead of dividing by zero.
new_df['word_share'] = new_df.apply(
    lambda x: round(x['word_common']/x['word_total'], 2) if x['word_total'] > 0 else 0,
    axis=1,
)

# Prepare final dataset
# ques_df keeps the raw text for vectorizing; final_df keeps every remaining
# column of new_df apart from the ids and the two text columns.
ques_df = new_df[['question1','question2']]
final_df = new_df.drop(columns=['id','qid1','qid2','question1','question2'])

# Vectorize text data
from sklearn.feature_extraction.text import CountVectorizer
# All question1 texts followed by all question2 texts, so that one shared
# vocabulary is fitted and the matrix can be cut in half again afterwards.
questions = [clean_and_process_text(q) for q in ques_df['question1']] + [clean_and_process_text(q) for q in ques_df['question2']]

cv = CountVectorizer(max_features=3000)
# First half of the rows belongs to question1, second half to question2.
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(), 2)

# Create feature matrices
# Both halves come back with the same integer column labels. Prefix them so
# the combined frame has unique, all-string column names instead of two
# copies of every integer label.
temp_df1 = pd.DataFrame(q1_arr, index=ques_df.index).add_prefix('q1_bow_')
temp_df2 = pd.DataFrame(q2_arr, index=ques_df.index).add_prefix('q2_bow_')
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
final_df = pd.concat([final_df, temp_df], axis=1)

assert final_df.columns.is_unique, "duplicate column labels in final_df"


Number of unique questions 59795
Number of questions getting repeated 176


In [10]:
from sklearn.model_selection import train_test_split

# The first column of final_df is used as the target and every other column as
# a feature; 20% of the rows are held out with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    final_df.iloc[:, 1:].to_numpy(),
    final_df.iloc[:, 0].to_numpy(),
    random_state=1,
    test_size=0.2,
)

In [14]:
# Build and train a fully connected binary classifier on the prepared features.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# determine input dimension from prepared features
input_dim = X_train.shape[1]

L2_WEIGHT = 1e-4   # L2 penalty applied to every hidden layer's kernel

# One entry per hidden layer: (units, add BatchNormalization?, dropout rate).
HIDDEN_LAYERS = [
    (600, True, 0.5),
    (300, True, 0.5),
    (120, True, 0.5),
    (60, True, 0.5),
    (30, True, 0.4),
    (10, False, 0.3),
    (5, False, 0.2),
]

model = Sequential()
# An explicit Input layer replaces the deprecated `input_dim=` argument on the
# first Dense layer.
model.add(Input(shape=(input_dim,)))
for units, use_batch_norm, dropout_rate in HIDDEN_LAYERS:
    model.add(Dense(units, activation='relu',
                    kernel_regularizer=regularizers.l2(L2_WEIGHT)))
    if use_batch_norm:
        model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))

# Single sigmoid unit: the output is a probability for the positive class.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

# NOTE(review): `callbacks` used to be read from kernel state that is not
# defined anywhere in this cell, so a fresh Restart & Run All failed with
# NameError. The recorded training log reports a learning_rate value, which
# suggests a learning-rate callback was in use; the settings below are assumed
# defaults -- confirm them against the original definition.
callbacks = [
    # Stop once validation loss stops improving and keep the best weights.
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    # Halve the learning rate when validation loss plateaus.
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
]

# train
history = model.fit(X_train, y_train,
                    validation_split=0.2,
                    batch_size=64,
                    epochs=100,
                    callbacks=callbacks,
                    verbose=1)

Epoch 1/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - accuracy: 0.5698 - loss: 0.8436 - val_accuracy: 0.6254 - val_loss: 0.7629 - learning_rate: 0.0010
Epoch 2/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.6194 - loss: 0.7602 - val_accuracy: 0.6254 - val_loss: 0.7466 - learning_rate: 0.0010
Epoch 3/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.6273 - loss: 0.7335 - val_accuracy: 0.6254 - val_loss: 0.7300 - learning_rate: 0.0010
Epoch 4/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.6414 - loss: 0.6877 - val_accuracy: 0.6902 - val_loss: 0.6807 - learning_rate: 0.0010
Epoch 5/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.6633 - loss: 0.6574 - val_accuracy: 0.6975 - val_loss: 0.6900 - learning_rate: 0.0010
Epoch 6/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

KeyboardInterrupt: 