In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization, GRU, concatenate
from keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from nltk import word_tokenize
from sklearn.metrics import classification_report, f1_score

In [2]:
!wget -O stacksample.zip https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip?dl=0
!unzip stacksample.zip -d /content/

--2025-01-02 08:46:57--  https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 302 Found


  pid, fd = os.forkpty()


Location: https://www.dropbox.com/scl/fi/bnabda9cee2yuyi6wesmv/stacksample.zip?rlkey=sucdc1cm2m7fnwtzhykv24g8z&dl=0 [following]
--2025-01-02 08:46:57--  https://www.dropbox.com/scl/fi/bnabda9cee2yuyi6wesmv/stacksample.zip?rlkey=sucdc1cm2m7fnwtzhykv24g8z&dl=0
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc514b28c17f34b32aab35f15a39.dl.dropboxusercontent.com/cd/0/inline/ChYWFVnDrgIf9yBnHgNQTYeQRXlz-rAyMoVMi8ajuaVuJkHrWX8n5SJ8k__mLOOo-ic7-cERryHrRHcfb20CuQzQZcWdpaiRQJSvhtx95QlLWpT-WbEDeKq3zVk-LTrrfcs/file# [following]
--2025-01-02 08:46:58--  https://uc514b28c17f34b32aab35f15a39.dl.dropboxusercontent.com/cd/0/inline/ChYWFVnDrgIf9yBnHgNQTYeQRXlz-rAyMoVMi8ajuaVuJkHrWX8n5SJ8k__mLOOo-ic7-cERryHrRHcfb20CuQzQZcWdpaiRQJSvhtx95QlLWpT-WbEDeKq3zVk-LTrrfcs/file
Resolving uc514b28c17f34b32aab35f15a39.dl.dropboxusercontent.com (uc514b28c17f34b32aab35f15a39.dl.dropboxusercontent.com)... 162.125.1.15, 2620:100:6016:15::a27d:10f
C

In [3]:
df_tags = pd.read_csv('/content/Tags.csv', encoding='iso-8859-1')

In [4]:
top10_tags = ['javascript', 'java', 'c#', 'php', 'android', 'jquery', 'python', 'html', 'c++', 'ios']
df_tags = df_tags[df_tags['Tag'].isin(top10_tags)]

In [5]:
questions = pd.read_csv('/content/Questions.csv', encoding='iso-8859-1')
questions = questions.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate', 'Score'])
data = pd.merge(questions, df_tags, on='Id')

In [6]:
mlb = MultiLabelBinarizer()
data['Tags'] = data.groupby('Id')['Tag'].transform(lambda x: list(set(x)))
data = data.drop_duplicates(subset=['Id']).reset_index(drop=True)
y = mlb.fit_transform(data['Tags'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.25, random_state=24)

In [8]:
max_len_t, max_len_b = 18, 700
vocab_size_t, vocab_size_b = 10000, 50000

In [9]:
tok_title = Tokenizer(num_words=vocab_size_t, char_level=False, split=' ')
tok_title.fit_on_texts(X_train['Title'])
seq_train_t = tok_title.texts_to_sequences(X_train['Title'])
seq_test_t = tok_title.texts_to_sequences(X_test['Title'])
seq_train_t = sequence.pad_sequences(seq_train_t, maxlen=max_len_t, padding='post')
seq_test_t = sequence.pad_sequences(seq_test_t, maxlen=max_len_t, padding='post')

In [10]:
tok_body = Tokenizer(num_words=vocab_size_b, char_level=False, split=' ')
tok_body.fit_on_texts(X_train['Body'])
seq_train_b = tok_body.texts_to_sequences(X_train['Body'])
seq_test_b = tok_body.texts_to_sequences(X_test['Body'])
seq_train_b = sequence.pad_sequences(seq_train_b, maxlen=max_len_b, padding='post')
seq_test_b = sequence.pad_sequences(seq_test_b, maxlen=max_len_b, padding='post')

In [11]:
def RNN():
    # Title Input
    title_input = Input(shape=[max_len_t], name='title_input')
    title_embed = Embedding(input_dim=vocab_size_t, output_dim=200, input_length=max_len_t, mask_zero=True)(title_input)
    title_gru = GRU(128,use_cudnn=False)(title_embed)
    
    # Body Input
    body_input = Input(shape=[max_len_b], name='body_input')
    body_embed = Embedding(input_dim=vocab_size_b, output_dim=200, input_length=max_len_b, mask_zero=True)(body_input)
    body_gru = GRU(128,use_cudnn=False)(body_embed)
    
    # Combine GRU Outputs
    combined = concatenate([title_gru, body_gru])
    dense1 = Dense(256, activation='relu')(combined)
    dropout1 = Dropout(0.3)(dense1)
    batch_norm = BatchNormalization()(dropout1)
    dense2 = Dense(128, activation='relu')(batch_norm)
    main_output = Dense(len(mlb.classes_), activation='sigmoid', name='main_output')(dense2)
    
    return Model(inputs=[title_input, body_input], outputs=[main_output])

In [12]:
model = RNN()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [15]:
history = model.fit(
    [seq_train_t, seq_train_b], y_train,
    validation_data=([seq_test_t, seq_test_b], y_test),
    epochs=25,
    batch_size=800)

Epoch 1/25
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 356ms/step - accuracy: 0.4142 - loss: 0.0595 - val_accuracy: 0.3932 - val_loss: 0.1465
Epoch 2/25
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 355ms/step - accuracy: 0.4265 - loss: 0.0481 - val_accuracy: 0.4249 - val_loss: 0.1674
Epoch 3/25
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 355ms/step - accuracy: 0.4343 - loss: 0.0378 - val_accuracy: 0.4042 - val_loss: 0.1927
Epoch 4/25
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 355ms/step - accuracy: 0.4415 - loss: 0.0297 - val_accuracy: 0.4490 - val_loss: 0.2283
Epoch 5/25
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 355ms/step - accuracy: 0.4645 - loss: 0.0234 - val_accuracy: 0.4439 - val_loss: 0.2624
Epoch 6/25
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 353ms/step - accuracy: 0.5029 - loss: 0.0188 - val_accuracy: 0.4667 - val_loss: 0.2792
Epoc

In [17]:
predictions = model.predict([seq_test_t, seq_test_b])
print(f"F1 Score: {f1_score(y_test, predictions > 0.55, average='samples')}")
print(classification_report(y_test, predictions > 0.55, target_names=mlb.classes_))

[1m5519/5519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 31ms/step
F1 Score: 0.8768640000282765
              precision    recall  f1-score   support

           #       0.89      0.88      0.89     24011
           +       0.85      0.88      0.86     11643
           a       0.91      0.91      0.91     65492
           c       0.88      0.89      0.88     54403
           d       0.81      0.83      0.82     18131
           e       0.78      0.75      0.76     17696
           h       0.91      0.91      0.91     46259
           i       0.83      0.84      0.84     48363
           j       0.89      0.88      0.89     65057
           l       0.67      0.69      0.68      6032
           m       0.67      0.69      0.68      6032
           n       0.88      0.88      0.88     33996
           o       0.90      0.90      0.90     45479
           p       0.89      0.89      0.89     58976
           q       0.78      0.75      0.76     17696
           r       0.87   

  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
top10_tags

['javascript',
 'java',
 'c#',
 'php',
 'android',
 'jquery',
 'python',
 'html',
 'c++',
 'ios']