In [69]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import ast
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, hamming_loss, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [70]:
df = pd.read_csv('/content/stack_overflow_dataset.csv')
df.head()

Unnamed: 0,Id,CreationDate,Score,Title,Body,AnswerCount,Tags
0,17016800,2013-06-10T04:15:05Z,0,Handling the EditText send keyboard event for ...,<pre><code>import com.example.methanegaszonege...,1,"['android', 'events', 'android-edittext', 'send']"
1,7685280,2011-10-07T09:20:41Z,7,EditText: how to enable/disable input?,<p>I have a 7x6 grid of EditText views. I want...,7,['android']
2,24178500,2014-06-12T07:13:00Z,1,Mobile web - Displaying a fixed div below a re...,<p>I want to have a relative div at the top of...,0,"['jquery', 'html', 'css', 'iphone', 'mobile']"
3,38820760,2016-08-08T03:10:28Z,0,How to create tabbed view in HTML?,<p>I'm trying to create a tabbed view in HTML ...,4,"['html', 'google-sites']"
4,3674120,2010-09-09T05:53:46Z,0,Problems decrypting HTTP Live Stream,<p>I have a single key encrypted HTTP Live Str...,2,"['http', 'stream', 'openssl', 'live', 'encrypt..."


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            10000 non-null  int64 
 1   CreationDate  10000 non-null  object
 2   Score         10000 non-null  int64 
 3   Title         10000 non-null  object
 4   Body          10000 non-null  object
 5   AnswerCount   10000 non-null  int64 
 6   Tags          10000 non-null  object
dtypes: int64(3), object(4)
memory usage: 547.0+ KB


In [72]:
df.drop(columns=['Id', 'CreationDate'], inplace=True)

In [73]:
df = df[df['Score'] > -3]

In [74]:
df.loc[:, 'full_text'] = df['Title'].astype(str) + ' ' + df['Body'].astype(str)

In [75]:
df.drop(columns=['Title', 'Body'], inplace=True)

In [76]:
nltk.download('stopwords', quiet=True)
stop_words = frozenset(stopwords.words('english'))

def clean_text(text, remove_stopwords=True, max_words=None):

    stop_words = frozenset(stopwords.words('english'))

    # 1. Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Lowercase
    text = text.lower()

    # 3. Remove URLs and emails
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # 4. Remove backticks but keep code content
    text = text.replace("`", "")

    # 5. Remove divider lines
    text = re.sub(r'^[\-=*_#<>~]{3,}\s*$', '', text, flags=re.MULTILINE)

    # 6. Remove quotes and curly brackets (keep parentheses and angle brackets)
    text = text.replace('"', '').replace("'", "")
    text = re.sub(r"[{}\[\]]", "", text)

    # 7. Keep letters, digits, and programming-relevant punctuation
    text = re.sub(r"[^a-zA-Z0-9\s\.\-+#_]", " ", text)

    # 8. Eemove stopwords
    if remove_stopwords:
        tokens = text.split()
        tokens = [w for w in tokens if w not in stop_words]
        text = " ".join(tokens)

    # 9. Truncate words
    if max_words:
        text = " ".join(text.split()[:max_words])

    # 10. Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [77]:
df['combined_text'] = df['full_text'] + " " + df['Tags'].apply(lambda tags: " ".join(tags))
df['clean_text'] = df['combined_text'].apply(clean_text)


In [78]:
def safe_parse_tags(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except:
            return []
    return []

# Step 2: Clean tags
def clean_tags(tag_list):
    return [tag.strip().lower() for tag in tag_list if isinstance(tag, str) and tag.strip() != '']

df['Tags'] = df['Tags'].apply(safe_parse_tags)

In [79]:
all_tags = [tag for tags in df['Tags'] for tag in tags]
tag_counts = Counter(all_tags)
print(tag_counts.most_common(300))

[('javascript', 972), ('java', 933), ('php', 788), ('c#', 778), ('android', 706), ('jquery', 621), ('python', 508), ('html', 485), ('ios', 358), ('css', 345), ('mysql', 335), ('c++', 327), ('sql', 276), ('asp.net', 242), ('objective-c', 198), ('c', 193), ('.net', 193), ('ruby-on-rails', 189), ('angularjs', 160), ('iphone', 153), ('arrays', 139), ('sql-server', 135), ('json', 134), ('ajax', 133), ('xml', 127), ('regex', 114), ('ruby', 113), ('linux', 111), ('asp.net-mvc', 108), ('database', 107), ('r', 104), ('node.js', 101), ('wpf', 100), ('windows', 99), ('django', 98), ('eclipse', 93), ('spring', 92), ('html5', 87), ('excel', 85), ('xcode', 85), ('swift', 84), ('string', 80), ('multithreading', 79), ('vb.net', 79), ('wordpress', 65), ('git', 64), ('oracle', 62), ('algorithm', 61), ('twitter-bootstrap', 60), ('facebook', 59), ('mongodb', 59), ('performance', 58), ('winforms', 57), ('bash', 56), ('swing', 54), ('hibernate', 50), ('image', 49), ('vba', 49), ('sqlite', 49), ('list', 48),

In [80]:
# Step 1: Get the top 300 tags
top_tags = set([tag for tag, _ in tag_counts.most_common(300)])

# Step 2: Filter each row to keep only those
df['Tags'] = df['Tags'].apply(lambda tags: [tag for tag in tags if tag in top_tags])


In [81]:
df = df[df['Tags'].apply(len) > 0]

In [82]:
mlb = MultiLabelBinarizer()
y_tags = mlb.fit_transform(df['Tags'])

# Step 2: Create tag DataFrame with proper column names
tag_df = pd.DataFrame(y_tags, columns=mlb.classes_)

# Step 3: Reset index to align before concatenation
df = df.reset_index(drop=True)
tag_df = tag_df.reset_index(drop=True)


In [83]:
df_with_tags = pd.concat([df, tag_df], axis=1)

In [84]:
X = df_with_tags['clean_text']
y = df_with_tags[mlb.classes_]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [85]:
model = make_pipeline(
    TfidfVectorizer(max_features=20000, ngram_range=(1,3), min_df=2, max_df=0.9, sublinear_tf=True),
    OneVsRestClassifier(LogisticRegression(penalty='l2',
    solver='lbfgs',
    class_weight='balanced',
    C=3,
    max_iter=2000,
    n_jobs=-1)))

model.fit(X_train, y_train)


In [86]:
y_prob = model.predict_proba(X_test)
y_pred = (y_prob >= 0.5).astype(int)

print("Hamming Loss:", hamming_loss(y_test, y_pred))
print("F1 Score (Micro):", f1_score(y_test, y_pred, average='micro'))
print("F1 Score (Macro):", f1_score(y_test, y_pred, average='macro'))

print(classification_report(y_test, y_pred))


Hamming Loss: 0.00541343669250646
F1 Score (Micro): 0.5856759429297923
F1 Score (Macro): 0.42238708583775597
              precision    recall  f1-score   support

           0       1.00      0.60      0.75        10
           1       0.15      0.25      0.19        28
           2       0.50      0.20      0.29         5
           3       0.00      0.00      0.00         3
           4       0.52      0.83      0.64        18
           5       0.46      0.38      0.41        16
           6       0.67      0.22      0.33         9
           7       0.90      0.80      0.85       127
           8       0.00      0.00      0.00         5
           9       0.40      1.00      0.57         2
          10       0.50      0.50      0.50         4
          11       0.25      0.67      0.36         3
          12       0.00      0.00      0.00         5
          13       0.86      0.71      0.78        35
          14       0.33      0.33      0.33         3
          15       0.80   

In [97]:
import joblib

# Save
joblib.dump(model, 'tag pred model.pkl')

['tag pred model.pkl']

In [87]:
# import numpy as np
# import tensorflow as tf
# import tensorflow.keras.backend as K
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score, hamming_loss, classification_report
# from transformers import DistilBertTokenizer, TFDistilBertModel
# from tensorflow.keras.layers import Input, Dropout, Dense, Lambda, BatchNormalization
# from tensorflow.keras.models import Model
# from tensorflow.keras.callbacks import EarlyStopping


In [88]:
# # Assume df_with_tags["clean_text"] and mlb-transformed df_with_tags["Tags"] are ready
# X = df_with_tags["clean_text"].tolist()
# y = mlb.transform(df_with_tags["Tags"])

# # Train-validation split
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [89]:
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# def tokenize(texts, max_len=512):
#     return tokenizer(
#         texts,
#         padding=True,
#         truncation=True,
#         max_length=max_len,
#         return_tensors="tf"
#     )

# train_encodings = tokenize(X_train)
# val_encodings = tokenize(X_val)


In [90]:
# batch_size = 16

# train_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(train_encodings),
#     y_train
# )).batch(batch_size)

# val_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(val_encodings),
#     y_val
# )).batch(batch_size)


In [91]:
# label_freq = np.sum(y_train, axis=0)
# total_samples = y_train.shape[0]

# class_weights = total_samples / (len(label_freq) * label_freq)
# class_weights = np.clip(class_weights, 1.0, 10.0)  # Prevent extreme values

# class_weights_tensor = tf.constant(class_weights, dtype=tf.float32)


In [92]:
# def get_weighted_loss(weights):
#     def loss(y_true, y_pred):
#         y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
#         loss_pos = -y_true * K.log(y_pred) * weights
#         loss_neg = -(1 - y_true) * K.log(1 - y_pred)
#         return K.mean(loss_pos + loss_neg)
#     return loss

# loss_fn = get_weighted_loss(class_weights_tensor)


In [93]:
# bert_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

# def create_model(num_labels):
#     input_ids = Input(shape=(None,), dtype=tf.int32, name="input_ids")
#     attention_mask = Input(shape=(None,), dtype=tf.int32, name="attention_mask")

#     cls_output = Lambda(lambda x: bert_model(x)[0][:, 0], output_shape=(768,))(
#         {'input_ids': input_ids, 'attention_mask': attention_mask}
#     )

#     x = Dense(128, activation='relu')(cls_output)
#     x = BatchNormalization()(x)
#     x = Dropout(0.2)(x)

#     x = Dense(64, activation='relu')(x)
#     x = BatchNormalization()(x)
#     x = Dropout(0.2)(x)


#     output = Dense(num_labels, activation='sigmoid')(x)

#     return Model(inputs=[input_ids, attention_mask], outputs=output)

# model = create_model(num_labels=y_train.shape[1])

# model.summary()


In [94]:
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
#     loss=loss_fn,
#     metrics=['accuracy']
# )

# early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# model.fit(
#     train_dataset,
#     validation_data=val_dataset,
#     epochs=5,
#     callbacks=[early_stopping]
# )


In [95]:
# y_pred_probs = model.predict(val_dataset)
# y_pred = (y_pred_probs >= 0.4).astype(int)  # Threshold tuning possible

# print("🔁 Hamming Loss:", hamming_loss(y_val, y_pred))
# print("✅ F1 Score (Micro):", f1_score(y_val, y_pred, average='micro'))
# print("✅ F1 Score (Macro):", f1_score(y_val, y_pred, average='macro'))

# # Optional detailed per-tag performance
# print(classification_report(y_val, y_pred, target_names=mlb.classes_))


In [96]:
# for t in [0.2, 0.3, 0.4, 0.5]:
#     y_pred = (y_pred_probs >= t).astype(int)
#     print(f"\nThreshold: {t}")
#     print("Hamming Loss:", hamming_loss(y_val, y_pred))
#     print("F1 Micro:", f1_score(y_val, y_pred, average='micro'))
#     print("F1 Macro:", f1_score(y_val, y_pred, average='macro'))
#     print("F1 Samples:", f1_score(y_val, y_pred, average='samples'))
