In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from collections import Counter
import warnings
# NOTE(review): blanket filter — this silences every warning raised below,
# including any emitted by pandas or scikit-learn during fitting/scoring.
warnings.filterwarnings("ignore")

# ---- Configuration ---------------------------------------------------------
DATA_PATH = "cleaned_stackoverflow.csv"
TOP_N_TAGS = 50       # keep only the N most frequent tags as labels
TEST_SIZE = 0.2       # fraction of rows held out for the final evaluation
RANDOM_STATE = 42     # seed for the train/test split

# Load dataset
df = pd.read_csv(DATA_PATH)

# Preprocess
# Keep only rows whose `tags` value is a string. The explicit .copy() makes the
# column assignments below act on an independent frame rather than on a
# filtered slice of the loaded one.
df = df[df['tags'].apply(lambda x: isinstance(x, str))].copy()
df['text'] = df['text'].fillna('')
df['question'] = df['question'].fillna('')
df['combined_text'] = df['question'] + ' ' + df['text']
# Split the comma-separated tag string into a list of stripped tags. Empty
# pieces are skipped so values such as "python,,pandas" or "python," never
# produce an empty-string tag that could end up as a label.
df['tags'] = df['tags'].apply(
    lambda x: [tag.strip() for tag in x.split(',') if tag.strip()]
)

# Top-N tags only
tag_counts = Counter(t for tags in df['tags'] for t in tags)
top_tags = [tag for tag, _ in tag_counts.most_common(TOP_N_TAGS)]
top_tag_set = set(top_tags)  # set => O(1) membership test per tag
df['filtered_tags'] = df['tags'].apply(
    lambda tags: [t for t in tags if t in top_tag_set]
)
# Drop rows that have none of the retained tags.
df = df[df['filtered_tags'].map(len) > 0]

# Binarize labels: one indicator column per retained tag
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df['filtered_tags'])
X = df['combined_text']

# Split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Scorer: micro-averaged F1 is the model-selection criterion for the grid search
f1_scorer = make_scorer(f1_score, average='micro')

# Logistic Regression model
# TF-IDF features feed one binary logistic regression per tag (one-vs-rest).
# The vectorizer lives inside the pipeline, so it is refit on each CV training
# fold by GridSearchCV.
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2), stop_words='english')),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear', max_iter=2000)))
])

# Candidate values for the logistic regression `C` parameter
params = {
    'clf__estimator__C': [1, 10]
}

# Train & Evaluate
print("Tuning Logistic Regression...")
grid = GridSearchCV(model, param_grid=params, scoring=f1_scorer, cv=3, verbose=1, n_jobs=-1)
grid.fit(X_train, Y_train)
best_model = grid.best_estimator_

# Evaluate the selected model on the held-out test split
Y_pred = best_model.predict(X_test)

results = {
    'Accuracy': accuracy_score(Y_test, Y_pred),
    'Precision (micro)': precision_score(Y_test, Y_pred, average='micro'),
    'Recall (micro)': recall_score(Y_test, Y_pred, average='micro'),
    'F1 Score (micro)': f1_score(Y_test, Y_pred, average='micro'),
    'Precision (macro)': precision_score(Y_test, Y_pred, average='macro'),
    'Recall (macro)': recall_score(Y_test, Y_pred, average='macro'),
    'F1 Score (macro)': f1_score(Y_test, Y_pred, average='macro'),
    'Best Params': grid.best_params_
}

# Show Results
print("\nLogistic Regression Results:")
for metric, val in results.items():
    # 'Best Params' is a dict and cannot take the float format spec
    if isinstance(val, dict):
        print(f"{metric}: {val}")
    else:
        print(f"{metric}: {val:.4f}")

Tuning Logistic Regression...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

Logistic Regression Results:
Accuracy: 0.3420
Precision (micro): 0.7678
Recall (micro): 0.4873
F1 Score (micro): 0.5962
Precision (macro): 0.7283
Recall (macro): 0.4596
F1 Score (macro): 0.5580
Best Params: {'clf__estimator__C': 10}


In [2]:
# Free-text questions to run through the tuned tagger
new_texts = [
    "How to use pandas to read a CSV file in Python",
    "Best practices for training deep learning models in TensorFlow",
    "How to optimize SQL queries for better performance"
]

# Run the fitted pipeline; predictions come back in binary indicator format
Y_pred_new = best_model.predict(new_texts)

# Translate the indicator rows back into tag names with the fitted binarizer
predicted_tags = mlb.inverse_transform(Y_pred_new)

# Report every input next to its predicted tags (blank line between entries)
for text, tags in zip(new_texts, predicted_tags):
    print(f"Text: {text}", f"Predicted Tags: {tags}\n", sep="\n")

Text: How to use pandas to read a CSV file in Python
Predicted Tags: ('python',)

Text: Best practices for training deep learning models in TensorFlow
Predicted Tags: ('python',)

Text: How to optimize SQL queries for better performance
Predicted Tags: ()

