In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('/content/stack_overflow_dataset.csv')
df.head()

Unnamed: 0,Id,CreationDate,Score,Title,Body,AnswerCount,Tags
0,17016800,2013-06-10T04:15:05Z,0,Handling the EditText send keyboard event for ...,<pre><code>import com.example.methanegaszonege...,1,"['android', 'events', 'android-edittext', 'send']"
1,7685280,2011-10-07T09:20:41Z,7,EditText: how to enable/disable input?,<p>I have a 7x6 grid of EditText views. I want...,7,['android']
2,24178500,2014-06-12T07:13:00Z,1,Mobile web - Displaying a fixed div below a re...,<p>I want to have a relative div at the top of...,0,"['jquery', 'html', 'css', 'iphone', 'mobile']"
3,38820760,2016-08-08T03:10:28Z,0,How to create tabbed view in HTML?,<p>I'm trying to create a tabbed view in HTML ...,4,"['html', 'google-sites']"
4,3674120,2010-09-09T05:53:46Z,0,Problems decrypting HTTP Live Stream,<p>I have a single key encrypted HTTP Live Str...,2,"['http', 'stream', 'openssl', 'live', 'encrypt..."


In [4]:
df.drop(columns=['Id', 'CreationDate'], inplace=True)

In [5]:
df = df[df['Score'] > -3]
df.loc[:, 'full_text'] = df['Title'].astype(str) + ' ' + df['Body'].astype(str)
df.drop(columns=['Title', 'Body'], inplace=True)

In [6]:
nltk.download('stopwords', quiet=True)

def clean_text(text, max_words=None):
    if not isinstance(text, str):
        return ""

    # 1. Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Lowercase
    text = text.lower()

    # 3. Remove URLs and emails
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)

    # 4. Keep code content, remove backticks and markdown
    text = text.replace("`", "")
    text = re.sub(r"^[=\-*#~_]{3,}", "", text, flags=re.MULTILINE)

    # 5. Remove brackets and quotes (keep parentheses & angle brackets)
    text = re.sub(r"[{}\[\]\"']", "", text)

    # 6. Keep only relevant characters (preserve programming tokens)
    text = re.sub(r"[^a-zA-Z0-9\s\.\-+#_]", " ", text)

    # 8. Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [7]:
df['clean_text'] = df['full_text'].apply(lambda x: clean_text(x))

In [8]:
def rule_based_intent(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Debugging
    if any(kw in text for kw in [
        "error", "exception", "traceback", "crash", "bug", "not working",
        "issue", "fix", "debug", "fails", "failed", "stuck", "problem",
        "compile error", "unexpected", "can't", "cannot", "not loading", "won't load", "broken"
    ]):
        return "debugging"

    # How-to
    if re.search(r"\b(how to|how do i|how can i|how should i|how would i|how does one|steps to|guide to|tutorial|prevent|disable|change|create|detect|convert|make|resolve)\b", text):
        return "how-to"
    if re.search(r"\b(can i|should i|what if i|how do you|how many|how could)\b", text):
        return "how-to"

    # Concept
    if re.search(r"\b(what is|explain|definition of|describe|understand|meaning of|overview of|concept of|why does|why is|why should)\b", text):
        return "concept"

    # Comparison
    if any(phrase in text for phrase in [
        "difference between", "vs", "versus", "better than", "compare with",
        "which is better", "pros and cons"
    ]):
        return "comparison"

    # Optimization
    if any(kw in text for kw in [
        "optimize", "performance", "faster", "efficient", "efficiency",
        "improve speed", "reduce time", "speed up", "high performance",
        "scalable", "memory usage"
    ]):
        return "optimization"

    # Handle "are" questions
    if text.startswith("are "):
        if any(kw in text for kw in ["difference", "compare", "versus", "vs", "better", "advantages"]):
            return "comparison"
        elif any(kw in text for kw in ["meaning", "concept", "definition", "describe", "explain"]):
            return "concept"
        elif any(kw in text for kw in ["not", "issue", "problem", "error", "wrong", "fail"]):
            return "debugging"
        else:
            return "how-to"

    # Catch general action or UI behavior phrasing
    if any(kw in text for kw in [
        "change", "detect", "parse", "autocomplete", "load", "insert",
        "handler", "fetch", "validate", "submit", "fade", "onclick", "prevent"
    ]):
        return "how-to"

    # If starts with a verb or known tech task keyword, treat as how-to
    first_word = text.split()[0] if text else ""
    if first_word in {"create", "use", "build", "send", "access", "show", "trigger", "toggle"}:
        return "how-to"

    return None


In [9]:
df['intent_rule'] = df['clean_text'].apply(rule_based_intent)


In [10]:
df['intent_rule'].value_counts(dropna=False)


Unnamed: 0_level_0,count
intent_rule,Unnamed: 1_level_1
debugging,4979
how-to,3324
,1141
concept,292
optimization,71
comparison,69


In [11]:
df_labeled = df[df['intent_rule'].notna()].reset_index().copy()


In [12]:
df_labeled.drop(columns=['index', 'Score', 'AnswerCount', 'Tags', 'full_text'], inplace=True)

In [13]:
df_labeled['intent_rule'] = df_labeled['intent_rule'].replace({
    'optimization': 'how-to',
    'comparison': 'concept'})


In [14]:
df_labeled['intent_rule'].value_counts()

Unnamed: 0_level_0,count
intent_rule,Unnamed: 1_level_1
debugging,4979
how-to,3395
concept,361


In [15]:
intent_labels = {
    "debugging": 0,
    "how-to": 1,
    "concept": 2}

df_labeled['intent_label'] = df_labeled['intent_rule'].map(intent_labels)


In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    df_labeled['clean_text'],
    df_labeled['intent_label'],
    test_size=0.2,
    stratify=df_labeled['intent_label'],
    random_state=42)


In [17]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=6000,
        ngram_range=(1, 3),
        sublinear_tf=True)),
    ('clf', LogisticRegression(
    solver='liblinear',
    penalty='l1',
    C=1.5,
    class_weight='balanced',
    max_iter=2000))])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


In [18]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['debugging', 'how-to', 'concept']))
print(confusion_matrix(y_test, y_pred))


0.9456210646823126
              precision    recall  f1-score   support

   debugging       1.00      0.94      0.97       996
      how-to       0.90      0.98      0.94       679
     concept       0.75      0.74      0.74        72

    accuracy                           0.95      1747
   macro avg       0.88      0.88      0.88      1747
weighted avg       0.95      0.95      0.95      1747

[[932  58   6]
 [  0 667  12]
 [  0  19  53]]


In [19]:
import joblib

# Save
joblib.dump(pipeline, 'intent pred model.pkl')

['intent pred model.pkl']