In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [34]:
df = pd.read_csv('/content/stack_overflow_dataset.csv')
df.head()

Unnamed: 0,Id,CreationDate,Score,Title,Body,AnswerCount,Tags
0,17016800,2013-06-10T04:15:05Z,0,Handling the EditText send keyboard event for ...,<pre><code>import com.example.methanegaszonege...,1,"['android', 'events', 'android-edittext', 'send']"
1,7685280,2011-10-07T09:20:41Z,7,EditText: how to enable/disable input?,<p>I have a 7x6 grid of EditText views. I want...,7,['android']
2,24178500,2014-06-12T07:13:00Z,1,Mobile web - Displaying a fixed div below a re...,<p>I want to have a relative div at the top of...,0,"['jquery', 'html', 'css', 'iphone', 'mobile']"
3,38820760,2016-08-08T03:10:28Z,0,How to create tabbed view in HTML?,<p>I'm trying to create a tabbed view in HTML ...,4,"['html', 'google-sites']"
4,3674120,2010-09-09T05:53:46Z,0,Problems decrypting HTTP Live Stream,<p>I have a single key encrypted HTTP Live Str...,2,"['http', 'stream', 'openssl', 'live', 'encrypt..."


In [35]:
df.drop(columns=['Id', 'CreationDate'], inplace=True)

In [36]:
df = df[df['Score'] > -3]
df.loc[:, 'full_text'] = df['Title'].astype(str) + ' ' + df['Body'].astype(str)
df.drop(columns=['Title', 'Body'], inplace=True)

In [37]:
nltk.download('stopwords', quiet=True)
stop_words = frozenset(stopwords.words('english'))

def clean_text(text, remove_stopwords=True, max_words=None):
    from bs4 import BeautifulSoup
    import re
    from nltk.corpus import stopwords

    stop_words = frozenset(stopwords.words('english'))

    # 1. Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Lowercase
    text = text.lower()

    # 3. Remove URLs and emails
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # 4. Remove backticks but keep code content
    text = text.replace("`", "")

    # 5. Remove divider lines
    text = re.sub(r'^[\-=*_#<>~]{3,}\s*$', '', text, flags=re.MULTILINE)

    # 6. Remove quotes and curly brackets (keep parentheses and angle brackets)
    text = text.replace('"', '').replace("'", "")
    text = re.sub(r"[{}\[\]]", "", text)

    # 7. Keep letters, digits, and programming-relevant punctuation
    text = re.sub(r"[^a-zA-Z0-9\s\.\-+#_]", " ", text)

    # 8. Optional: remove stopwords
    if remove_stopwords:
        tokens = text.split()
        tokens = [w for w in tokens if w not in stop_words]
        text = " ".join(tokens)

    # 9. Optional: truncate words
    if max_words:
        text = " ".join(text.split()[:max_words])

    # 10. Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['clean_text'] = df['full_text'].apply(clean_text)


In [38]:
def estimate_difficulty(text):
    text = str(text).strip()

    words = text.split()
    word_count = len(words)
    sentence_count = max(text.count("."), 1)
    avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
    avg_sentence_length = word_count / sentence_count

    # Rule thresholds (can be tuned)
    if word_count < 40 and avg_sentence_length < 15:
        return "easy"
    elif word_count <= 120 and avg_sentence_length <= 25:
        return "medium"
    return "hard"

# Apply
df["difficulty"] = df["clean_text"].apply(estimate_difficulty)


In [39]:
df['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
medium,5274
hard,3470
easy,1132


In [40]:
label_map = {'easy': 0, 'medium': 1, 'hard': 2}
df['difficulty_label'] = df['difficulty'].map(label_map)

X = df['clean_text']
y = df['difficulty_label']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


In [42]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=7000,
        min_df=2,
        ngram_range=(1, 2),
        sublinear_tf=True
    )),
    ('clf', RandomForestClassifier(
    n_estimators=800,
    criterion='entropy',
    min_samples_split=10,
    min_samples_leaf=3,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=0
))
])

pipeline.fit(X_train, y_train)


In [43]:
y_pred = pipeline.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['easy', 'medium', 'hard']))
print(confusion_matrix(y_test, y_pred))


0.7530364372469636
              precision    recall  f1-score   support

        easy       0.56      0.68      0.61       227
      medium       0.77      0.82      0.79      1055
        hard       0.81      0.68      0.74       694

    accuracy                           0.75      1976
   macro avg       0.71      0.73      0.72      1976
weighted avg       0.76      0.75      0.75      1976

[[155  72   0]
 [ 85 860 110]
 [ 39 182 473]]


In [44]:
import joblib

# Save
joblib.dump(pipeline, 'difficulty pred model.pkl')

# # Load
# model = joblib.load('model.pkl')


['difficulty pred model.pkl']