In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')


In [10]:
# Location of the raw Stack Overflow questions export read by this notebook.
DATA_PATH = '/content/stack_overflow_dataset.csv'

# Load the dataset and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Id,CreationDate,Score,Title,Body,AnswerCount,Tags
0,17016800,2013-06-10T04:15:05Z,0,Handling the EditText send keyboard event for ...,<pre><code>import com.example.methanegaszonege...,1,"['android', 'events', 'android-edittext', 'send']"
1,7685280,2011-10-07T09:20:41Z,7,EditText: how to enable/disable input?,<p>I have a 7x6 grid of EditText views. I want...,7,['android']
2,24178500,2014-06-12T07:13:00Z,1,Mobile web - Displaying a fixed div below a re...,<p>I want to have a relative div at the top of...,0,"['jquery', 'html', 'css', 'iphone', 'mobile']"
3,38820760,2016-08-08T03:10:28Z,0,How to create tabbed view in HTML?,<p>I'm trying to create a tabbed view in HTML ...,4,"['html', 'google-sites']"
4,3674120,2010-09-09T05:53:46Z,0,Problems decrypting HTTP Live Stream,<p>I have a single key encrypted HTTP Live Str...,2,"['http', 'stream', 'openssl', 'live', 'encrypt..."


In [11]:
# Drop the identifier/timestamp columns that the rest of the notebook does not use.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run after the columns are already gone.
df = df.drop(columns=['Id', 'CreationDate'], errors='ignore')

In [12]:
# Keep posts scoring above -3, merge title and body into one text field, and
# drop the two source columns.
# The chain builds a fresh frame, so no value is written into a filtered slice
# (the original assignment triggered SettingWithCopyWarning). Missing titles or
# bodies become empty strings rather than the literal text "nan".
df = (
    df.loc[df['Score'] > -3]
    .assign(
        full_text=lambda d: (
            d['Title'].fillna('').astype(str)
            + ' '
            + d['Body'].fillna('').astype(str)
        )
    )
    .drop(columns=['Title', 'Body'])
)

In [13]:
# Fetch the NLTK 'stopwords' resource quietly.
# NOTE(review): nothing in the cells visible here reads `stopwords` -- confirm
# this download is still needed.
nltk.download('stopwords', quiet=True)

def clean_text(text, max_words=None):
    """Normalize a raw post (HTML + prose + code) into plain lowercase text.

    Processing order: strip HTML markup, lowercase, remove URLs and e-mail
    addresses, remove backticks and markdown rule lines, delete brackets and
    quotes, replace every remaining character that is not a letter, digit,
    whitespace or one of ``. - + # _`` with a space, collapse whitespace, and
    finally (optionally) truncate to ``max_words`` words.

    Args:
        text: The raw text. Any non-string value (e.g. NaN/None) yields "".
        max_words: If given, keep at most this many whitespace-separated
            words of the cleaned text. ``None`` (the default) keeps them all;
            negative values are treated as 0.

    Returns:
        The cleaned, single-line string.
    """
    if not isinstance(text, str):
        return ""

    # 1. Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Lowercase
    text = text.lower()

    # 3. Remove URLs and emails
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)

    # 4. Keep code content, remove backticks and markdown rule lines
    text = text.replace("`", "")
    text = re.sub(r"^[=\-*#~_]{3,}", "", text, flags=re.MULTILINE)

    # 5. Delete braces, square brackets and quotes outright (no space is
    #    inserted, so the neighbouring characters are joined).
    text = re.sub(r"[{}\[\]\"']", "", text)

    # 6. Keep only relevant characters (preserve programming tokens such as
    #    c++, c#, file.ext, snake_case). Everything else -- including
    #    parentheses and angle brackets -- becomes a space.
    text = re.sub(r"[^a-zA-Z0-9\s\.\-+#_]", " ", text)

    # 7. Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # 8. Optional truncation; previously this parameter was accepted but ignored.
    if max_words is not None:
        text = " ".join(text.split()[:max(max_words, 0)])

    return text


In [14]:
# Clean every merged post; clean_text is passed directly, no wrapper needed.
df['clean_text'] = df['full_text'].apply(clean_text)

In [20]:
import numpy as np
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

In [15]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import pandas as pd


# Corpus to embed: the cleaned posts as a plain Python list, in row order.
texts = df['clean_text'].tolist()

# Sentence-BERT checkpoint to load.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)


In [16]:
# Tunables for the encoding pass and where its result is persisted.
ENCODE_BATCH_SIZE = 64
EMBEDDINGS_PATH = "corpus_embeddings.npy"

print("Encoding corpus...")
corpus_embeddings = model.encode(
    texts,
    batch_size=ENCODE_BATCH_SIZE,
    show_progress_bar=True,
)
# Persist the embeddings to disk.
np.save(EMBEDDINGS_PATH, corpus_embeddings)

Encoding corpus...


Batches:   0%|          | 0/155 [00:00<?, ?it/s]

In [17]:
def cosine_similarity_matrix(query_vec, matrix):
    """Cosine similarity between one query vector and every row of a matrix.

    Args:
        query_vec: 1-D array-like of length ``d``.
        matrix: 2-D array-like of shape ``(n, d)``; one candidate per row.

    Returns:
        1-D ``numpy.ndarray`` of length ``n`` holding the cosine similarity of
        each row with ``query_vec``. Rows with zero norm (or an all-zero
        query) get similarity 0.0 instead of NaN/inf, so that a later
        ``argsort`` does not rank them first.
    """
    matrix = np.asarray(matrix)
    query_vec = np.asarray(query_vec)

    dot_products = np.dot(matrix, query_vec)
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)

    # Divide only where the denominator is non-zero; other entries stay 0.
    similarities = np.zeros(
        dot_products.shape,
        dtype=np.result_type(dot_products.dtype, norms.dtype),
    )
    np.divide(dot_products, norms, out=similarities, where=norms > 0)
    return similarities

In [26]:
# Free-text question to search for; it is embedded with the same `model`
# object that encoded the corpus.
query = "i need help in join function in sql"
query_embedding = model.encode(query)

In [27]:
# Score every corpus row against the query.
similarities = cosine_similarity_matrix(query_embedding, corpus_embeddings)

# Rank rows from most to least similar and keep the best top_k.
top_k = 5
top_indices = np.argsort(similarities)[::-1][:top_k]

print(f"\nQuery: {query}\nTop {top_k} similar results:\n")

# Select the column once; rows are then looked up by position, which matches
# the order in which the corpus was encoded.
clean_texts = df['clean_text']
for idx in top_indices:
    score = similarities[idx]
    similar_text = clean_texts.iloc[idx][:300].replace('\n', ' ')
    print(f"({score:.4f}) {similar_text}...\n")


Query: i need help in join function in sql
Top 5 similar results:

(0.5145) apply inner join between two tables result failed i have a two tables. names are tbl_module_contact and tbl_module_contact_details. now tbl_module_contact ---- columns are ---- contactid categoryid typeid customerid status all numeric now second table tbl_module_contact_details---- columns are -----...

(0.4694) left join product p on 1 can anyone explain this i have clipped the code to show what i am interested in select from select from company left join product p on 1 what does the left join product p on 1 mean first time i encounter this and it does not really make sense to me. the question might be stu...

(0.4557) how do i get related data of a different type in mysql so i have these two tables with a relation recipient-accountid recipient is also an int id and i want to make a select query which would take recipients username from the other table and the rest of the message data. i found that i should p