In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

## Loading the dataset

In [11]:
# Load the questions table; the file is decoded as Latin-1 instead of the
# pandas default (UTF-8).
ques_df = pd.read_csv("Questions.csv", encoding="latin1")

In [12]:
# Preview the first rows of the questions table to check the columns.
ques_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,<p>What are some of the ways to forecast demog...
2,22,66.0,2010-07-19T19:25:39Z,208,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...
3,31,13.0,2010-07-19T19:28:44Z,138,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...
4,36,8.0,2010-07-19T19:31:47Z,58,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n..."


In [33]:
# Load the tag table (one row per question Id / tag pair) with the default
# pandas encoding.
tags_df = pd.read_csv("Tags.csv")

In [34]:
# Preview the first rows of the tag table to check the columns.
tags_df.head()

Unnamed: 0,Id,Tag
0,1,bayesian
1,1,prior
2,1,elicitation
3,2,distributions
4,2,normality


### Merging the files into one DataFrame

In [44]:
# Collapse the tag table to one row per question Id, joining that question's
# tags into a single space-separated string.
tags_grouped = (
    tags_df.groupby("Id")["Tag"]
    .apply(" ".join)
    .reset_index()
)

# Left-join so every question is kept; questions without any tag rows end up
# with an empty string rather than NaN.
questions_df = pd.merge(ques_df, tags_grouped, how="left", on="Id")
questions_df["Tag"] = questions_df["Tag"].fillna("")


In [45]:
# NOTE(review): `ans_df` was used here without being loaded anywhere in this
# notebook, so the cell raised NameError on a fresh kernel. The file name below
# assumes the answers table sits next to Questions.csv as 'Answers.csv' and
# needs the same Latin-1 decoding — confirm against the dataset.
ans_df = pd.read_csv('Answers.csv', encoding='latin1')

# Keep one answer per question: after sorting by CreationDate, the first row
# for each ParentId is retained. The answer text column is renamed so it does
# not collide with the question's own 'Body' column.
first_answers = (
    ans_df.sort_values(by='CreationDate')
    .drop_duplicates(subset='ParentId', keep='first')
    .rename(columns={'Body': 'Answer'})
)

# Make the cell safe to re-run: remove columns a previous run of this merge
# left behind. Without this, a second run yields Answer_x/Answer_y (and
# ParentId_x/ParentId_y) and the fillna below raises KeyError.
questions_df = questions_df.drop(columns=['ParentId', 'Answer'], errors='ignore')

# Left-join so questions without any answer are kept; they get an empty string.
questions_df = questions_df.merge(
    first_answers[['ParentId', 'Answer']],
    how='left',
    left_on='Id',
    right_on='ParentId',
)
questions_df['Answer'] = questions_df['Answer'].fillna('')


In [46]:
# Build the text that will be vectorised: title, tags and answer joined with
# single spaces, with missing values treated as empty strings.
questions_df['TextForSimilarity'] = questions_df['Title'].fillna('').str.cat(
    [questions_df['Tag'].fillna(''), questions_df['Answer'].fillna('')],
    sep=' ',
)


Vectorizing the combined text column

### TF-IDF + Cosine Similarity

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize
# Fit a TF-IDF model on the combined text column, removing scikit-learn's
# built-in English stop words. `tfidf_matrix` has one row per row of
# questions_df, in the same order.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(questions_df['TextForSimilarity'])


### Defining a function to find similar questions

In [48]:
def find_similar_questions(user_question, top_n=5):
    """Return the stored questions most similar to a free-text query.

    The query is transformed with the already-fitted module-level
    ``vectorizer`` and compared against every row of ``tfidf_matrix`` using
    cosine similarity.

    Parameters
    ----------
    user_question : str
        The query text.
    top_n : int, default 5
        Maximum number of rows to return. Values of zero or below return an
        empty frame; values larger than the number of stored questions return
        all of them.

    Returns
    -------
    pandas.DataFrame
        The ``Id``, ``Title``, ``Tag`` and ``Answer`` columns of the matching
        rows of ``questions_df``, ordered from most to least similar.
    """
    # Clamp the requested count. The previous slice `[-top_n:]` became `[-0:]`
    # for top_n == 0, which selects the whole array instead of nothing, and a
    # negative top_n produced an equally meaningless slice.
    limit = max(top_n, 0)

    user_vec = vectorizer.transform([user_question])
    sim_scores = cosine_similarity(user_vec, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it first and then take the leading
    # `limit` positions; for positive counts this matches the old ordering.
    top_indices = sim_scores.argsort()[::-1][:limit]

    return questions_df.iloc[top_indices][['Id', 'Title', 'Tag', 'Answer']]


## Example

In [50]:
# Run a sample free-text query; with no top_n given, the function's default
# number of matches is returned.
query = "what is supervised learning?"
find_similar_questions(query)



Unnamed: 0,Id,Title,Tag,Answer
44910,163219,Standard function for comparing regression (su...,regression supervised-learning,
30480,203715,What algorithm should I use?,machine-learning bayesian supervised-learning,
46127,89571,Supervised or unsupervised learning problem,machine-learning svm neural-networks unsupervi...,"<p>If you try supervised learning algorithms, ..."
16362,231336,combining supervised and unsupervised learning,classification semi-supervised,
55886,235719,Symmetry in supervised learning models,supervised-learning symmetry,
