In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json

In [3]:
data = pd.read_csv("capstone_qq.csv")
data = data.iloc[1:, :]
# data.head(1)

In [4]:
questions_ids = list(data.questions_id.unique())
# len(questions_ids)
questions_df = data[data.questions_id.isin(questions_ids)].dropna()
questions = list(questions_df.questions_body_clean)

In [5]:
vec = TfidfVectorizer(max_features=5000)
vec.fit_transform(questions)

<79059x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1709781 stored elements in Compressed Sparse Row format>

In [6]:
vec_matrix = vec.transform(questions).todense()

In [7]:
def find_answers(q_id):
    answers = list(data[data.questions_id == q_id].answers_body_clean)
    return answers

In [8]:
def find_similar_questions(question_body: str):
    x = vec.transform([question_body]).todense()
    scores = cosine_similarity(x, vec_matrix)
    similar_inds = scores.argsort()[0][-6:-1]
    dict = {}
    dict["input_question_body"] = question_body
    re = []
    for i in similar_inds:
        q_id = questions_df.iloc[i,:].questions_id
        sim_question = questions_df.iloc[i,:].questions_body_clean
        answers = find_answers(q_id)
        re.append({"question_id": q_id, 
                   "question_body": sim_question,
                   "score": scores[0][i],
                   "answers": answers})
    dict["similar_questions"] = re
    return dict

In [9]:
body_example = questions[1]
re = find_similar_questions(body_example)
print(json.dumps(re, indent=4, sort_keys = True))

{
    "input_question_body": "I am Priyanka from Bangalore . Now am in 10th std . When I go to college I should not get confused on what I want to take to become army officer. So I am asking this question  #military #army",
    "similar_questions": [
        {
            "answers": [
                "There are a couple of options to become a commissioned officer in the United States military:\n1. Attend a 4-year college that has an ROTC program (Reserve Officer Training Program)... apply for an ROTC slot which prepares and leads to you becoming commissioned and thus ready to lead at an entry-level in our military.  You will need to meet academic, physical, psychological, and character minimums in order to be accepted into an ROTC program as well as to be commissioned.\n2. Apply and be accepted into a military institute (e.g. West Point (Army), Annapolis (Navy), etc.).\n3. Join the military as a Private (entry level enlisted personnel), show leadership aptitude, and apply for an OCS sl

In [10]:
body_example = 'how to become a data scientist'
re = find_similar_questions(body_example)
print(json.dumps(re, indent=4, sort_keys = True))

{
    "input_question_body": "how to become a data scientist",
    "similar_questions": [
        {
            "answers": [
                "The definition of this newfangled title \"data scientist\" is basically a super-awesome statistician, with a variety of skills and experience to match (I see the title as a senior title, and have seen it at the C-level).  Since you can apply statistics to a wide variety of information, this title would be transportable across a variety of industries where extracting information from large amount of structured or unstructured data (aka \"big data\") is useful, such as pharmaceutical, network security, defense/intelligence, medical, financial, government... If you're interested in quantitative analysis of the financial markets, my suggestion is to start working on a model.  In fact, a financial quant I used to work with spent his free time working on a football (aka soccer) player performance prediction system, while I (as a network engineer), spen

### end here