# Importing the required libraries

In [98]:
import numpy as np
from numpy import nan
import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None 
from sklearn.preprocessing import OneHotEncoder

from bs4 import BeautifulSoup

import nltk
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial import distance

# Importing the datasets

In [99]:
#'rel_que' dataframe contains the questions that are relevant to the query which is the output of relevant questions model
rel_que = pd.read_csv('data/relevant_questions.csv')
#'ans' dataframe contains all the answers with their topics
ans = pd.read_csv('data/keyword_answer.csv')
#'kdf' dataframe contains the data of keywords for each topic
kdf = pd.read_csv('data/keywords.csv')

# Preprocessing the text

In [100]:
#calculates the length of text
def length_text(text):
    if(type(text) != type(0.0)):
        text = text.split(' ')
        return len(text)
    else:
        return 0
ans['ans length'] = ans['answer'].apply(length_text)

In [101]:
#'rel_ans' dataframe contains the answers corresponding to the relevant questions to the query i.e, questions of 'rel_que' dataset
rel_ans = pd.DataFrame()
for i in enumerate(rel_que['id']):
    rel_ans = pd.concat([ans[ans['id'] == i[1]], rel_ans])

In [102]:
topics = ['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7']

In [103]:
#Gives one-hot encoding of the topics for the answers
def get_dataframe(topic_list):
    if(type(topic_list) == type(0.0)):
        return pd.Series([0,0,0,0,0,0,0,0])
    else:
        topic_list = topic_list.split(', ')
        tl = []
        for topic in topics:
            if(topic in topic_list):
                val = 1
            else: 
                val = 0
            tl.append(val)
        return pd.Series(tl)

In [104]:
rel_ans[topics] = rel_ans['topic list'].apply(get_dataframe)
rel_ans.drop(columns=['topic list'], inplace = True)
rel_ans['answer list'] = rel_ans['answer'] + rel_ans['code']

In [107]:
# def extract_text(text):
#     soup = BeautifulSoup(text, 'lxml')
#     txt = "".join([txt.text for txt in soup.find_all("p")])
#     return txt

# rel_ans['body'] = rel_ans['body'].apply(extract_text)
rel_ans['title'] = rel_que['questions']
rel_ans['question list'] = rel_ans['title'] + rel_ans['body']
rel_ans.drop(columns=['body'], axis=1, inplace = True)

In [108]:
rel_que

Unnamed: 0,id,questions,score
0,48641350,How to merge two lists into a list of multiple...,0.799961
1,46362972,Merge list of lists in python 3,0.785947
2,1158128,Merge sorted lists in python,0.750189
3,44476206,how to merge two list having dict,0.749920
4,58971955,How to merge two dataframe?,0.709934
...,...,...,...
95,14008075,How to merge many to many relations from one d...,0.372127
96,55206146,Merge Pandas Dataframe with non-unique index w...,0.366641
97,58678560,How to merge similar data into a custom field ...,0.365843
98,50727548,How to merge multiple dataframes with the same...,0.354777


In [109]:
rel_ans

Unnamed: 0,id,answer,link,code,score,ans length,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,answer list,title,question list
4728,58276538.0,You can either use update:output:Or loc:output:,,update\nEmpty_DF.update(ROI_DF)\r\n\n a ...,2.0,6,0,1,0,0,0,0,0,0,You can either use update:output:Or loc:output...,,
4729,58276538.0,"This is perfect case for DataFrame.update, whi...",https://pandas.pydata.org/pandas-docs/stable/r...,DataFrame.update\nEmpty_DF.update(ROI_DF)\r\n\...,3.0,44,1,1,0,0,1,0,0,0,"This is perfect case for DataFrame.update, whi...",,
4730,58276538.0,In your case reindex_like,,reindex_like\nyourdf=ROI_DF.reindex_like(Empty...,1.0,4,0,1,0,0,0,0,0,0,In your case reindex_likereindex_like\nyourdf=...,,
60589,50727548.0,You can put the different column into index an...,,"pd.concat([plantsFrame.set_index(['plants']), ...",0.0,12,0,1,0,0,0,0,0,0,You can put the different column into index an...,,
27294,58678560.0,Your desired outputs are no longer really mode...,https://stackoverflow.com/a/35019122/12197595,,0.0,46,0,1,0,0,0,0,0,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39444,46362972.0,Use this code :Ouptut:,,"list1= [['user1', 186, 'Feb 2017, Apr 2017', 5...",1.0,4,1,0,0,0,0,0,0,1,"Use this code :Ouptut:list1= [['user1', 186, '...",,
18975,48641350.0,This might help,,"desiredlist = list(map(lambda y:[lst1,y],lst2)...",0.0,3,0,0,0,0,0,0,0,0,This might helpdesiredlist = list(map(lambda y...,,
18976,48641350.0,You could achieve your desired output with ite...,,itertools.zip_longest\nfillvalue\n>>> from ite...,1.0,31,0,1,0,0,0,0,0,0,You could achieve your desired output with ite...,,
18977,48641350.0,"Or you can use use append, but you need to cre...",,lst3 = []\r\nfor elem in lst2:\r\n theNew =...,1.0,16,1,1,0,0,0,0,0,0,"Or you can use use append, but you need to cre...",,


In [68]:
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [69]:
# a text processing function which converts text into lowercase, removes any characters like numbers, punctuation, 
# removes stopwords, stems the words and lemmatises them
def text_process(text):
    if(type(text) != type(0.0)):
        text = text.lower()                              
        text = re.sub("[^a-z]", " ", text)               
        text = text.strip()                            
        token = word_tokenize(text)
        text = [i for i in token if not i in stop_words]
        output = []
        for word in text:
            output.append(stemmer.stem(word))
        text = []
        for word in output:
            text.append(lemmatizer.lemmatize(word))
        text = " ".join(text)
    return text

In [70]:
#answers and questions are of 'rel_ans' dataframe are preprocessed
rel_ans['question list'] = rel_ans['question list'].apply(text_process)
rel_ans['answer list'] = rel_ans['answer list'].apply(text_process)

# Answers text scoring and ranking

In [71]:
data = rel_ans

In [88]:
#a function for similarity of the answers with their corresponding questions
def cosine_score(question, answer):
    if(type(answer) == type(0.0) or type(question) == type(0.0)):
        return 0
    else:
        sentences = [question, answer]
        vectorizer = CountVectorizer()
        vector = vectorizer.fit_transform(sentences)
        text1 = vector.toarray()[0].tolist()
        text2 = vector.toarray()[1].tolist()
        cosc = 1-distance.cosine(text1, text2)
        return cosc

#a function that caluclates the sum of tfidf of each word in a text
def entropy(text, tfidf_dict):
    if(type(text) == type(0.0)):
        return 0
    else:
        token = word_tokenize(text)
        entropy = 0.0
        for word in token:
            try:
                entropy = entropy + tfidf_dict[word]
            except:
                entropy = entropy + 0.0
        return entropy

#a function that calculates similarity with the query
def query_sim_score(query, answer):
    score = 0
    if(type(answer) == type(0.0)):
        return 0
    else:
        query = query
        for q in query:
            if(q in answer):
                score =  score + 1
        score = score/len(query)
        return score

In [89]:
#text processes the query used in 'relevant questions' model
query = 'merge two lists in python'
query = text_process(query)

In [90]:
#calculates the frequency of words
docs = data['answer list'].dropna().tolist()
cv = CountVectorizer(stop_words = stop_words)
word_count_vector = cv.fit_transform(docs)

In [91]:
#calculates the tfidf score of each word
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit_transform(word_count_vector)

<157x1345 sparse matrix of type '<class 'numpy.float64'>'
	with 4237 stored elements in Compressed Sparse Row format>

In [92]:
#produces a dictionary of each word with its tfidf score where the word is the key and tfidf score of the word is the value
tfidf_array = tfidf_transformer.idf_
words = cv.get_feature_names()
tfidf_dict = {}
for i in range(0, len(tfidf_transformer.idf_)):
    tfidf_dict[words[i]] = tfidf_array[i]

In [93]:
def normalised_score():
    sc = ((score - min_score)/(max_score - min_score))
    return sc

In [94]:
#normalises all the scores and returns their weighted sum
def score(user_score, cosine_score, entropy_score, sim_score, min_user_score, max_user_score, min_cosine_score, max_cosine_score, min_entropy_score, max_entropy_score):
    if(min_user_score == max_user_score):
        user_score = user_score - min_user_score
    else:
        user_score = ((user_score - min_user_score)/(max_user_score - min_user_score))
        
    if(min_cosine_score == max_cosine_score):
        cosine_score = cosine_score - min_cosine_score
    else:
        cosine_score = ((cosine_score - min_cosine_score)/(max_cosine_score - min_cosine_score))
        
    if(min_entropy_score == max_entropy_score):
        entropy_score = entropy_score - min_entropy_score
    else:
        entropy_score = ((entropy_score - min_entropy_score)/(max_entropy_score - min_entropy_score))
    return (user_score + cosine_score + entropy_score + 2 * sim_score)

In [95]:
#produces a dataframe of the top 15 answers in each topic
for topic in topics:
    topic_data = data[data[topic]==1][['id', 'answer', 'link', 'code', 'score', 'answer list', 'question list']]
    topic_data.dropna()
    entropy_score = []
    topic_df = pd.DataFrame()
    for i in range(0, topic_data.shape[0]):
        cosc = cosine_score(topic_data.iloc[i]['question list'], topic_data.iloc[i]['answer list'])
        entro = entropy(topic_data.iloc[i]['answer list'], tfidf_dict)
        sim_score = query_sim_score(query, topic_data.iloc[i]['answer list'])
        topic_df = topic_df.append(pd.Series([topic_data.iloc[i]['id'], topic_data.iloc[i]['title'], topic_data.iloc[i]['answer'], topic_data.iloc[i]['link'], topic_data.iloc[i]['code'], topic_data.iloc[i]['score'], cosc, entro, sim_score]), ignore_index=True)
    if topic_df.shape[0]==0:
            continue
    else:
        topic_df.columns = ['id', 'title', 'answer', 'link', 'code', 'score', 'cosine score', 'entropy','sim score']
        min_user_score = topic_df['score'].min()
        max_user_score = topic_df['score'].max()
        min_cosine_score = topic_df['cosine score'].min()
        max_cosine_score = topic_df['cosine score'].max()
        min_entropy_score = topic_df['entropy'].min()
        max_entropy_score = topic_df['entropy'].max()
        scores = []
        for i in range(0, topic_df.shape[0]):
            scores.append(score(topic_df.iloc[i]['score'], topic_df.iloc[i]['cosine score'], topic_df.iloc[i]['entropy'], 
                                topic_df.iloc[i]['sim score'], min_user_score, max_user_score, min_cosine_score, max_cosine_score,
                                min_entropy_score, max_entropy_score))
        topic_df.drop(columns = ['score', 'cosine score', 'entropy'], inplace=True)
        topic_df['sc'] = scores
        topic_df.sort_values(by = 'sc', ascending=False, inplace=True, kind='quicksort')
        path = 'data/'+ topic +'.csv'
        topic_df = topic_df.head(15)
        topic_df.to_csv(path, index=False)

Hey2


AttributeError: 'numpy.float64' object has no attribute 'lower'