# Import libraries

In [1]:
import pandas as pd
import numpy as np
import re
import gensim 
from gensim.parsing.preprocessing import remove_stopwords

# Load (or download) the Word2Vec model

In [8]:
from gensim.models import Word2Vec 
import gensim.downloader as api
    
# Prefer the vectors cached next to the notebook; if they cannot be loaded,
# fetch the pretrained 'word2vec-google-news-300' vectors and cache them.
try:
    w2v_model = gensim.models.KeyedVectors.load("./w2vecmodel.mod")
    print("Loaded w2v model")
except Exception as load_error:
    # `Exception` rather than a bare `except:` so that interrupting the kernel
    # during the load (KeyboardInterrupt) stops the cell instead of silently
    # starting the download.
    print("Could not load cached w2v model (%s); downloading instead" % load_error)
    w2v_model = api.load('word2vec-google-news-300')
    w2v_model.save("./w2vecmodel.mod")

# Embedding dimensionality, measured on the vector of a probe word.
w2vec_embedding_size=len(w2v_model['computer'])



# Load the dataset

In [None]:
# Read the FAQ training data (questions with answers) and the test questions.
# NOTE(review): absolute Google Drive paths — presumably requires Drive to be
# mounted at /content/drive (Colab); confirm before running elsewhere.
df = pd.read_csv("/content/drive/MyDrive/FAQ-Dataset/FAQs - FAQs.csv")
df_test = pd.read_csv("/content/drive/MyDrive/FAQ-Dataset/FAQs_test - FAQs_test.csv")

# Report the row count of each frame.
print("Number of data points in Train dataset:",df.shape[0])
print("Number of data points in Test dataset:",df_test.shape[0])

Print the **train data**

In [138]:
df.head()

Unnamed: 0,Question,Answer
0,When was Albert Einstein born?,Albert Einstein was born on 14 March 1879.
1,Where was he born?,"He was born in Ulm, Germany."
2,When did he die?,"He died 18 April 1955 in Princeton, New Jersey..."
3,Who were his parents?,His father was Hermann Einstein and his mother...
4,Did he have any sisters and brothers?,He had one sister named Maja.


Print the **test data**

In [134]:
df_test.head()

Unnamed: 0,Question
0,What is the date of his death?
1,Did Einstein have siblings?
2,Who was his wife?
3,What was Einstein's father's name?
4,At what institutions did he study?


# Dataset Preprocessing


In [147]:
def clean_sentence(sentence, stopwords=False):
    """Normalise a sentence for embedding lookup.

    The text is lower-cased, stripped of surrounding whitespace, and every
    character that is not a lowercase letter, digit or whitespace is removed
    (so "einstein's" becomes "einsteins").

    Parameters
    ----------
    sentence : str
        Raw sentence text.
    stopwords : bool, default False
        When true, the normalised text is additionally passed through
        gensim's ``remove_stopwords``.

    Returns
    -------
    str
        The cleaned sentence.
    """
    normalized = re.sub(r'[^a-z0-9\s]', '', sentence.lower().strip())
    return remove_stopwords(normalized) if stopwords else normalized
                    
def get_cleaned_sentences(tf,stopwords=False):
    """Clean every entry of the ``"Question"`` column of a DataFrame.

    Parameters
    ----------
    tf : pandas.DataFrame
        Frame with a ``"Question"`` column; a missing column raises ``KeyError``.
    stopwords : bool, default False
        Forwarded to ``clean_sentence``.

    Returns
    -------
    list
        One cleaned sentence per row, in row order.
    """
    # Iterate the column directly: `iterrows()` would build a Series for every
    # row only to read a single field from it.
    return [clean_sentence(question, stopwords) for question in tf["Question"]]



Train data 

In [148]:
Train_cleaned_sentences = get_cleaned_sentences(df,stopwords=True)
print(Train_cleaned_sentences);

['albert einstein born', 'born', 'die', 'parents', 'sisters brothers', 'marry children', 'receive education', 'albert einstein awarded nobel prize physics', 'albert einstein attend nobel prize award ceremony', 'receive nobel prize']


Test data

In [149]:
Test_cleaned_sentences = get_cleaned_sentences(df_test,stopwords=True)
print(Test_cleaned_sentences);

['date death', 'einstein siblings', 'wife', 'einsteins fathers', 'institutions study']


# Define functions for word embedding

In [150]:
def getWordVec(word,model):
    """Return the embedding stored for ``word`` in ``model``.

    Parameters
    ----------
    word : str
        Token to look up.
    model : mapping-like
        Object supporting ``model[key]``. It must contain the key
        ``'computer'``, whose vector is used to determine the dimensionality.

    Returns
    -------
    The value of ``model[word]``; if the lookup raises ``KeyError``
    (word not in the vocabulary), a list of zeros with the same length as
    ``model['computer']``.

    Any exception other than ``KeyError`` is propagated so real failures are
    not mistaken for out-of-vocabulary words.
    """
    samp=model['computer'];
    try:
        vec=model[word];
    except KeyError:
        # Out-of-vocabulary token: contribute nothing to the phrase sum.
        vec=[0]*len(samp);
    return (vec)


def getPhraseEmbedding(phrase,embeddingmodel):
    """Embed a phrase as the element-wise sum of its word vectors.

    The phrase is split on whitespace and each token is looked up with
    ``getWordVec``. The vectors are summed (not averaged).

    Parameters
    ----------
    phrase : str
        Whitespace-separated tokens.
    embeddingmodel :
        Model passed through to ``getWordVec``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)``; all zeros when the phrase has no tokens.
    """
    dim = len(getWordVec('computer', embeddingmodel))
    total = np.array([0] * dim)
    for token in phrase.split():
        total = total + np.asarray(getWordVec(token, embeddingmodel))
    # Row-vector shape, as the pairwise similarity call expects 2-D input.
    return total.reshape(1, -1)

Word Embedding of **train data**

In [151]:
#With w2Vec for original question dataset
import numpy
# One phrase embedding per cleaned training question, in dataset order.
sent_embeddings = [getPhraseEmbedding(sent, w2v_model) for sent in Train_cleaned_sentences]

Word Embedding of **test data**

In [152]:
#With w2Vec for test question dataset
import numpy
# One phrase embedding per cleaned test question, in dataset order.
sent_embeddings_test = [getPhraseEmbedding(sent_test, w2v_model) for sent_test in Test_cleaned_sentences]

# Cosine similarity of questions

In [153]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity;#sent_embeddings_test,sent_embeddings,df,df_test,Test_cleaned_sentences

def retrieveAndPrintFAQAnswer(test_embed, train_embed, train_df, test_df, Test_cleaned_sentences):
  """Print, for each test question, the answer of the most similar training question.

  Parameters
  ----------
  test_embed : sequence of 2-D arrays
      One embedding per test question, aligned with the rows of ``test_df``.
  train_embed : sequence of 2-D arrays
      One embedding per training question, aligned with the rows of ``train_df``.
  train_df : pandas.DataFrame
      Column position 1 is printed as the answer.
  test_df : pandas.DataFrame
      Column position 0 is printed as the question.
  Test_cleaned_sentences :
      Unused; kept so existing calls keep working.

  Returns
  -------
  None. Output is printed. Neither DataFrame is modified, so the call can be
  repeated with the same result.
  """
  for index_test,faq_embedding_test in enumerate(test_embed):
    max_sim = -1;
    index_sim = -1;
    for index,faq_embedding in enumerate(train_embed):
      sim = cosine_similarity(faq_embedding,faq_embedding_test)[0][0];
      # Strict '>' keeps the first training question on ties.
      if sim > max_sim:
        max_sim = sim;
        index_sim = index;

    # Read-only access: the previous version aliased `test_df` and wrote the
    # answer over the question cell, corrupting the caller's frame.
    print("Retrieved: ",test_df.iloc[index_test,0])
    print("\n")
    print("Answer:",train_df.iloc[index_sim,1])

retrieveAndPrintFAQAnswer(sent_embeddings_test, sent_embeddings, df, df_test, Test_cleaned_sentences)

Retrieved:  What is the date of his death?
Answer: He died 18 April 1955 in Princeton, New Jersey, USA.
Retrieved:  Did Einstein have siblings?
Answer: He had one sister named Maja.
Retrieved:  Who was his wife?
Answer: He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936.
Retrieved:  What was Einstein's father's name?
Answer: His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).
Retrieved:  At what institutions did he study?
Answer: He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich University, Switzerland (1905)


# Full Code

In [159]:
# Load dataset: FAQ training data (questions with answers) and the test questions.
# NOTE(review): absolute Google Drive paths — presumably requires Drive to be
# mounted at /content/drive (Colab); confirm before running elsewhere.
df = pd.read_csv("/content/drive/MyDrive/FAQ-Dataset/FAQs - FAQs.csv")
df_test = pd.read_csv("/content/drive/MyDrive/FAQ-Dataset/FAQs_test - FAQs_test.csv")

#Data preprocessing
def clean_sentence(sentence, stopwords=False):
    """Normalise a sentence for embedding lookup.

    The text is lower-cased, stripped of surrounding whitespace, and every
    character that is not a lowercase letter, digit or whitespace is removed
    (so "einstein's" becomes "einsteins").

    Parameters
    ----------
    sentence : str
        Raw sentence text.
    stopwords : bool, default False
        When true, the normalised text is additionally passed through
        gensim's ``remove_stopwords``.

    Returns
    -------
    str
        The cleaned sentence.
    """
    normalized = re.sub(r'[^a-z0-9\s]', '', sentence.lower().strip())
    return remove_stopwords(normalized) if stopwords else normalized
                    
def get_cleaned_sentences(tf,stopwords=False):
    """Clean every entry of the ``"Question"`` column of a DataFrame.

    Parameters
    ----------
    tf : pandas.DataFrame
        Frame with a ``"Question"`` column; a missing column raises ``KeyError``.
    stopwords : bool, default False
        Forwarded to ``clean_sentence``.

    Returns
    -------
    list
        One cleaned sentence per row, in row order.
    """
    # Iterate the column directly: `iterrows()` would build a Series for every
    # row only to read a single field from it.
    return [clean_sentence(question, stopwords) for question in tf["Question"]]



Train_cleaned_sentences = get_cleaned_sentences(df,stopwords=True)
Test_cleaned_sentences = get_cleaned_sentences(df_test,stopwords=True)


#Word embedding helpers
def getWordVec(word,model):
    """Return the embedding stored for ``word`` in ``model``.

    Parameters
    ----------
    word : str
        Token to look up.
    model : mapping-like
        Object supporting ``model[key]``. It must contain the key
        ``'computer'``, whose vector is used to determine the dimensionality.

    Returns
    -------
    The value of ``model[word]``; if the lookup raises ``KeyError``
    (word not in the vocabulary), a list of zeros with the same length as
    ``model['computer']``.

    Any exception other than ``KeyError`` is propagated so real failures are
    not mistaken for out-of-vocabulary words.
    """
    samp=model['computer'];
    try:
        vec=model[word];
    except KeyError:
        # Out-of-vocabulary token: contribute nothing to the phrase sum.
        vec=[0]*len(samp);
    return (vec)


def getPhraseEmbedding(phrase,embeddingmodel):
    """Embed a phrase as the element-wise sum of its word vectors.

    The phrase is split on whitespace and each token is looked up with
    ``getWordVec``. The vectors are summed (not averaged).

    Parameters
    ----------
    phrase : str
        Whitespace-separated tokens.
    embeddingmodel :
        Model passed through to ``getWordVec``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)``; all zeros when the phrase has no tokens.
    """
    dim = len(getWordVec('computer', embeddingmodel))
    total = np.array([0] * dim)
    for token in phrase.split():
        total = total + np.asarray(getWordVec(token, embeddingmodel))
    # Row-vector shape, as the pairwise similarity call expects 2-D input.
    return total.reshape(1, -1)


#Embedding of train data using word2vec
import numpy
# One phrase embedding per cleaned training question, in dataset order.
sent_embeddings = [getPhraseEmbedding(sent, w2v_model) for sent in Train_cleaned_sentences]


#Embedding of test data using word2vec
import numpy
# One phrase embedding per cleaned test question, in dataset order.
sent_embeddings_test = [getPhraseEmbedding(sent_test, w2v_model) for sent_test in Test_cleaned_sentences]

#Cosine similarity
import sklearn
from sklearn.metrics.pairwise import cosine_similarity;

def retrieveAndPrintFAQAnswer(test_embed, train_embed, train_df, test_df, Test_cleaned_sentences):
  """Print, for each test question, the answer of the most similar training question.

  Parameters
  ----------
  test_embed : sequence of 2-D arrays
      One embedding per test question, aligned with the rows of ``test_df``.
  train_embed : sequence of 2-D arrays
      One embedding per training question, aligned with the rows of ``train_df``.
  train_df : pandas.DataFrame
      Column position 1 is printed as the answer.
  test_df : pandas.DataFrame
      Column position 0 is printed as the question.
  Test_cleaned_sentences :
      Unused; kept so existing calls keep working.

  Returns
  -------
  None. Output is printed. Neither DataFrame is modified, so the call can be
  repeated with the same result.
  """
  for index_test,faq_embedding_test in enumerate(test_embed):
    max_sim = -1;
    index_sim = -1;
    for index,faq_embedding in enumerate(train_embed):
      sim = cosine_similarity(faq_embedding,faq_embedding_test)[0][0];
      # Strict '>' keeps the first training question on ties.
      if sim > max_sim:
        max_sim = sim;
        index_sim = index;

    # Read-only access: the previous version aliased `test_df` and wrote the
    # answer over the question cell, corrupting the caller's frame.
    print("Testset Question : ",test_df.iloc[index_test,0])
    print("Answer:",train_df.iloc[index_sim,1])
    print("\n")

#Call the function
retrieveAndPrintFAQAnswer(sent_embeddings_test, sent_embeddings, df, df_test, Test_cleaned_sentences)

Testset Question :  What is the date of his death?
Answer: He died 18 April 1955 in Princeton, New Jersey, USA.


Testset Question :  Did Einstein have siblings?
Answer: He had one sister named Maja.


Testset Question :  Who was his wife?
Answer: He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936.


Testset Question :  What was Einstein's father's name?
Answer: His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).


Testset Question :  At what institutions did he study?
Answer: He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich Unive