In [3]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from numpy import dot
from numpy.linalg import norm


In [4]:
translation_table = str.maketrans(string.punctuation+string.ascii_uppercase," "*len(string.punctuation)+string.ascii_lowercase)
my_stopwords = stopwords.words('english')
my_stopwords.extend(['', ' '])
ps = PorterStemmer()

def preprocess_file(filename):
    df = pd.read_json(filename)
    rows = []
    for subject in df['data']:
        subject_rows = []
        for paragraph in subject['paragraphs']:
            context = [paragraph['context']]
            questions = [question['question'] for question in paragraph['qas']]
            context.extend(questions)
            subject_rows.append(context)
        rows.append(subject_rows)
        
    classified = pd.DataFrame(rows)

    all_questions = []

    for paragraph in classified.columns:
        for questions in classified[paragraph]:
            try : 
                context = questions[0] 
                for question in questions[1:] :
                    all_questions.append([question, context])
            except : 
                pass 

    return pd.DataFrame(all_questions, columns = ['question', 'correct_context'])

def get_cosin_sim(question, contexts):
    cos_sim_for_question = []
    for context in contexts :
        cv = CountVectorizer(stop_words=my_stopwords, lowercase=False)
        matrix = cv.fit_transform(pd.DataFrame([question, context])[0]).toarray()
        cos_sim = dot(matrix[0], matrix[1])/(norm(matrix[0])*norm(matrix[1]))
        cos_sim_for_question.append(cos_sim)
    return pd.Series(cos_sim_for_question)

def stem_text(text):
    return ps.stem(unidecode.unidecode(text.translate(translation_table)))

In [3]:
preprocess_file('./data/train-v2.0.json').to_csv('./data/train.csv', index=False)
preprocess_file('./data/dev-v2.0.json').to_csv('./data/validation.csv', index=False)

In [4]:
df = pd.read_csv('data/train.csv')
validation = pd.read_csv('data/validation.csv')

In [5]:
df['processed_questions'] = df.question.apply(stem_text)
df['processed_context'] = df.correct_context.apply(stem_text)
validation['processed_questions'] = validation.question.apply(stem_text)
validation['processed_context'] = validation.correct_context.apply(stem_text)

In [6]:
question = df.processed_questions.loc[0]
contexts = df.processed_context.drop_duplicates()

train_cosin = get_cosin_sim(question, contexts)

In [16]:
question = validation.processed_questions.loc[0]
contexts = validation.processed_context.drop_duplicates()

validate_cosin = get_cosin_sim(question, contexts)

In [17]:
strongest_cosin_id = validate_cosin[validate_cosin == validate_cosin.max()].index
print(validation.processed_context.drop_duplicates().iloc[strongest_cosin_id])

5226    the normans were in contact with england from ...
Name: processed_context, dtype: object
