In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Create class for data preprocessing required for training

In [2]:
"""
Class that holds all datasets with initial data
"""
class Data:
    def __init__(self, data_path='../../data/'):
        self.emails = pd.read_csv(data_path + 'emails.csv')
        self.questions = pd.read_csv(data_path + 'questions.csv')
        self.professionals = pd.read_csv(data_path + 'professionals.csv')
        self.comments = pd.read_csv(data_path + 'comments.csv')
        self.tag_users = pd.read_csv(data_path + 'tag_users.csv')
        self.group_memberships = pd.read_csv(data_path + 'group_memberships.csv')
        self.tags = pd.read_csv(data_path + 'tags.csv')
        self.students = pd.read_csv(data_path + 'students.csv')
        self.groups = pd.read_csv(data_path + 'groups.csv')
        self.tag_questions = pd.read_csv(data_path + 'tag_questions.csv')
        self.matches = pd.read_csv(data_path + 'matches.csv')
        self.answers = pd.read_csv(data_path + 'answers.csv')
        self.school_memberships = pd.read_csv(data_path + 'school_memberships.csv')

In [5]:
"""
Class implementing all data preprocessing required for training
"""
class TrainPreprocessing(Data):
    def __init__(self, data_path='../../data/'):
        super().__init__(data_path)
    
    """
    Creates the question-answer pairs dataset called qa.
    """
    def qa_dataset_creation(self):
        # Merge questions with answers and delete not answered questions
        qa = self.questions.merge(self.answers, how='right', left_on='questions_id', right_on='answers_question_id')
        
        # Merge with professionals and students (students asked, professionals answered)
        # Maybe change this in the future by taking care of professional who change status to students and vise versa
        qa = qa.merge(self.professionals, how='inner', left_on='answers_author_id', right_on='professionals_id')
        qa = qa.merge(self.students, how='inner', left_on='questions_author_id', right_on='students_id')
        
        # Get tags for each question in lists
        tag_groups_dict = self.tag_questions.groupby('tag_questions_question_id').groups
        
        # Concatenate tag lists in single strings
        for question, tags in tag_groups_dict.items():
            tag_list = list(tags)
            for i in range(len(tag_list)):
                tag_list[i] = str(tag_list[i])

            tag_groups_dict[question] = ','.join(tag_list)
        
        # Create a DataFrame storing tags for questions in strings
        tag_groups = pd.DataFrame.from_dict(tag_groups_dict, orient='index').reset_index()
        tag_groups.rename({'index': 'questions_id', 0: 'questions_tags'}, axis=1, inplace=True)
        
        # Add question tags to qa
        qa = qa.merge(tag_groups, how='left', on='questions_id')
        
        # Select only professionals who answered at least one question
        temp = qa[['professionals_id', 'answers_date_added', 'answers_id']]
        prof_unique = pd.DataFrame(temp.professionals_id.unique(), columns=['professionals_id'])
        prof_unique = prof_unique.merge(self.professionals, how='left', on='professionals_id')
        
        # For every professional add a "dummy" question with answer date being professional's registration date
        prof_unique['answers_id'] = list(None for _ in range(prof_unique.shape[0]))
        prof_unique['answers_date_added'] = prof_unique['professionals_date_joined']
        prof_unique = prof_unique[['professionals_id', 'answers_date_added', 'answers_id']]
        
        # Add "dummy" questions to all questions
        temp = pd.concat([temp, prof_unique])
        
        # Sort by professionals and answer dates
        temp = temp.sort_values(by=['professionals_id', 'answers_date_added']).reset_index(drop=True)
        
        # Get the sorted representation of the answers_date_added and shift the index down by one
        # so that current question is aligned with previous question answer date
        last_answer_date = pd.DataFrame({'professionals_last_answer_date': temp.answers_date_added})
        last_answer_date.index += 1
        
        # Add the professionals_last_answer_date column to temp
        temp = temp.merge(last_answer_date, left_index=True, right_index=True)
        temp.dropna(subset=['answers_id'], inplace=True)
        temp.drop(columns=['professionals_id', 'answers_date_added'], inplace=True)
        
        # Add professionals_last_answer_date column to qa 
        qa = qa.merge(temp, on='answers_id')
        
        # Transform all dates from string representation to python datetime object
        qa.answers_date_added = pd.to_datetime(qa.answers_date_added)
        qa.questions_date_added = pd.to_datetime(qa.questions_date_added)
        qa.professionals_last_answer_date = pd.to_datetime(qa.professionals_last_answer_date)
        qa['questions_time_to_answer'] = qa.answers_date_added - qa.questions_date_added
        
        # Select only the columns we need
        qa = qa[[
            'questions_id', 'questions_author_id', 'questions_title', 'questions_body', 'questions_tags',
            'questions_date_added', 'answers_id', 'answers_author_id', 'answers_body', 'answers_date_added',
            'questions_time_to_answer', 'professionals_last_answer_date'
        ]]
        
        self.qa = qa
        return self.qa

In [6]:
preproc = TrainPreprocessing()
preproc.qa_dataset_creation().head()

Unnamed: 0,questions_id,questions_author_id,questions_title,questions_body,questions_tags,questions_date_added,answers_id,answers_author_id,answers_body,answers_date_added,questions_time_to_answer,professionals_last_answer_date
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,Teacher career question,What is a maths teacher? what is a ma...,410224646863885,2016-04-26 11:14:26,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,<p>Hi!</p>\n<p>You are asking a very interesti...,2016-04-29 19:40:14,3 days 08:25:48,2016-04-29 14:15:00
1,7a0d4bc67b1c492fb06fe455b1c07faf,8f6f374ffd834d258ab69d376dd998f5,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,567026342509356396573152,2016-04-26 10:59:44,334f6735d31e45589e43da5ae7056e50,05ab77d4c6a141b999044ebbf5415b0d,<p>It's helpful to take higher-level classes i...,2018-03-08 18:23:36,681 days 07:23:52,2018-03-08 18:23:01
2,7a0d4bc67b1c492fb06fe455b1c07faf,8f6f374ffd834d258ab69d376dd998f5,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,567026342509356396573152,2016-04-26 10:59:44,e5d66281cc314675b95ddbb799b75473,05ab77d4c6a141b999044ebbf5415b0d,"<p>Essentially, treat them like human beings. ...",2018-03-08 18:24:03,681 days 07:24:19,2018-03-08 18:23:36
3,7a0d4bc67b1c492fb06fe455b1c07faf,8f6f374ffd834d258ab69d376dd998f5,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,567026342509356396573152,2016-04-26 10:59:44,e5c0da2a29ff414fa76b9da6e86337fc,58fa5e95fe9e480a9349bbb1d7faaddb,<p>Check the link below.</p>\n<p>http://www.ed...,2016-07-03 18:38:36,68 days 07:38:52,2016-07-03 18:09:58
4,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,470966298266353,2016-05-19 22:16:25,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,2016-07-31 15:35:54,72 days 17:19:29,2016-07-31 15:10:27
