In [13]:
import numpy as np
import pandas as pd
import os, pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Create class for data preprocessing required for training

In [43]:
class DatasetCreator:
    """
    Imports the initial CSV datasets and builds additional datasets for convenience.

    The derived datasets are:

    * ``qa_data``   - question-answer pairs with the date of the professional's
      previous answer (or registration date for the first answer);
    * ``prof_data`` - professionals who answered at least one question;
    * ``stud_data`` - students who asked at least one answered question.

    :param data_path: Directory that contains the CSV files.
    :param created: If True, the derived datasets are loaded from ``data_path``
                    instead of being rebuilt and saved.
    """

    def __init__(self, data_path='../../data/', created=False):
        # Add data_path to class properties
        self.data_path = data_path

        # Import all initial datasets
        self.emails = pd.read_csv(self._csv_path('emails.csv'))
        self.questions = pd.read_csv(self._csv_path('questions.csv'))
        self.professionals = pd.read_csv(self._csv_path('professionals.csv'))
        self.comments = pd.read_csv(self._csv_path('comments.csv'))
        self.tag_users = pd.read_csv(self._csv_path('tag_users.csv'))
        self.group_memberships = pd.read_csv(self._csv_path('group_memberships.csv'))
        self.tags = pd.read_csv(self._csv_path('tags.csv'))
        self.students = pd.read_csv(self._csv_path('students.csv'))
        self.groups = pd.read_csv(self._csv_path('groups.csv'))
        self.tag_questions = pd.read_csv(self._csv_path('tag_questions.csv'))
        self.matches = pd.read_csv(self._csv_path('matches.csv'))
        self.answers = pd.read_csv(self._csv_path('answers.csv'))
        self.school_memberships = pd.read_csv(self._csv_path('school_memberships.csv'))

        if created:
            # Load additional datasets from disk
            self.qa_data = pd.read_csv(self._csv_path('qa_data.csv'))
            self.prof_data = pd.read_csv(self._csv_path('prof_data.csv'))
            self.stud_data = pd.read_csv(self._csv_path('stud_data.csv'))
        else:
            # Create additional datasets and save them to disk
            self.additional_dataset_creation()

    def _csv_path(self, file_name):
        """
        Returns the path of file_name inside data_path.

        os.path.join is used so that data_path works with or without
        a trailing path separator.
        """
        return os.path.join(self.data_path, file_name)

    def additional_dataset_creation(self):
        """
        Creates additional datasets for further processing and saves them to disk.
        """
        # Create temporary dataset for further processing
        all_data = self.all_data_dataset_creation()

        # Create question-answer pairs dataset called qa_data
        self.qa_data = self.qa_data_dataset_creation(all_data)

        # Create dataset called prof_data comprising data of professionals
        # who answered at least one question
        self.prof_data = self.prof_data_dataset_creation(all_data)

        # Create dataset called stud_data comprising data of students
        # who asked at least one answered question
        self.stud_data = self.stud_data_dataset_creation(all_data)

        # Save new datasets to disk
        self.qa_data.to_csv(self._csv_path('qa_data.csv'), index=False)
        self.prof_data.to_csv(self._csv_path('prof_data.csv'), index=False)
        self.stud_data.to_csv(self._csv_path('stud_data.csv'), index=False)

    def all_data_dataset_creation(self):
        """
        Merges questions, answers, professionals and students datasets
        to get a temporary dataset for further processing.

        :returns: DataFrame with one row per answer, with datetime columns
                  answers_date_added / questions_date_added and the
                  timedelta column questions_age.
        """
        # Merge questions with answers; the right join keeps one row per answer,
        # so questions without answers are dropped
        all_data = self.questions.merge(
            self.answers, how='right', left_on='questions_id', right_on='answers_question_id'
        )

        # Merge with professionals and students (students asked, professionals answered)
        # Maybe change this in the future by taking care of professionals
        # who change status to students and vice versa
        all_data = all_data.merge(
            self.professionals, how='inner', left_on='answers_author_id', right_on='professionals_id'
        )
        all_data = all_data.merge(
            self.students, how='inner', left_on='questions_author_id', right_on='students_id'
        )

        # Transform all dates from string representation to datetime
        all_data['answers_date_added'] = pd.to_datetime(all_data['answers_date_added'])
        all_data['questions_date_added'] = pd.to_datetime(all_data['questions_date_added'])

        # Add questions_age feature, which represents amount of time
        # from question emergence to a particular answer to that question
        all_data['questions_age'] = all_data['answers_date_added'] - all_data['questions_date_added']

        return all_data

    def qa_data_dataset_creation(self, all_data):
        """
        Creates question-answer pairs dataset called qa_data.

        Adds professionals_last_answer_date: for every answer, the date of the
        same professional's previous answer, or the professional's registration
        date when there is no previous answer.

        :param all_data: Output of all_data_dataset_creation().
        :returns: qa_data DataFrame.
        """
        # Temporary qa_data representation
        qa_data = all_data

        # Select only unique professionals
        temp = qa_data[['professionals_id', 'answers_date_added', 'answers_id']]
        prof_unique = pd.DataFrame(temp.professionals_id.unique(), columns=['professionals_id'])
        prof_unique = prof_unique.merge(self.professionals, how='left', on='professionals_id')

        # For every professional add a "dummy" answer dated at the professional's registration.
        # The registration date is parsed to datetime so that the sort key below has a single
        # type; leaving it as a string mixes str and Timestamp values in one column.
        prof_unique['answers_id'] = None
        prof_unique['answers_date_added'] = pd.to_datetime(prof_unique['professionals_date_joined'])
        prof_unique = prof_unique[['professionals_id', 'answers_date_added', 'answers_id']]

        # Add "dummy" answers to all answers
        temp = pd.concat([temp, prof_unique])

        # Sort by professionals and answer dates
        # NOTE: the alignment below relies on the dummy row sorting first within each
        # professional, i.e. on the registration date not being later than the first answer
        temp = temp.sort_values(by=['professionals_id', 'answers_date_added']).reset_index(drop=True)

        # Get the sorted representation of the answers_date_added and shift the index down by one
        # so that current answer is aligned with the previous answer date
        last_answer_date = pd.DataFrame({'professionals_last_answer_date': temp.answers_date_added})
        last_answer_date.index += 1

        # Add the professionals_last_answer_date column to temp and
        # remove the dummy rows (they are the only ones without answers_id)
        temp = temp.merge(last_answer_date, left_index=True, right_index=True)
        temp = temp.dropna(subset=['answers_id'])
        temp = temp.drop(columns=['professionals_id', 'answers_date_added'])

        # Add professionals_last_answer_date column to qa_data
        qa_data = qa_data.merge(temp, on='answers_id')

        # Make sure the new column is datetime
        qa_data['professionals_last_answer_date'] = pd.to_datetime(qa_data['professionals_last_answer_date'])

        # Final qa_data representation
        qa_data = qa_data[[
            'students_id', 'questions_id', 'questions_title', 'questions_body',
            'questions_date_added', 'professionals_id', 'answers_id', 'answers_body',
            'professionals_last_answer_date'
        ]]

        return qa_data

    def prof_data_dataset_creation(self, all_data):
        """
        Creates dataset called prof_data comprising data of professionals
        who answered at least one question.

        :param all_data: Output of all_data_dataset_creation().
        :returns: prof_data DataFrame.
        """
        # Select only professionals who answered at least one question
        active_professionals = pd.DataFrame({'professionals_id': all_data.professionals_id.unique()})
        prof_data = self.professionals.merge(active_professionals, how='right', on='professionals_id')
        prof_data['professionals_date_joined'] = pd.to_datetime(prof_data['professionals_date_joined'])

        # Count the number of answered questions by each professional
        # (count() counts rows of all_data, i.e. question-answer pairs)
        number_answered = all_data[['questions_id', 'professionals_id']].groupby('professionals_id').count()
        number_answered = number_answered.rename({'questions_id': 'professionals_questions_answered'}, axis=1)

        # Add professionals_questions_answered feature to prof_data
        prof_data = prof_data.merge(number_answered, left_on='professionals_id', right_index=True)

        # Get average question age for every professional among questions they answered
        average_question_age = (
            all_data.groupby('professionals_id')
            .questions_age.mean(numeric_only=False)
        )
        average_question_age = pd.DataFrame({'professionals_average_question_age': average_question_age})

        # Add professionals_average_question_age feature to prof_data
        prof_data = prof_data.merge(average_question_age, on='professionals_id')

        return prof_data

    def stud_data_dataset_creation(self, all_data):
        """
        Creates dataset called stud_data comprising data of students
        who asked at least one answered question.

        :param all_data: Output of all_data_dataset_creation().
        :returns: stud_data DataFrame.
        """
        # Select only students who asked at least one answered question
        active_students = pd.DataFrame({'students_id': all_data.students_id.unique()})
        stud_data = self.students.merge(active_students, how='right', on='students_id')
        stud_data['students_date_joined'] = pd.to_datetime(stud_data['students_date_joined'])

        # Count the number of asked questions by each student
        # (count() counts rows of all_data, so a question with several answers
        # contributes once per answer)
        number_asked = all_data[['questions_id', 'students_id']].groupby('students_id').count()
        number_asked = number_asked.rename({'questions_id': 'students_questions_asked'}, axis=1)

        # Add students_questions_asked feature to stud_data
        stud_data = stud_data.merge(number_asked, left_on='students_id', right_index=True)

        # Get average question age for every student among questions they asked that were answered
        average_question_age = (
            all_data.groupby('students_id')
            .questions_age.mean(numeric_only=False)
        )
        average_question_age = pd.DataFrame({'students_average_question_age': average_question_age})

        # Add students_average_question_age feature to stud_data
        stud_data = stud_data.merge(average_question_age, on='students_id')

        return stud_data

In [29]:
# Load the initial tables and the previously saved derived CSVs
# (created=True reads qa_data/prof_data/stud_data from disk instead of rebuilding them)
creator = DatasetCreator(created=True)

In [30]:
# Preview the first rows of the question-answer dataset
creator.qa_data.head()

Unnamed: 0,questions_id,questions_author_id,questions_title,questions_body,questions_date_added,answers_id,answers_author_id,answers_body,questions_age,professionals_last_answer_date
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,Teacher career question,What is a maths teacher? what is a ma...,2016-04-26 11:14:26,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,<p>Hi!</p>\r\n<p>You are asking a very interes...,3 days 08:25:48.000000000,2016-04-29 14:15:00
1,7a0d4bc67b1c492fb06fe455b1c07faf,8f6f374ffd834d258ab69d376dd998f5,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,2016-04-26 10:59:44,334f6735d31e45589e43da5ae7056e50,05ab77d4c6a141b999044ebbf5415b0d,<p>It's helpful to take higher-level classes i...,681 days 07:23:52.000000000,2018-03-08 18:23:01
2,7a0d4bc67b1c492fb06fe455b1c07faf,8f6f374ffd834d258ab69d376dd998f5,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,2016-04-26 10:59:44,e5d66281cc314675b95ddbb799b75473,05ab77d4c6a141b999044ebbf5415b0d,"<p>Essentially, treat them like human beings. ...",681 days 07:24:19.000000000,2018-03-08 18:23:36
3,7a0d4bc67b1c492fb06fe455b1c07faf,8f6f374ffd834d258ab69d376dd998f5,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,2016-04-26 10:59:44,e5c0da2a29ff414fa76b9da6e86337fc,58fa5e95fe9e480a9349bbb1d7faaddb,<p>Check the link below.</p>\r\n<p>http://www....,68 days 07:38:52.000000000,2016-07-03 18:09:58
4,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,2016-05-19 22:16:25,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,<p>Hi Rodrigo!</p>\r\n<p>The important thing t...,72 days 17:19:29.000000000,2016-07-31 15:10:27


In [47]:
class Preprocessor(DatasetCreator):
    """
    DatasetCreator extended with feature pre-processing of the derived datasets.

    :param created: If True, the derived datasets are loaded from disk
                    instead of being rebuilt.
    :param data_path: Directory that contains the CSV files; forwarded to DatasetCreator.
    """

    def __init__(self, created=False, data_path='../../data/'):
        super().__init__(data_path=data_path, created=created)

    def qa_data_preprocessing(self):
        """
        Replaces the date columns of qa_data with generated datetime features inplace.
        """
        datetime(self.qa_data, 'questions_date_added', hour=True)
        datetime(self.qa_data, 'professionals_last_answer_date', hour=True)

    def stud_data_preprocessing(self):
        """
        Pre-processes the date and timedelta columns of stud_data inplace.
        """
        datetime(self.stud_data, 'students_date_joined')
        timedelta(self.stud_data, 'students_average_question_age')
        
        

In [51]:
# Preview the first rows of the students dataset (uses the `pp` Preprocessor instance)
pp.stud_data.head()

Unnamed: 0,students_id,students_location,students_date_joined,students_questions_asked,students_average_question_age
0,12a89e96755a4dba83ff03e03043d9c0,,2011-12-16 14:19:24,2,794 days 06:00:53.000000000
1,5bdd2eb44dd944a9a7ab9aba068d1ef2,,2012-01-01 05:00:00,2,2 days 16:05:06.500000000
2,9658267bc2564a85bad1e802de5fb597,"Wayne, Pennsylvania",2012-01-01 05:00:00,6,377 days 15:36:03.000000000
3,7b1900c458e34573bfeb0d57ffbd260a,,2012-01-01 05:00:00,3,8 days 13:05:38.000000000
4,e9efc4d6e06e49c7ae5afe1aad8c5bd5,,2012-01-01 05:00:00,1,21 days 21:56:35.000000000


In [49]:
# Build a Preprocessor from the already saved derived datasets
# and preview qa_data before any pre-processing is applied
pp = Preprocessor(created=True)
pp.qa_data.head()

Unnamed: 0,students_id,questions_id,questions_title,questions_body,questions_date_added,professionals_id,answers_id,answers_body,professionals_last_answer_date
0,8f6f374ffd834d258ab69d376dd998f5,332a511f1569444485cf7a7a556a5e54,Teacher career question,What is a maths teacher? what is a ma...,2016-04-26 11:14:26,36ff3b3666df400f956f8335cf53e09e,4e5f01128cae4f6d8fd697cec5dca60c,<p>Hi!</p>\r\n<p>You are asking a very interes...,2016-04-29 14:15:00
1,8f6f374ffd834d258ab69d376dd998f5,7a0d4bc67b1c492fb06fe455b1c07faf,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,2016-04-26 10:59:44,05ab77d4c6a141b999044ebbf5415b0d,334f6735d31e45589e43da5ae7056e50,<p>It's helpful to take higher-level classes i...,2018-03-08 18:23:01
2,8f6f374ffd834d258ab69d376dd998f5,7a0d4bc67b1c492fb06fe455b1c07faf,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,2016-04-26 10:59:44,05ab77d4c6a141b999044ebbf5415b0d,e5d66281cc314675b95ddbb799b75473,"<p>Essentially, treat them like human beings. ...",2018-03-08 18:23:36
3,8f6f374ffd834d258ab69d376dd998f5,7a0d4bc67b1c492fb06fe455b1c07faf,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,2016-04-26 10:59:44,58fa5e95fe9e480a9349bbb1d7faaddb,e5c0da2a29ff414fa76b9da6e86337fc,<p>Check the link below.</p>\r\n<p>http://www....,2016-07-03 18:09:58
4,585ac233015447cc9e9a217044e515e1,0f1d6a4f276c4a05878dd48e03e52289,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,2016-05-19 22:16:25,36ff3b3666df400f956f8335cf53e09e,f3519ab99a1a4a13a8a9ecb814287d2a,<p>Hi Rodrigo!</p>\r\n<p>The important thing t...,2016-07-31 15:10:27


In [50]:
# Replace the date columns of qa_data with generated datetime features (inplace)
# and preview the result
pp.qa_data_preprocessing()
pp.qa_data.head()

Unnamed: 0,students_id,questions_id,questions_title,questions_body,professionals_id,answers_id,answers_body,questions_date_added_time,questions_date_added_doy_sin,questions_date_added_doy_cos,questions_date_added_dow,questions_date_added_hour_sin,questions_date_added_hour_cos,professionals_last_answer_date_time,professionals_last_answer_date_doy_sin,professionals_last_answer_date_doy_cos,professionals_last_answer_date_dow,professionals_last_answer_date_hour_sin,professionals_last_answer_date_hour_cos
0,8f6f374ffd834d258ab69d376dd998f5,332a511f1569444485cf7a7a556a5e54,Teacher career question,What is a maths teacher? what is a ma...,36ff3b3666df400f956f8335cf53e09e,4e5f01128cae4f6d8fd697cec5dca60c,<p>Hi!</p>\r\n<p>You are asking a very interes...,2016.320548,0.903356,-0.428892,1,0.199368,-0.979925,2016.328767,0.880012,-0.474951,4,-0.55557,-0.83147
1,8f6f374ffd834d258ab69d376dd998f5,7a0d4bc67b1c492fb06fe455b1c07faf,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,05ab77d4c6a141b999044ebbf5415b0d,334f6735d31e45589e43da5ae7056e50,<p>It's helpful to take higher-level classes i...,2016.320548,0.903356,-0.428892,1,0.263031,-0.964787,2018.183562,0.914128,0.405426,3,-0.994969,0.100188
2,8f6f374ffd834d258ab69d376dd998f5,7a0d4bc67b1c492fb06fe455b1c07faf,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,05ab77d4c6a141b999044ebbf5415b0d,e5d66281cc314675b95ddbb799b75473,"<p>Essentially, treat them like human beings. ...",2016.320548,0.903356,-0.428892,1,0.263031,-0.964787,2018.183562,0.914128,0.405426,3,-0.994969,0.100188
3,8f6f374ffd834d258ab69d376dd998f5,7a0d4bc67b1c492fb06fe455b1c07faf,Teacher's Qualification,Hi I am doing my 10th Standard. What are the q...,58fa5e95fe9e480a9349bbb1d7faaddb,e5c0da2a29ff414fa76b9da6e86337fc,<p>Check the link below.</p>\r\n<p>http://www....,2016.320548,0.903356,-0.428892,1,0.263031,-0.964787,2016.506849,-0.043022,-0.999074,6,-0.999229,0.03926
4,585ac233015447cc9e9a217044e515e1,0f1d6a4f276c4a05878dd48e03e52289,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,36ff3b3666df400f956f8335cf53e09e,f3519ab99a1a4a13a8a9ecb814287d2a,<p>Hi Rodrigo!</p>\r\n<p>The important thing t...,2016.383562,0.668064,-0.744104,3,-0.438371,0.898794,2016.583562,-0.501242,-0.865307,6,-0.737277,-0.67559


In [40]:
def datetime(df: pd.DataFrame, feature: str, hour: bool = False):
    '''
    Generates a bunch of new datetime features and drops the original feature inplace

    New columns (all prefixed with the feature name):
    _time - year plus fraction of the year passed,
    _doy_sin, _doy_cos - cyclic encoding of the day of year,
    _dow - day of week, Monday is 0,
    _hour_sin, _hour_cos - cyclic encoding of the time of day (only if hour is True).

    The year length is taken as 365 days, so the last day of a leap year
    maps slightly past a full cycle.

    :param df: Data to work with.
    :param feature: Name of a column in df that contains date.
    :param hour: Whether feature contains time.
    '''
    # Parse once and use the vectorised .dt accessor instead of row-wise apply
    dates = pd.to_datetime(df[feature])
    day_of_year = dates.dt.dayofyear
    year_angle = 2 * np.pi * day_of_year / 365

    df[feature + '_time'] = dates.dt.year + day_of_year / 365
    df[feature + '_doy_sin'] = np.sin(year_angle)
    df[feature + '_doy_cos'] = np.cos(year_angle)
    df[feature + '_dow'] = dates.dt.weekday

    if hour:
        day_angle = 2 * np.pi * (dates.dt.hour + dates.dt.minute / 60) / 24
        df[feature + '_hour_sin'] = np.sin(day_angle)
        df[feature + '_hour_cos'] = np.cos(day_angle)

    df.drop(columns=feature, inplace=True)

In [42]:
def timedelta(df: pd.DataFrame, feature: str):
    '''
    Converts the timedelta feature to a float number of days inplace

    The column keeps its name; only its values change from timedelta
    (or its string representation) to days.

    :param df: Data to work with.
    :param feature: Name of a column in df that contains timedelta.
    '''
    # The converted values replace the original column, so nothing must be dropped
    # afterwards: dropping `feature` here would discard the result itself
    df[feature] = pd.to_timedelta(df[feature]) / pd.Timedelta("1 day")

In [7]:
def _get_preprocessor(fit_data: np.array, feature: str, base, oblige_fit: bool, pp: dict = None):
    '''
    Returns object for pre-processing
    Creates new one of class base or uses existent in preprocessors.pickle

    :param fit_data: NumPy array of data to fit new preprocessor.
    :param feature: Feature name to search for in preprocessors.pickle.
    :param base: Preprocessor's class.
    :param oblige_fit: Whether to fit new preprocessor on feature even if there already exists one.
    :param pp: Object with preprocessors.
    :returns: Preprocessor object.
    '''
    if pp is None:
        if os.path.isfile('preprocessors.pickle'):
            with open('preprocessors.pickle', 'rb') as file:
                pp = pickle.load(file)
        else:
            pp = {}
    if feature in pp and not oblige_fit:
        preproc = pp[feature]
    else:
        preproc = base()
        preproc.fit(fit_data)
        pp[feature] = preproc
        with open('preprocessors.pickle', 'wb') as file:
            pickle.dump(pp, file)
    return preproc

In [8]:
def numerical(df: pd.DataFrame, feature: str, fillmode: str, oblige_fit: bool = False, pp: dict = None):
    '''
    Fills NaNs with given fillmode
    Transforms via StandardScaler
    The feature column of df is overwritten with the scaled values

    :param df: Data to work with.
    :param feature: Name of a column in df that contains numerical data.
    :param fillmode: Method to fill NaNs, either 'mean' or 'zero';
                     any value other than 'mean' fills with zero.
    :param oblige_fit: Whether to fit new StandardScaler on feature even if there already exists one.
    :param pp: Object with preprocessors.
    '''
    na = df[feature].mean() if fillmode == 'mean' else 0
    # Assign the result back: an inplace fillna on df[feature] is a chained
    # operation that is not guaranteed to modify df itself
    df[feature] = df[feature].fillna(na)

    # The scaler is fitted on / applied to a single-column 2D array
    fit_data = df[feature].values.reshape(-1, 1).astype('float64')
    sc = _get_preprocessor(fit_data, feature, StandardScaler, oblige_fit, pp)
    # Flatten the (n, 1) result back to one value per row before storing it
    df[feature] = sc.transform(fit_data).ravel()

In [9]:
def categorical(df: pd.DataFrame, feature: str, n: int, oblige_fit: bool = False, pp: dict = None):
    '''
    Encodes top n most popular values with different labels from 0 to n-1,
    remaining values with n and NaNs with n+1
    The feature column of df is overwritten with the labels

    :param df: Data to work with.
    :param feature: Name of a column in df that contains categorical data.
    :param n: Number of top by popularity values to move in separate categories.
              0 to encode everything with different labels.
    :param oblige_fit: Whether to fit new LabelEncoder on feature even if there already exists one.
    :param pp: Object with preprocessors.
    '''
    vc = df[feature].value_counts()
    n = len(vc) if n == 0 else n

    top = set(vc[:n].index)
    isin_top = df[feature].isin(top)

    fit_data = df.loc[isin_top, feature]
    le = _get_preprocessor(fit_data, feature, LabelEncoder, oblige_fit, pp)

    # Compute every mask on the original values before the column is modified,
    # so that freshly written labels can never be mistaken for raw values
    known = set(le.classes_)
    isin_le = df[feature].isin(known)
    bottom = set(vc.index) - known
    isin_bottom = df[feature].isin(bottom)

    df.loc[isin_le, feature] = le.transform(df.loc[isin_le, feature])
    df.loc[isin_bottom, feature] = n
    # Assign the result back: an inplace fillna on df[feature] is a chained
    # operation that is not guaranteed to modify df itself
    df[feature] = df[feature].fillna(n + 1)