### Data preprocessing: cleaning the raw data

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from scipy.sparse import vstack, hstack

In [4]:
def format_raw_df(df):
    """
    Cleanup data and join questions to answers.

    Casts ``PostTypeId``, ``Id``, ``AnswerCount`` and ``OwnerUserId`` to int
    (missing ``AnswerCount`` / ``OwnerUserId`` values become -1), indexes the
    frame by ``Id`` while keeping ``Id`` as a column, and adds an
    ``is_question`` flag (``PostTypeId == 1``). Only rows whose ``PostTypeId``
    is 1 or 2 are kept (2 is presumably the answer type), and each row is
    left-joined to the row whose ``Id`` equals its ``ParentId``; the joined
    columns get a ``_question`` suffix.

    Note: the casts, the ``Id`` index and the ``is_question`` column are
    applied to the frame that is passed in; pass a copy to keep the raw data.

    :param df: raw DataFrame
    :return: processed DataFrame
    """
    df['PostTypeId'] = df['PostTypeId'].astype(int)
    df['Id'] = df['Id'].astype(int)
    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on a column selection: that chained form is deprecated and does not
    # update the frame when pandas Copy-on-Write is active.
    df['AnswerCount'] = df['AnswerCount'].fillna(-1).astype(int)
    df['OwnerUserId'] = df['OwnerUserId'].fillna(-1).astype(int)
    df.set_index('Id', inplace=True, drop=False)
    df['is_question'] = df['PostTypeId'] == 1

    df = df[df['PostTypeId'].isin([1, 2])]
    # Columns of the parent post that are copied onto each child row.
    question_columns = ['Id', 'Title', 'body_text', 'Score', 'AcceptedAnswerId']
    return df.join(
        df[question_columns],
        on='ParentId',
        how='left',
        rsuffix='_question',
    )

def add_v1_features(df):
    """
    Add our first hand-crafted features to an input DataFrame.

    All features are derived from the ``full_text`` column and are written
    onto ``df`` itself:

    - ``action_verb_full``: text contains 'can', 'What' or 'should'
    - ``language_question``: text contains 'punctuate', 'capitalize' or
      'should' (note that 'should' feeds both keyword features)
    - ``question_mark_full``: text contains '?'
    - ``text_len``: number of characters in the text

    :param df: DataFrame of questions with a 'full_text' column
    :return: the same DataFrame with the feature columns added
    """
    text = df['full_text']

    def mentions(term):
        # Literal (non-regex), case-sensitive substring test on every row.
        return text.str.contains(term, regex=False)

    df['action_verb_full'] = mentions('can') | mentions('What') | mentions('should')
    df['language_question'] = (
        mentions('punctuate') | mentions('capitalize') | mentions('should')
    )
    df['question_mark_full'] = mentions('?')
    df['text_len'] = text.str.len()
    return df

def train_vectorizer(df):
    """
    Fit a TF-IDF vectorizer on the ``full_text`` column of a DataFrame.

    :param df: DataFrame with a 'full_text' column
    :return: the fitted TfidfVectorizer
    """
    corpus = df['full_text'].copy()
    tfidf = TfidfVectorizer(
        strip_accents='ascii',
        min_df=5,
        max_df=0.5,
        max_features=10000,
    )
    # fit() returns the estimator itself, so it can be returned directly.
    return tfidf.fit(corpus)

def get_vectorized_series(text_series, vectorizer):
    """
    Vectorize texts and split the result into one entry per input row.

    :param text_series: the texts to vectorize
    :param vectorizer: object whose ``transform`` turns the texts into a
        row-indexable matrix (one row per text)
    :return: list holding row 0, row 1, ... of the transformed matrix
    """
    matrix = vectorizer.transform(text_series)
    row_count = matrix.shape[0]
    return [matrix[row_idx] for row_idx in range(row_count)]

def add_text_features_to_df(df):
    """
    Build the ``full_text`` column and add the v1 text features.

    ``full_text`` is ``Title`` and ``body_text`` joined by a single space,
    with missing values treated as empty strings. The features are then
    added by ``add_v1_features``.

    :param df: DataFrame with 'Title' and 'body_text' columns
    :return: a new DataFrame with 'full_text' and the feature columns added
    """
    # Copy before adding any column so the caller's frame is left untouched;
    # copying only after writing 'full_text' would still modify the input.
    df = df.copy()
    df['full_text'] = df['Title'].str.cat(df['body_text'], sep=' ', na_rep='')
    return add_v1_features(df)

def get_vectorized_inputs_and_labels(df):
    """
    Build the feature matrix and labels from a DataFrame.

    The per-row entries of the ``vectors`` column are stacked vertically and
    the hand-crafted feature columns are appended to the right of them.
    A row's label is True when its ``Score`` is above the median ``Score``.

    :param df: DataFrame with 'vectors', 'Score' and the feature columns
    :return: tuple of (feature array, label Series)
    """
    hand_crafted_columns = [
        'action_verb_full',
        'question_mark_full',
        'norm_text_length',
        'language_question',
    ]
    text_matrix = np.vstack(df['vectors'])
    # axis=1: place the hand-crafted columns next to the text vectors.
    vectorized_features = np.append(text_matrix, df[hand_crafted_columns], axis=1)

    median_score = df['Score'].median()
    label = df['Score'] > median_score
    return vectorized_features, label

def get_feature_vector_and_label(df, feature_names):
    """
    Combine the text vectors with the selected features and build labels.

    The entries of the ``vectors`` column are stacked with the sparse
    ``vstack`` and the columns named in ``feature_names`` (cast to float)
    are placed to their right with the sparse ``hstack``. A row's label is
    True when its ``Score`` is above the median ``Score``.

    :param df: DataFrame with 'vectors', 'Score' and the feature columns
    :param feature_names: names of the columns to append to the vectors
    :return: tuple of (sparse feature matrix, label Series)
    """
    scores = df['Score']
    labels = scores > scores.median()

    text_block = vstack(df['vectors'])
    extra_block = df[feature_names].astype(float)
    features = hstack([text_block, extra_block])
    return features, labels

def get_normalized_series(df, col):
    """
    Standardize one column: subtract its mean, divide by its std.

    :param df: DataFrame holding the column
    :param col: name of the column to normalize
    :return: Series of normalized values
    """
    column = df[col]
    centered = column - column.mean()
    return centered / column.std()

def get_random_train_test_split(posts, test_size=0.3, random_state=40):
    """
    Split posts into train and test sets with ``train_test_split``.

    :param posts: the posts to split
    :param test_size: forwarded to ``train_test_split`` as ``test_size``
    :param random_state: forwarded to ``train_test_split`` as ``random_state``
    :return: whatever ``train_test_split`` returns for a single input
    """
    train_and_test = train_test_split(
        posts,
        test_size=test_size,
        random_state=random_state,
    )
    return train_and_test

def get_split_by_author(
    posts, author_id_columns='OwnerUserId', test_size=0.3, random_state=40
):
    """
    Split posts into train and test sets, grouping rows by author.

    A single ``GroupShuffleSplit`` split is drawn, using the values of the
    ``author_id_columns`` column as the groups.

    :param posts: DataFrame of posts
    :param author_id_columns: name of the column holding the author id
    :param test_size: forwarded to ``GroupShuffleSplit`` as ``test_size``
    :param random_state: forwarded to ``GroupShuffleSplit`` as ``random_state``
    :return: tuple of (train rows, test rows)
    """
    author_ids = posts[author_id_columns]
    group_splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )
    # n_splits is 1, so only the first (train, test) index pair is taken.
    train_idx, test_idx = next(group_splitter.split(posts, groups=author_ids))
    train_posts = posts.iloc[train_idx, :]
    test_posts = posts.iloc[test_idx, :]
    return train_posts, test_posts