# Approach

The model input merges each post's title and body with little or no additional cleanup: titles tend to be well-formed questions that respect natural-language structure, and they are sometimes the only piece of text in a post.

Because score differences are easier to interpret on a log scale, the target is transformed as log(1 + score).

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the posts table; the first CSV column is used as the DataFrame index.
reddit_df = pd.read_csv(
    './askscience_data.csv',
    index_col=0,
)

def save_data_split_as_jsonlines(split, indxs, df, cols=('title', 'full_text', 'score')):
    """Write one data split to ``./data/reddit_scores_{split}.jsonlines``.

    Parameters
    ----------
    split : str
        Name of the split; it is interpolated into the output file name.
    indxs : array-like
        Index labels of ``df`` selecting the rows that belong to this split.
    df : pandas.DataFrame
        Frame holding at least the columns named in ``cols``.
    cols : sequence of str, optional
        Columns to export, in order. The title is included so that each
        record carries an identifier.

    Notes
    -----
    The output directory is created when missing, because
    ``DataFrame.to_json`` does not create parent directories itself.
    One JSON object is written per line (``orient="records", lines=True``).
    """
    from pathlib import Path  # local import keeps the cell self-contained

    f_name = f'./data/reddit_scores_{split}.jsonlines'

    # Make sure ./data exists so the write cannot fail on a fresh checkout.
    Path(f_name).parent.mkdir(parents=True, exist_ok=True)

    print(f'Saving {len(indxs)} records to {f_name}')

    # .loc treats a tuple as a single (MultiIndex) key, so hand it a list;
    # a bare string is passed through unchanged.
    columns = cols if isinstance(cols, str) else list(cols)

    # include title as an identifier
    df.loc[indxs, columns].to_json(f_name, orient="records", lines=True)

In [2]:
# merge title and body
# Missing values are replaced by empty strings first: str(NaN) is the literal
# 'nan', which would otherwise be appended to posts that only have a title.
title_text = reddit_df['title'].fillna('').astype(str)
body_text = reddit_df['body'].fillna('').astype(str)
reddit_df['full_text'] = (title_text + ' ' + body_text).str.strip()

# Log-transform the target: log(1 + score). The untouched score is kept in
# 'score_raw' so that re-running this cell does not apply the log twice.
if 'score_raw' not in reddit_df.columns:
    reddit_df['score_raw'] = reddit_df['score']
reddit_df['score'] = np.log1p(reddit_df['score_raw'])

# create splits
# shuffle=False preserves row order: the last 20% of rows form the test set,
# and the last 10% of the remaining rows form the validation set.
fit_indxs, test_indxs = train_test_split(reddit_df.index.values, test_size=0.2, shuffle=False)
train_indxs, val_indxs = train_test_split(fit_indxs, test_size=0.1, shuffle=False)

# save splits
for split_name, split_indxs in (
    ('test', test_indxs),
    ('train', train_indxs),
    ('val', val_indxs),
):
    save_data_split_as_jsonlines(split_name, split_indxs, reddit_df)

Saving 841 records to ./data/reddit_scores_test.jsonlines
Saving 3027 records to ./data/reddit_scores_train.jsonlines
Saving 337 records to ./data/reddit_scores_val.jsonlines
