In [18]:
import pandas as pd
import spacy
import umap
import numpy as np
from pathlib import Path
import sys
sys.path.append('..')

import warnings
warnings.filterwarnings('ignore')
from ipynb.fs.full.data_processing import format_raw_df, get_random_train_test_split, get_vectorized_inputs_and_labels, get_split_by_author

data_path = Path('../data/writers.csv')
df = pd.read_csv(data_path)
df = format_raw_df(df.copy())

In [19]:
train_df_rand, test_df_rand = get_random_train_test_split(df[df['is_question']], test_size = 0.3, random_state =40)

In [20]:
print('%s questions in training, %s in test.'% (len(train_df_rand), len(test_df_rand)))
train_owners = set(train_df_rand['OwnerUserId'].values)
test_owners = set(test_df_rand['OwnerUserId'].values)

print('%s different owners in training set' % len(train_df_rand))
print('%s different owners in testing set'% len(test_df_rand))
print('%s owners appear in both sets' % len(train_owners.intersection(test_owners)))

5579 questions in training, 2392 in test.
5579 different owners in training set
2392 different owners in testing set
596 owners appear in both sets


To make sure we are accurately judging question quality, we would want to make sure that a given author only appears in either the training set or the validation set. This guarantee that a model will not be able to leverage information to identify a given author and use it to predict more easily.

In [24]:
train_author, test_author = get_split_by_author(df[df['is_question']], test_size = 0.3, random_state = 40)

print('%s questions in training, %s in test'% (len(train_author), len(test_author)))
train_owners = set(train_author['OwnerUserId'].values)
test_owners = set(test_author['OwnerUserId'].values)
print('%s different owners in the training set'% len(train_owners))
print('%s different owners in the testing set'% len(test_owners))
print('%s owners appear in both training set and test set'% len(train_owners.intersection(test_owners)))

5676 questions in training, 2295 in test
2723 different owners in the training set
1167 different owners in the testing set
0 owners appear in both training set and test set
