In [1]:
# Switch the working directory to the project root so the relative data/ paths used below resolve.
# NOTE(review): this path is tied to the author's home-directory layout — adjust when running elsewhere.
%cd ~/RATER-C

/home/daved/RATER-C


In [2]:
import os
# Expose only CUDA device 0 to libraries loaded after this point.
# NOTE(review): no GPU code is visible in this part of the notebook — confirm this is still needed.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import pandas as pd
import numpy as np
import random

# Fixed seed so the shuffle and the train/val/test sampling below are repeatable.
seed = 123
# Dedicated RNG instance; its state advances with every shuffle/sample call, so cell order matters.
rand = random.Random(seed)
# Also seed NumPy's legacy global RNG (the cells visible here do not draw from it directly).
np.random.seed(seed)

In [3]:
# Load the raw CSV and report how many rows it holds.
df = pd.read_csv('data/raw/Training and validation data.csv')
df.shape[0]

291119

In [4]:
# Number of distinct sources in the raw data.
df['SourceId'].nunique()

3337

In [5]:
df[df['ItemText'].isnull()].SourceId.unique() # sources with a missing ItemText: none

array([], dtype=int64)

In [6]:
# Sources that have at least one row without a definition (only SourceId 11392).
df.loc[df['definition'].isna(), 'SourceId'].unique()

array([11392])

In [7]:
# Number of rows whose definition is missing (9).
int(df['definition'].isna().sum())

9

In [8]:
# Number of rows that belong to source 11392 (27).
int((df['SourceId'] == 11392).sum())

27

In [9]:
# Drop every source that has at least one row without a definition. The affected
# sources are derived from the data instead of hard-coding the SourceId (11392 in
# the current file), so the cell stays correct if the raw data changes.
# Re-running the cell is a no-op because no missing definitions remain afterwards.
sources_missing_def = df.loc[df['definition'].isna(), 'SourceId'].unique()
df = df[~df['SourceId'].isin(sources_missing_def)]
len(df)

291092

In [10]:
# Find sources with no target variance, i.e. sources whose rows all share a single Target value.
# The number of distinct targets per source is computed once and reused for both the
# index and the boolean mask (the previous one-liner ran the same groupby pipeline twice).
targets_per_source = df.groupby('SourceId')['Target'].nunique()
noVar = targets_per_source.index[targets_per_source == 1]
len(noVar)

893

In [11]:
# Spot-check the first no-variance source: show its id and the distinct targets it has.
first_no_var = noVar[0]
print(first_no_var)
print(df.loc[df['SourceId'] == first_no_var, 'Target'].unique())

72
[1]


In [12]:
# Keep only the sources that are not in noVar, then count the sources that remain.
has_no_variance = df['SourceId'].isin(noVar)
clean_df = df[~has_no_variance]
clean_df['SourceId'].nunique()

2443

In [13]:
# Row count after removing the no-variance sources.
print(clean_df.shape[0])

287426


In [14]:
# Number of rows per Target value in the cleaned data.
clean_df.groupby('Target')['Target'].count()

Target
0    245854
1     41572
Name: Target, dtype: int64

In [15]:
# Put the sources in a random order and lay the rows out source by source in that order.
ids = clean_df['SourceId'].unique()
# Shuffles `ids` in place and advances `rand`, so the samples drawn in later cells depend on this call.
rand.shuffle(ids)
# .loc with the shuffled ids pulls each source's rows in shuffled-source order;
# reset_index turns SourceId back into a regular column.
final_df = clean_df.set_index('SourceId').loc[ids].reset_index()

In [16]:
# Split at the source level: each SourceId is assigned to exactly one of train / val / test.
# 80% of the sources (rounded) are sampled for training.
train_sources = rand.sample(list(ids), k = round(len(list(ids)) * 0.8))
# train_sources is drawn from ids, so the symmetric difference (^) leaves exactly the sources not chosen for train.
other_sources = set(ids) ^ set(train_sources)
# Half of the remaining sources (rounded) go to validation.
# NOTE(review): the population passed to sample() follows the set's iteration order — keep this
# expression unchanged if the exact split must be reproduced.
val_sources = rand.sample(list(other_sources), k = round(len(list(other_sources)) * 0.5))
# Whatever is left after removing the validation sources becomes the test partition.
test_sources = set(other_sources) ^ set(val_sources)

# Sanity check: the three partitions together account for every source.
len(ids) == len(train_sources) + len(val_sources) + len(test_sources)

True

In [17]:
# Materialise the row-level partitions from the source-level split.
source_of_row = final_df['SourceId']
train = final_df[source_of_row.isin(train_sources)]
val = final_df[source_of_row.isin(val_sources)]
test = final_df[source_of_row.isin(test_sources)]

In [18]:
# Row count of the training partition and its share of all rows.
n_train_rows = len(train)
print(n_train_rows, n_train_rows / len(final_df), sep = '\n')

227474
0.7914176170562162


In [19]:
# Row count of the validation partition and its share of all rows.
n_val_rows = len(val)
print(n_val_rows, n_val_rows / len(final_df), sep = '\n')

26648
0.09271255905867945


In [20]:
# Row count of the test partition and its share of all rows.
n_test_rows = len(test)
print(n_test_rows, n_test_rows / len(final_df), sep = '\n')

33304
0.11586982388510433


In [21]:
# Write the three partitions to one workbook, one sheet per partition (train, val, test in that order).
with pd.ExcelWriter('data/processed/train_val_test.xlsx') as writer:
    for sheet, frame in (('train', train), ('val', val), ('test', test)):
        frame.to_excel(writer, sheet_name = sheet, index = False)

In [22]:
# Training rows per Area as a fraction of ALL rows (denominator is final_df, not train),
# so the shares add up to the training fraction rather than to 1.
train.groupby('Area')['Area'].count().div(len(final_df))

Area
BM            0.040000
Education     0.028376
IS            0.380860
Management    0.166286
Marketing     0.023937
Nursing       0.034656
Psychology    0.096227
Sociology     0.021077
Name: Area, dtype: float64

In [23]:
# Number of distinct sources per Area; rename(None) keeps the resulting Series unnamed.
final_df.groupby('Area')['SourceId'].nunique().rename(None)

Area
BM            188
Education      82
IS            654
Management    694
Marketing     134
Nursing       160
Psychology    401
Sociology     130
dtype: int64

In [24]:
# Number of distinct sources in the final (shuffled) frame.
final_df['SourceId'].nunique()

2443