# AILAB_splitter Dataset Stats

In [1]:
import pandas as pd
import re

In [2]:
DATA_PATH = '/mnt/nas/victor_splitter/data/104pecas/uniao_104_ailab.parquet.gzip'
TRAIN_LABEL_PATH = '/mnt/nas/victor_splitter/data/104pecas/uniao_104_ailab.train'
TEST_LABEL_PATH = '/mnt/nas/victor_splitter/data/104pecas/uniao_104_ailab.val'

In [3]:
df_train = pd.read_csv(TRAIN_LABEL_PATH, sep=';', skiprows=0, low_memory=False)
df_test = pd.read_csv(TEST_LABEL_PATH,sep=';', skiprows=0, low_memory=False) 

In [4]:
data_df = pd.read_parquet(DATA_PATH)

In [5]:
df_train = pd.merge(df_train,
                 data_df[['path', 'page', 'img']],
                 left_on='page_name', 
                 how='left', 
                 right_index=True)

In [6]:
df_test = pd.merge(df_test,
                 data_df[['path', 'page', 'img']],
                 left_on='page_name', 
                 how='left', 
                 right_index=True)

In [7]:
len(df_train), len(df_test)

(25694, 6095)

In [8]:
len(df_train)+ len(df_test)

31789

In [9]:
del data_df

In [10]:
def get_extended_class(x):
    last_class = x['last_class']
    target_class = x['class']
    next_class = x['next_class']
    
    if target_class == 'FirstPage':
        if next_class == 'FirstPage': 
            x['extended_class'] = 'single page'
            return x
        else:
            x['extended_class'] = 'first of many'
        return x
    elif target_class == 'NextPage':
        if next_class == 'NextPage': 
            x['extended_class'] = 'middle'
            return x
        else:
            x['extended_class'] = 'last page'
        return x
    
def get_doc_ids (df):    
    doc_ids = []
    suffix = 1
    for index in range (len(df)):
        row = df.iloc[index]
        if row['extended_class'] == 'single page' or row['extended_class'] == 'first of many':
            suffix += 1
        doc_ids.append(row['path'][:-4] + '_'+ str(suffix))        
    return doc_ids

def add_extended_class_column(df):
    unk_class = pd.Series(['UNK'])
    last_class = unk_class.append(df['class'].iloc[:-1], ignore_index=True)
    df['last_class'] = last_class.values   

    next_class = df['class'].iloc[1:]
    next_class = next_class.append(unk_class, ignore_index=True)
    df['next_class'] = next_class.values    
    df = df.apply(get_extended_class, axis=1)
    
    df['doc_id'] = get_doc_ids(df)
    return df.copy()

# Volume Sample to Valid

In [12]:
s1 = df_train.page_name
s2 = df_test.page_name
list(set(s1).intersection(set(s2)))

[]

## Fixing Same Lawsuits Presence in Train and Test Split

### Set Lawsuits for each row in splits

In [13]:
def add_lawsuit_column(row):
    page_name = row['page_name']
    match = re.match(r"^([A-Z]+_\d+)_.*$", page_name)
    if match:
        row['lawsuit'] = match[1]
    else:
        row['lawsuit'] = 'UNK'
    return row

In [14]:
df_test = df_test.apply(add_lawsuit_column, axis=1)
df_train = df_train.apply(add_lawsuit_column, axis=1)

In [15]:
df_train = add_extended_class_column(df_train)
df_test = add_extended_class_column(df_test)

### Highlight lawsuit intersections

In [17]:
s1 = df_train.lawsuit
s2 = df_test.lawsuit
interctions = list(set(s1).intersection(set(s2)))

In [21]:
interctions[-10:]

['ARE_1222128',
 'ARE_1222527',
 'ARE_1223927',
 'ARE_1225968',
 'ARE_1225752',
 'ARE_1231854',
 'ARE_1233093',
 'ARE_1224103',
 'ARE_1229112',
 'ARE_1223636']

#### Show amount of pages and percentage wro intersection 

In [22]:
len(df_train[df_train['lawsuit'].isin(interctions)]),len(df_train),len(df_train[df_train['lawsuit'].isin(interctions)])/len(df_train)

(5230, 25694, 0.20354946680158792)

In [23]:
len(df_test[df_test['lawsuit'].isin(interctions)]),len(df_test),len(df_test[df_test['lawsuit'].isin(interctions)])/len(df_test)

(1277, 6095, 0.2095159967186218)

### Get the amount of pages for each lawsuit in the splits

In [24]:
test_lawsuits_counter = df_test[df_test['lawsuit'].isin(interctions)].lawsuit.value_counts(); test_lawsuits_counter

ARE_1221810    164
ARE_1222758     75
ARE_1223751     58
AI_868292       48
ARE_1222527     45
              ... 
ARE_1226310      1
RE_1194423       1
ARE_1223878      1
ARE_1223877      1
ARE_1223649      1
Name: lawsuit, Length: 104, dtype: int64

In [25]:
train_lawsuits_counter = df_train[df_train['lawsuit'].isin(interctions)].lawsuit.value_counts(); train_lawsuits_counter

ARE_1221810    1227
ARE_1223112     549
ARE_1226572     204
ARE_1222901     150
ARE_1221963     140
               ... 
ARE_1235684       1
ARE_1223877       1
ARE_1223878       1
ARE_1223486       1
ARE_1223998       1
Name: lawsuit, Length: 104, dtype: int64

### Get the amount of pages for each lawsuit from both splits

start with train

In [26]:
lawsuit_pages_count = {}
for key, value in train_lawsuits_counter.items():
    if lawsuit_pages_count.get(key):
        lawsuit_pages_count[key]+= value
    else:
        lawsuit_pages_count[key] = value

continue with test

In [27]:
for key, value in test_lawsuits_counter.items():
    if lawsuit_pages_count.get(key):
        lawsuit_pages_count[key]+= value
    else:
        lawsuit_pages_count[key] = value
sample = next(iter(lawsuit_pages_count))
print(sample,lawsuit_pages_count[sample])

ARE_1221810 1391


#### Generate a list to flip between train and test according to a ratio

In [28]:
def flip_lawsuit_samples_by_ratio(lawsuit_pages_count, ratio):
    '''
        Triage of a list of lawsuit according to their amount of pages, 
        in order to respect a process ratio between train and test
        
        Retur
    '''
    issue_train = 0
    issue_test = 0
    ratio = .8
    flip = {}
    for key, value in lawsuit_pages_count.items():
        if issue_train < 1:
            flip['train']=[key]
            issue_train+=value
            continue
        if issue_test < 1:
            flip['test']=[key]
            issue_test+=value
            continue
        if issue_train / (issue_train + issue_test) < ratio:
            #issue test
            flip['train'].append(key)
            issue_train+=value        
        else:
            #issue train
            flip['test'].append(key)
            issue_test+=value
    return flip

In [29]:
flip = flip_lawsuit_samples_by_ratio(lawsuit_pages_count, 0.8)

## Flipping Workflow

In [30]:
df_train['split'] = 'train'
df_test['split'] = 'test'

In [31]:
stats_df = pd.concat([df_train, df_test], ignore_index=True)

In [32]:
print(len(df_train),len(df_test), len(stats_df)) 

25694 6095 31789


In [33]:
stats_df.loc[stats_df['lawsuit'].isin(flip['train']), 'split'] = 'train'

In [34]:
stats_df.loc[stats_df['lawsuit'].isin(flip['test']), 'split'] = 'test'

In [35]:
df_test = stats_df[stats_df.split=='test']

In [36]:
df_train = stats_df[stats_df.split=='train']

In [37]:
s1 = df_train.lawsuit
s2 = df_test.lawsuit
interctions = list(set(s1).intersection(set(s2))); interctions

[]

`the empty list above means that there is no lawsuit intersection between test and train`

### Lets split the train/valid data

In [38]:
# sort raws by path and page, to make it easy to split by the end
df_train = df_train.sort_values(by=['path', 'page'], ignore_index=True)

In [39]:
def split_test_valid(df_train):
    print(len(df_train))
    
    ### Take one of the volume project files `ARE_1178596`
    df_val = df_train.loc[df_train['path'].str.contains("ARE_1178596")]
    df_train.drop(df_val.index, inplace=True)
    
    ### Take the 4800 tail of the train dataframe for valid
    df_val_aux = df_train.iloc[-4773:]
    df_val = df_val.append(df_val_aux, ignore_index=True)
    df_train.drop(df_val_aux.index, inplace=True)
    print(len(df_train), len(df_val))
    return df_train, df_val

In [40]:
df_train[['page_name','lawsuit']].iloc[-4810:-4769]

Unnamed: 0,page_name,lawsuit
20858,RE_656286_872193_30_05082013-2,RE_656286
20859,RE_656286_872193_30_05082013-3,RE_656286
20860,RE_656286_872193_30_05082013-4,RE_656286
20861,RE_656286_872193_30_05082013-5,RE_656286
20862,RE_656286_872200_12_05082013-1,RE_656286
20863,RE_656286_872200_12_05082013-2,RE_656286
20864,RE_656286_872200_12_05082013-3,RE_656286
20865,RE_656286_872200_12_05082013-4,RE_656286
20866,RE_656286_872200_12_05082013-5,RE_656286
20867,RE_656286_872200_12_05082013-6,RE_656286


In [41]:
train_lawsuits_counter = df_train.lawsuit.value_counts(); train_lawsuits_counter

ARE_1221810    1391
ARE_1106715     984
ARE_1178596     713
ARE_1092927     535
ARE_1095892     424
               ... 
RE_1234319        1
ARE_1033191       1
RE_1236130        1
RE_1226564        1
ARE_1039407       1
Name: lawsuit, Length: 1413, dtype: int64

In [42]:
lawsuit_pages_count = {}
for key, value in train_lawsuits_counter.items():
    if lawsuit_pages_count.get(key):
        lawsuit_pages_count[key]+= value
    else:
        lawsuit_pages_count[key] = value

In [43]:
flip = flip_lawsuit_samples_by_ratio(lawsuit_pages_count, 0.9)

In [44]:
df_train.loc[df_train['lawsuit'].isin(flip['train']), 'split'] = 'train'

In [45]:
df_train.loc[df_train['lawsuit'].isin(flip['test']), 'split'] = 'valid'

In [46]:
df_val = df_train[df_train.split=='valid'].copy()

In [47]:
df_train = df_train[df_train.split=='train'].copy()

In [56]:
df_train.columns

Index(['binder', 'page_name', 'class', 'new_process', 'prev_page_name', 'path',
       'page', 'img', 'lawsuit', 'last_class', 'next_class', 'extended_class',
       'doc_id', 'split'],
      dtype='object')

## Stats

In [91]:
def print_df_stats(df):
    n_lawsuits = len(df['lawsuit'].unique())
    n_files = len(df['path'].unique())
    distro_2 = df['class'].value_counts()
    distro_4 = df['extended_class'].value_counts()
    docs = len(df['doc_id'].unique())
    total = len(df)
    print(f'#lawsuits {n_lawsuits}')
    print('\n')
    print(f'#files {n_files}')
    print('\n')
    print(f'#Docs {docs}')
    print('\n')    
    print(f'-- two classes distribution -- \n{distro_2}')
    print('\n')
    print(f'-- three classes distribution -- \n{distro_4}')
    print('\n')
    print(f'# Total pages {total}')          
    return n_lawsuits, n_files, distro_2, distro_4, docs, total

In [92]:
summary = [print_df_stats(df_train), print_df_stats(df_val) , print_df_stats(df_test)]

#lawsuits 1133


#files 2751


#Docs 3077


-- two classes distribution -- 
NextPage     17464
FirstPage     3070
Name: class, dtype: int64


-- three classes distribution -- 
middle           15545
last page         1919
first of many     1917
single page       1153
Name: extended_class, dtype: int64


# Total pages 20534
#lawsuits 280


#files 642


#Docs 1363


-- two classes distribution -- 
NextPage     3771
FirstPage    1363
Name: class, dtype: int64


-- three classes distribution -- 
middle           3263
single page       854
first of many     509
last page         508
Name: extended_class, dtype: int64


# Total pages 5134
#lawsuits 456


#files 899


#Docs 1063


-- two classes distribution -- 
NextPage     5067
FirstPage    1054
Name: class, dtype: int64


-- three classes distribution -- 
middle           4526
last page         541
first of many     540
single page       514
Name: extended_class, dtype: int64


# Total pages 6121


In [101]:
stats_df = pd.concat([df_train, df_val, df_test], ignore_index=True)

In [147]:
print_df_stats(df_train)

#lawsuits 1133


#files 2751


#Docs 3077


-- two classes distribution -- 
NextPage     17464
FirstPage     3070
Name: class, dtype: int64


-- three classes distribution -- 
middle           15545
last page         1919
first of many     1917
single page       1153
Name: extended_class, dtype: int64


# Total pages 20534


(1133,
 2751,
 NextPage     17464
 FirstPage     3070
 Name: class, dtype: int64,
 middle           15545
 last page         1919
 first of many     1917
 single page       1153
 Name: extended_class, dtype: int64,
 3077,
 20534)

In [96]:
col = 0
#lawsuits
print('lawsuits', summary[0][col] + summary[1][col] + summary[2][col])
col = 1
print('files ',summary[0][col] + summary[1][col] + summary[2][col])

col = 5
print('total',summary[0][col] + summary[1][col] + summary[2][col])

lawsuits 1869
files  4292
total 31789


In [135]:
stats_df_ds2 = stats_df.loc[stats_df['lawsuit'].str.contains("ARE_1178596|ARE_1106715|ARE_1177294")]

In [136]:
stats_df.loc[stats_df['lawsuit'].str.contains("ARE_1106715")]#|ARE_1178596")]#|ARE_1177294")]

Unnamed: 0,binder,page_name,class,new_process,prev_page_name,path,page,img,lawsuit,last_class,next_class,extended_class,doc_id,split
22240,ailab,ARE_1106715_15339200896_60_24052019-1,FirstPage,no,ARE_1106715_313661377_60_14022018-490,ARE_1106715_15339200896_60_24052019.pdf,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,FirstPage,NextPage,first of many,ARE_1106715_15339200896_60_24052019_2252,valid
22241,ailab,ARE_1106715_15339200896_60_24052019-2,NextPage,no,ARE_1106715_15339200896_60_24052019-1,ARE_1106715_15339200896_60_24052019.pdf,2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,FirstPage,NextPage,middle,ARE_1106715_15339200896_60_24052019_2252,valid
22242,ailab,ARE_1106715_15339200896_60_24052019-3,NextPage,no,ARE_1106715_15339200896_60_24052019-2,ARE_1106715_15339200896_60_24052019.pdf,3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,NextPage,FirstPage,last page,ARE_1106715_15339200896_60_24052019_2252,valid
22243,ailab,ARE_1106715_15339200896_60_24052019-4,FirstPage,no,ARE_1106715_15339200896_60_24052019-3,ARE_1106715_15339200896_60_24052019.pdf,4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,NextPage,FirstPage,single page,ARE_1106715_15339200896_60_24052019_2253,valid
22244,ailab,ARE_1106715_15339200896_60_24052019-5,FirstPage,no,ARE_1106715_15339200896_60_24052019-4,ARE_1106715_15339200896_60_24052019.pdf,5,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,FirstPage,FirstPage,single page,ARE_1106715_15339200896_60_24052019_2254,valid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23219,ailab,ARE_1106715_313661377_60_14022018-486,FirstPage,no,ARE_1106715_313661377_60_14022018-485,ARE_1106715_313661377_60_14022018.pdf,486,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,FirstPage,FirstPage,single page,ARE_1106715_313661377_60_14022018_2247,valid
23220,ailab,ARE_1106715_313661377_60_14022018-487,FirstPage,no,ARE_1106715_313661377_60_14022018-486,ARE_1106715_313661377_60_14022018.pdf,487,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,FirstPage,FirstPage,single page,ARE_1106715_313661377_60_14022018_2248,valid
23221,ailab,ARE_1106715_313661377_60_14022018-488,FirstPage,no,ARE_1106715_313661377_60_14022018-487,ARE_1106715_313661377_60_14022018.pdf,488,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,FirstPage,FirstPage,single page,ARE_1106715_313661377_60_14022018_2249,valid
23222,ailab,ARE_1106715_313661377_60_14022018-489,FirstPage,no,ARE_1106715_313661377_60_14022018-488,ARE_1106715_313661377_60_14022018.pdf,489,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1106715,FirstPage,FirstPage,single page,ARE_1106715_313661377_60_14022018_2250,valid


In [137]:
stats_df_ds1 = stats_df.loc[~stats_df['lawsuit'].str.contains("ARE_1178596|ARE_1106715|ARE_1177294")]

In [120]:
stats_df_ds2

Unnamed: 0,binder,page_name,class,new_process,prev_page_name,path,page,img,lawsuit,last_class,next_class,extended_class,doc_id,split
11319,ailab,ARE_1178596_15339146471_60_28112018-1,NextPage,yes,ARE_1106715_15339200896_60_24052019-494,ARE_1178596_15339146471_60_28112018.pdf,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
11320,ailab,ARE_1178596_15339146471_60_28112018-2,NextPage,no,ARE_1178596_15339146471_60_28112018-1,ARE_1178596_15339146471_60_28112018.pdf,2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
11321,ailab,ARE_1178596_15339146471_60_28112018-3,NextPage,no,ARE_1178596_15339146471_60_28112018-2,ARE_1178596_15339146471_60_28112018.pdf,3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
11322,ailab,ARE_1178596_15339146471_60_28112018-4,NextPage,no,ARE_1178596_15339146471_60_28112018-3,ARE_1178596_15339146471_60_28112018.pdf,4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
11323,ailab,ARE_1178596_15339146471_60_28112018-5,NextPage,no,ARE_1178596_15339146471_60_28112018-4,ARE_1178596_15339146471_60_28112018.pdf,5,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31137,ailab,ARE_1177294_15339107099_60_22112018-8,FirstPage,no,ARE_1177294_15339107099_60_22112018-7,ARE_1177294_15339107099_60_22112018.pdf,8,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1177294,FirstPage,FirstPage,single page,ARE_1177294_15339107099_60_22112018_597,test
31138,ailab,ARE_1177294_15339107099_60_22112018-9,FirstPage,no,ARE_1177294_15339107099_60_22112018-8,ARE_1177294_15339107099_60_22112018.pdf,9,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1177294,FirstPage,FirstPage,single page,ARE_1177294_15339107099_60_22112018_598,test
31139,ailab,ARE_1177294_15339107099_60_22112018-10,FirstPage,no,ARE_1177294_15339107099_60_22112018-9,ARE_1177294_15339107099_60_22112018.pdf,10,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1177294,FirstPage,FirstPage,single page,ARE_1177294_15339107099_60_22112018_599,test
31140,ailab,ARE_1177294_15339107099_60_22112018-11,FirstPage,no,ARE_1177294_15339107099_60_22112018-10,ARE_1177294_15339107099_60_22112018.pdf,11,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1177294,FirstPage,FirstPage,single page,ARE_1177294_15339107099_60_22112018_600,test


In [128]:
#stats_df.loc[stats_df['lawsuit'].str.contains("ARE_1178596|ARE_1178596|ARE_1177294")]
stats_df_ds1.loc[stats_df_ds1['lawsuit'].str.contains("ARE_1178596|ARE_1178596|ARE_1177294")]

Unnamed: 0,binder,page_name,class,new_process,prev_page_name,path,page,img,lawsuit,last_class,next_class,extended_class,doc_id,split


In [119]:
df_train.loc[df_train['lawsuit'].str.contains("ARE_1178596|ARE_1178596|ARE_1177294")]

Unnamed: 0,binder,page_name,class,new_process,prev_page_name,path,page,img,lawsuit,last_class,next_class,extended_class,doc_id,split
14348,ailab,ARE_1178596_15339146471_60_28112018-1,NextPage,yes,ARE_1106715_15339200896_60_24052019-494,ARE_1178596_15339146471_60_28112018.pdf,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
14349,ailab,ARE_1178596_15339146471_60_28112018-2,NextPage,no,ARE_1178596_15339146471_60_28112018-1,ARE_1178596_15339146471_60_28112018.pdf,2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
14350,ailab,ARE_1178596_15339146471_60_28112018-3,NextPage,no,ARE_1178596_15339146471_60_28112018-2,ARE_1178596_15339146471_60_28112018.pdf,3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
14351,ailab,ARE_1178596_15339146471_60_28112018-4,NextPage,no,ARE_1178596_15339146471_60_28112018-3,ARE_1178596_15339146471_60_28112018.pdf,4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
14352,ailab,ARE_1178596_15339146471_60_28112018-5,NextPage,no,ARE_1178596_15339146471_60_28112018-4,ARE_1178596_15339146471_60_28112018.pdf,5,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15056,ailab,ARE_1178596_15339146477_60_28112018-183,FirstPage,no,ARE_1178596_15339146477_60_28112018-182,ARE_1178596_15339146477_60_28112018.pdf,183,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,FirstPage,NextPage,first of many,ARE_1178596_15339146477_60_28112018_2906,train
15057,ailab,ARE_1178596_15339146477_60_28112018-184,NextPage,no,ARE_1178596_15339146477_60_28112018-183,ARE_1178596_15339146477_60_28112018.pdf,184,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,FirstPage,NextPage,middle,ARE_1178596_15339146477_60_28112018_2906,train
15058,ailab,ARE_1178596_15339146477_60_28112018-185,NextPage,no,ARE_1178596_15339146477_60_28112018-184,ARE_1178596_15339146477_60_28112018.pdf,185,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146477_60_28112018_2906,train
15059,ailab,ARE_1178596_15339146477_60_28112018-186,NextPage,no,ARE_1178596_15339146477_60_28112018-185,ARE_1178596_15339146477_60_28112018.pdf,186,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146477_60_28112018_2906,train


In [51]:
print(len(df_train), len(df_val), len(df_test), len(df_train)+ len(df_val)+ len(df_test))

20534 5134 6121 31789


In [52]:
s1 = df_train.lawsuit
s2 = df_val.lawsuit
interctions = list(set(s1).intersection(set(s2))); interctions

[]

## Export Data

In [225]:
EXPORT_PATH = '/mnt/nas/victor_splitter/data/isjeaai'

In [226]:
df_train.to_parquet(EXPORT_PATH+'/df_train.parquet.gzip',
              compression='gzip')

In [228]:
df_val.to_parquet(EXPORT_PATH+'/df_val.parquet.gzip',
              compression='gzip')

In [229]:
df_test.to_parquet(EXPORT_PATH+'/df_test.parquet.gzip',
              compression='gzip')

In [211]:
df_train.loc[df_train['path'].str.contains("ARE_1178596")]

Unnamed: 0,binder,page_name,class,new_process,prev_page_name,path,page,img,lawsuit,last_class,next_class,extended_class,doc_id,split
14348,ailab,ARE_1178596_15339146471_60_28112018-1,NextPage,yes,ARE_1106715_15339200896_60_24052019-494,ARE_1178596_15339146471_60_28112018.pdf,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
14349,ailab,ARE_1178596_15339146471_60_28112018-2,NextPage,no,ARE_1178596_15339146471_60_28112018-1,ARE_1178596_15339146471_60_28112018.pdf,2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
14350,ailab,ARE_1178596_15339146471_60_28112018-3,NextPage,no,ARE_1178596_15339146471_60_28112018-2,ARE_1178596_15339146471_60_28112018.pdf,3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
14351,ailab,ARE_1178596_15339146471_60_28112018-4,NextPage,no,ARE_1178596_15339146471_60_28112018-3,ARE_1178596_15339146471_60_28112018.pdf,4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
14352,ailab,ARE_1178596_15339146471_60_28112018-5,NextPage,no,ARE_1178596_15339146471_60_28112018-4,ARE_1178596_15339146471_60_28112018.pdf,5,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146471_60_28112018_2608,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15056,ailab,ARE_1178596_15339146477_60_28112018-183,FirstPage,no,ARE_1178596_15339146477_60_28112018-182,ARE_1178596_15339146477_60_28112018.pdf,183,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,FirstPage,NextPage,first of many,ARE_1178596_15339146477_60_28112018_2906,train
15057,ailab,ARE_1178596_15339146477_60_28112018-184,NextPage,no,ARE_1178596_15339146477_60_28112018-183,ARE_1178596_15339146477_60_28112018.pdf,184,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,FirstPage,NextPage,middle,ARE_1178596_15339146477_60_28112018_2906,train
15058,ailab,ARE_1178596_15339146477_60_28112018-185,NextPage,no,ARE_1178596_15339146477_60_28112018-184,ARE_1178596_15339146477_60_28112018.pdf,185,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146477_60_28112018_2906,train
15059,ailab,ARE_1178596_15339146477_60_28112018-186,NextPage,no,ARE_1178596_15339146477_60_28112018-185,ARE_1178596_15339146477_60_28112018.pdf,186,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1178596,NextPage,NextPage,middle,ARE_1178596_15339146477_60_28112018_2906,train


In [91]:
df_train[df_train.lawsuit=='ARE_1225697']#['path'].value_counts

Unnamed: 0,binder,page_name,class,new_process,prev_page_name,path,page,img,lawsuit
23707,ailab,ARE_1225697_15340788116_0_09082019-1,FirstPage,no,ARE_1225693_15340787823_0_09082019-22,1225697/ARE_1225697_15340788116_0_09082019.pdf,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23708,ailab,ARE_1225697_15340788116_0_09082019-2,NextPage,no,ARE_1225697_15340788116_0_09082019-1,1225697/ARE_1225697_15340788116_0_09082019.pdf,2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23709,ailab,ARE_1225697_15340788116_0_09082019-3,NextPage,no,ARE_1225697_15340788116_0_09082019-2,1225697/ARE_1225697_15340788116_0_09082019.pdf,3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23710,ailab,ARE_1225697_15340788116_0_09082019-4,NextPage,no,ARE_1225697_15340788116_0_09082019-3,1225697/ARE_1225697_15340788116_0_09082019.pdf,4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23711,ailab,ARE_1225697_15340788116_0_09082019-5,NextPage,no,ARE_1225697_15340788116_0_09082019-4,1225697/ARE_1225697_15340788116_0_09082019.pdf,5,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23712,ailab,ARE_1225697_15340788116_0_09082019-6,NextPage,no,ARE_1225697_15340788116_0_09082019-5,1225697/ARE_1225697_15340788116_0_09082019.pdf,6,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23713,ailab,ARE_1225697_15340788116_0_09082019-7,NextPage,no,ARE_1225697_15340788116_0_09082019-6,1225697/ARE_1225697_15340788116_0_09082019.pdf,7,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23714,ailab,ARE_1225697_15340788116_0_09082019-8,NextPage,no,ARE_1225697_15340788116_0_09082019-7,1225697/ARE_1225697_15340788116_0_09082019.pdf,8,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23715,ailab,ARE_1225697_15340788137_0_09082019-1,FirstPage,no,ARE_1225697_15340788116_0_09082019-8,1225697/ARE_1225697_15340788137_0_09082019.pdf,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
23716,ailab,ARE_1225697_15340788137_0_09082019-2,NextPage,no,ARE_1225697_15340788137_0_09082019-1,1225697/ARE_1225697_15340788137_0_09082019.pdf,2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697


In [92]:
df_test[df_test.lawsuit=='ARE_1225697']#['path'].value_counts

Unnamed: 0,binder,page_name,class,new_process,prev_page_name,path,page,img,lawsuit
5628,ailab,ARE_1225697_15340788151_0_09082019-1,FirstPage,no,ARE_1225628_15340784554_0_09082019-2,1225697/ARE_1225697_15340788151_0_09082019.pdf,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697
5629,ailab,ARE_1225697_15340788153_0_09082019-1,FirstPage,no,ARE_1225697_15340788151_0_09082019-1,1225697/ARE_1225697_15340788153_0_09082019.pdf,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQ...,ARE_1225697


### Distribution

In [52]:
len(df_train),len(df_train)/total_pages, len(df_val), len(df_val)/total_pages, len(df_test), len(df_test)/total_pages

(20181,
 0.6348422410267702,
 5513,
 0.17342476957438108,
 6095,
 0.19173298939884867)

In [55]:
df_train['class'].value_counts()

NextPage     17017
FirstPage     3164
Name: class, dtype: int64

In [56]:
df_val['class'].value_counts()

NextPage     4277
FirstPage    1236
Name: class, dtype: int64

In [57]:
df_test['class'].value_counts()

NextPage     5008
FirstPage    1087
Name: class, dtype: int64