In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from pathlib import Path
import os
from data_io import get_book

In [2]:
def author_to_onehop(df):
    s = pd.Series(list(df['author']))
    one_hot = pd.get_dummies(s)
    df = pd.concat([df, one_hot], axis=1)
    return df

In [3]:
metadata_filename = 'metadata.csv'
counts_dirname = 'counts'
tokens_dirname = 'tokens'

metadata_df = pd.read_csv(metadata_filename)

filtered_df = metadata_df[(metadata_df.language == "['en']") & (metadata_df.type == 'Text')]

SELECTED_COLUMNS = ['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath']
filtered_df = filtered_df.dropna(subset=SELECTED_COLUMNS)
filtered_df = filtered_df[SELECTED_COLUMNS]
filtered_df = filtered_df.reset_index(drop=True)

author_count = filtered_df['author'].value_counts()
many_works_author = author_count[author_count >= 10]
filtered_df = filtered_df[filtered_df.author.isin(many_works_author.index.to_numpy())].reset_index()

'PG8700' in filtered_df.id

False

In [4]:
# filtered_df = filtered_df.sample(n=50, random_state=2).reset_index()

sampled_authors = filtered_df.author.sample(n=50, random_state=1)

train_ids = []
test_ids = []
val_ids = []

for author in sampled_authors:
    works = filtered_df[filtered_df.author == author].sample(n=3, random_state=1)
    train_id, test_id, val_id = works.id
    
    # Does not check if this file exists and is valid
    
    train_ids.append(train_id)
    test_ids.append(test_id)
    val_ids.append(val_id)

# one hot
filtered_df = author_to_onehop(filtered_df)

train_df = filtered_df[filtered_df.id.isin(train_ids)]
test_df = filtered_df[filtered_df.id.isin(test_ids)]
val_df = filtered_df[filtered_df.id.isin(val_ids)]

df_arrs = []
for df in [train_df, test_df, val_df]:

    docs = []
    docs_unavail_pg_ids = []
    for pg_id in df.id:    
        try:
            tokens = get_book(pg_id, os.path.join(tokens_dirname), level='tokens')
            # docs: a list of list
            docs.append(tokens)
        except:
            docs_unavail_pg_ids.append(pg_id)
    
    df = df[~df.id.isin(docs_unavail_pg_ids)].reset_index()
    df = df.drop(columns=['level_0', 'index', 'id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath'])
    
    # split to 512
    chunk_size = 512
    chunk_list = []
    for index, row in df.iterrows():
        doc = docs[index]
        for i in range(0, len(doc), chunk_size):
            sub_doc = pd.Series({"text": ' '.join(doc[i: i+chunk_size])})
            new_row = sub_doc.append(row)
            chunk_list.append(new_row)
    chunk_df = pd.DataFrame(chunk_list)
    #df.insert(0, 'text', docs, True)
    df_arrs.append(chunk_df)

train_df, test_df, val_df = df_arrs

99148
22448
72820
25937
4394
33421
29951
123704
6575
31966
18551
17488
8287
27148
11205
5344
10151
51785
3945
70344
169073
52845
86410
35825
24187
26724
4957
33515
60738
3961
23944
49593
6737
38945
7194
26094
95496
34732
67670
5148
171869
45768
71556
43027
57095
46991
49912
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
53248
53760
54272
54784
55296
55808
56320
56832
57344
57856
58368
58880
59392
59904
60416
60928
61440
61952
62464
62976
63488

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

97792
98304
98816
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
53248
53760
54272
54784
55296
55808
56320
56832
57344
57856
58368
58880
59392
59904
60416
60928
61440
61952
62464
62976
63488
64000
64512
6502

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

160768
161280
161792
162304
162816
163328
163840
164352
164864
165376
165888
166400
166912
167424
167936
168448
168960
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

56832
57344
57856
58368
58880
59392
59904
60416
60928
61440
61952
62464
62976
63488
64000
64512
65024
65536
66048
66560
67072
67584
68096
68608
69120
69632
70144
70656
71168
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
3072

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

66420
63999
21967
1556
129211
81380
185175
34460
27256
15897
8486
7310
45211
17457
12267
78052
76126
36524
34989
38244
48985
49792
65151
19363
72795
23652
5285
59355
5952
94565
100856
77391
66666
9139
7642
59107
70919
805
28079
129201
23197
121551
22762
51375
67452
95641
91548
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
53248
53760
54272
54784
55296
55808
56320
56832
57344
57856
58368
58880
59392
59904
60416
60928
61440
61952
62464
62976
63

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

113152
113664
114176
114688
115200
115712
116224
116736
117248
117760
118272
118784
119296
119808
120320
120832
121344
121856
122368
122880
123392
123904
124416
124928
125440
125952
126464
126976
127488
128000
128512
129024
129536
130048
130560
131072
131584
132096
132608
133120
133632
134144
134656
135168
135680
136192
136704
137216
137728
138240
138752
139264
139776
140288
140800
141312
141824
142336
142848
143360
143872
144384
144896
145408
145920
146432
146944
147456
147968
148480
148992
149504
150016
150528
151040
151552
152064
152576
153088
153600
154112
154624
155136
155648
156160
156672
157184
157696
158208
158720
159232
159744
160256
160768
161280
161792
162304
162816
163328
163840
164352
164864
165376
165888
166400
166912
167424
167936
168448
168960
169472
169984
170496
171008
171520
172032
172544
173056
173568
174080
174592
175104
175616
176128
176640
177152
177664
178176
178688
179200
179712
180224
180736
181248
181760
182272
182784
183296
183808
184320
184832
0
512
1024
15

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
53248
53760
54272
54784
55296
55808
56320
56832
57344
57856
58368
58880
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

90112
90624
91136
91648
92160
92672
93184
93696
94208
94720
95232
95744
96256
96768
97280
97792
98304
98816
99328
99840
100352
100864
101376
101888
102400
102912
103424
103936
104448
104960
105472
105984
106496
107008
107520
108032
108544
109056
109568
110080
110592
111104
111616
112128
112640
113152
113664
114176
114688
115200
115712
116224
116736
117248
117760
118272
118784
119296
119808
120320
120832
121344
121856
122368
122880
123392
123904
124416
124928
125440
125952
126464
126976
127488
128000
128512
129024
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
2

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

10396
105143
25362
95225
31379
10532
51210
53037
84501
59831
54875
23380
82194
11644
4691
21849
14066
224244
40770
8203
63463
14661
57572
67034
6928
95743
94071
86257
5501
11835
70370
4325
54294
55358
13654
84415
49119
78044
17692
60721
37806
49759
47900
89412
26301
70618
132813
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
53248
53760
54272


  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
53248
53760
54272
54784
55296
55808
56320
56832
57344
57856
58368
58880
59392
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
5171

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
53248
53760
54272
54784
55296
55808
56320
56832
57344
57856
58368
58880
59392
59904
60416
60928
61440
61952
62464
62976
63488
64000
64512
65024
65536
66048
66560
67072
67584
68096
68608
69120
69632
70144
70656
71168
71680
72192
72704
73216
73728
74240
74752
75264
75776
76288
76800
77312
77824
78336
78848
79360
79872
80384
80896
81408
81920
82432
82944
83456
83968
84480
84992
85504
86016
86528
87040
87552
88064
88576
89088
89600
90112
90624
91136
91648
92160
92672
93184
93696
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
1894

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

18944
19456
19968
20480
20992
21504
22016
22528
23040
23552
24064
24576
25088
25600
26112
26624
27136
27648
28160
28672
29184
29696
30208
30720
31232
31744
32256
32768
33280
33792
34304
34816
35328
35840
36352
36864
37376
37888
38400
38912
39424
39936
40448
40960
41472
41984
42496
43008
43520
44032
44544
45056
45568
46080
46592
47104
47616
48128
48640
49152
49664
50176
50688
51200
51712
52224
52736
53248
53760
54272
54784
55296
55808
56320
56832
57344
57856
58368
58880
59392
59904
60416
60928
61440
61952
62464
62976
63488
64000
64512
65024
65536
66048
66560
67072
67584
68096
68608
69120
69632
70144
70656
71168
71680
72192
72704
73216
73728
74240
74752
75264
75776
76288
76800
77312
77824
78336
78848
79360
79872
80384
80896
81408
81920
82432
82944
83456
83968
84480
84992
85504
86016
86528
87040
87552
88064
88576
89088
0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
1638

  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_row = sub_doc.append(row)
  new_ro

In [5]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)
val_df.to_csv('val.csv', index=False)