In [111]:
import pandas as pd
import os
import glob
import tqdm

In [1]:
# Reference note on the dataset layout. This is a bare string literal that is
# not bound to any name, so running the cell only echoes it back as the output.
"""The folder structure can be seen here - 
    README.md                            
    [task-name-folder]/                                # natural_language_inference, paraphrase_generation, question_answering, relation_extraction, topic_models
        ├── [article-counter-folder]/                  # ranges between 0 to 100 since we annotated varying numbers of articles per task
        │   ├── [articlename].pdf                      # scholarly article pdf
        │   ├── [articlename]-Grobid-out.txt           # plaintext output from the [Grobid parser](https://github.com/kermitt2/grobid)
        │   ├── [articlename]-Stanza-out.txt           # plaintext preprocessed output from [Stanza](https://github.com/stanfordnlp/stanza)
        │   ├── sentences.txt                          # annotated Contribution sentences in the file
        │   ├── entities.txt                           # annotated entities in the Contribution sentences
        │   └── info-units/                            # the folder containing information units in JSON format
        │   │   └── research-problem.json              # `research problem` mandatory information unit in json format
        │   │   └── model.json                         # `model` information unit in json format; in some articles it is called `approach`
        │   │   └── ...                                # there are 12 information units in all and each article may be annotated by 3 or 6
        │   └── triples/                               # the folder containing information unit triples one per line
        │   │   └── research-problem.txt               # `research problem` triples (one research problem statement per line)
        │   │   └── model.txt                          # `model` triples (one statement per line)
        │   │   └── ...                                # there are 12 information units in all and each article may be annotated by 3 or 6
        │   └── ...                                    # there are between 1 to 100 articles annotated for each task, so this repeats for the remaining annotated articles
        └── ...                                        # there are 24 tasks selected overall, so this repeats 23 more times"""

'The folder structure can be seen here - \n    README.md                            \n    [task-name-folder]/                                # natural_language_inference, paraphrase_generation, question_answering, relation_extraction, topic_models\n        ├── [article-counter-folder]/                  # ranges between 0 to 100 since we annotated varying numbers of articles per task\n        │   ├── [articlename].pdf                      # scholarly article pdf\n        │   ├── [articlename]-Grobid-out.txt           # plaintext output from the [Grobid parser](https://github.com/kermitt2/grobid)\n        │   ├── [articlename]-Stanza-out.txt           # plaintext preprocessed output from [Stanza](https://github.com/stanfordnlp/stanza)\n        │   ├── sentences.txt                          # annotated Contribution sentences in the file\n        │   ├── entities.txt                           # annotated entities in the Contribution sentences\n        │   └── info-units/                   

In [112]:
def _load_article(article_dir):
    """Build the sentence/label table for a single article folder.

    Parameters
    ----------
    article_dir : str
        Path of a folder expected to contain one ``*-Stanza-out.txt`` file and
        a ``sentences.txt`` file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the Stanza file. ``contents`` is the line with
        trailing whitespace removed; ``label`` is 1 for every line number
        listed in ``sentences.txt`` and 0 otherwise.

    Raises
    ------
    FileNotFoundError
        If either input file is missing (this includes ``article_dir`` being
        a plain file rather than a folder).
    ValueError
        If ``sentences.txt`` has a non-integer entry, lists a line number that
        does not exist in the Stanza file, or a file is not valid UTF-8.
    """
    # glob.escape keeps folder names containing [, ] or * from being read as
    # wildcard patterns; sorting makes the pick deterministic if several match.
    stanza_paths = sorted(glob.glob(glob.escape(article_dir) + "/*-Stanza-out.txt"))
    if not stanza_paths:
        raise FileNotFoundError(f"no *-Stanza-out.txt file in {article_dir}")
    path_for_sentences = stanza_paths[0]
    path_for_labels = article_dir + "/sentences.txt"

    # Context managers close both handles even when parsing fails.
    with open(path_for_sentences, "r", encoding="utf-8") as stanza_file:
        lines = [line.rstrip() for line in stanza_file]
    with open(path_for_labels, "r", encoding="utf-8") as labels_file:
        # Entries are treated as 1-based line numbers of the Stanza file.
        # Blank lines are ignored instead of invalidating the whole article.
        positive_rows = [int(line) - 1 for line in labels_file if line.strip()]

    out_of_range = [row + 1 for row in positive_rows if not 0 <= row < len(lines)]
    if out_of_range:
        raise ValueError(
            f"{path_for_labels} lists line numbers outside "
            f"{path_for_sentences}: {out_of_range}"
        )

    label_column = [0] * len(lines)
    for row in positive_rows:
        label_column[row] = 1
    return pd.DataFrame({'contents': lines, 'label': label_column})


# Task folders are the immediate sub-directories of the working directory:
# next() takes only the first os.walk entry, i.e. the top level. They are
# sorted because directory listing order differs between platforms, which
# would otherwise make the row order of the output irreproducible.
_, top_level_dirs, _ = next(os.walk('.'), ('.', [], []))
list_of_directories = sorted(top_level_dirs)

# Collect one frame per article and concatenate once at the end; calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
frames = []
for task_dir in list_of_directories:
    sub_folders = sorted(glob.glob("./" + glob.escape(task_dir) + "/*"))
    for article_dir in tqdm.tqdm(sub_folders):
        try:
            frame = _load_article(article_dir)
        except (OSError, ValueError) as err:
            # Only expected data problems are skipped, and the message names
            # the entry that failed. Anything else (KeyboardInterrupt,
            # programming errors) propagates instead of being swallowed.
            print(f"skipped {article_dir}: {err}")
            continue
        # An empty frame adds no rows and would only disturb column dtypes.
        if not frame.empty:
            frames.append(frame)

# Same shape as before: columns `contents` and `label` with a fresh 0..n-1
# index, or a completely empty DataFrame when nothing could be loaded.
results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

100%|██████████| 1/1 [00:00<00:00, 545.57it/s]
100%|██████████| 2/2 [00:00<00:00, 450.20it/s]
100%|██████████| 2/2 [00:00<00:00, 510.01it/s]
100%|██████████| 52/52 [00:00<00:00, 689.89it/s]
100%|██████████| 2/2 [00:00<00:00, 512.28it/s]
100%|██████████| 1/1 [00:00<00:00, 459.20it/s]
100%|██████████| 1/1 [00:00<00:00, 508.71it/s]
100%|██████████| 2/2 [00:00<00:00, 375.67it/s]
100%|██████████| 14/14 [00:00<00:00, 544.19it/s]
100%|██████████| 1/1 [00:00<00:00, 266.69it/s]
100%|██████████| 15/15 [00:00<00:00, 387.75it/s]
100%|██████████| 3/3 [00:00<00:00, 308.67it/s]
100%|██████████| 2/2 [00:00<00:00, 249.22it/s]
100%|██████████| 5/5 [00:00<00:00, 201.84it/s]
100%|██████████| 8/8 [00:00<00:00, 440.45it/s]
100%|██████████| 6/6 [00:00<00:00, 373.17it/s]
100%|██████████| 4/4 [00:00<00:00, 287.25it/s]
100%|██████████| 1/1 [00:00<00:00, 275.23it/s]
100%|██████████| 6/6 [00:00<00:00, 323.47it/s]
100%|██████████| 1/1 [00:00<00:00, 432.63it/s]
100%|██████████| 1/1 [00:00<00:00, 363.65it/s]
100%|██

In [114]:
# Write the combined sentence/label table to the working directory.
# The row index is written as the first column, as before.
output_csv = 'training.csv'
results.to_csv(output_csv)