## FARM: Use your own dataset
    
In Tutorial 1 you already learned about the major building blocks.
In this tutorial, you will see how to use FARM with your own dataset.

In [1]:
# Fetch the FARM sources from GitHub
!git clone https://github.com/deepset-ai/FARM.git
import os
# Change into the cloned repository so that the relative paths used by the
# install commands below (requirements.txt and ".") resolve correctly
os.chdir("FARM")
# Install the dependencies listed by the repository ...
!pip install -r requirements.txt
# ... and the package in the current directory itself as an editable install
!pip install --editable .

Cloning into 'FARM'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 4472 (delta 26), reused 29 (delta 12), pack-reused 4418[K
Receiving objects: 100% (4472/4472), 65.08 MiB | 27.22 MiB/s, done.
Resolving deltas: 100% (3290/3290), done.
Collecting torch==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 21kB/s 
Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Collecting mlflow==1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/01/ec/8c9448968d4662e8354b9c3a62e635f8929ed507a45af3d9fdb84be51270/mlflow-1.0.0-py3-none-any.whl (47.7MB)
[K     |████████████████████████

In [2]:
# Let's start by adjusting the working directory so that it is the root of the repository.
# If we are already there (the setup cell above changes into the cloned repo), nothing
# happens; otherwise we step up one level. Checking for setup.py makes this cell safe to
# re-run: it can no longer walk out of the repository by accident.

import os

# setup.py is expected in the repository root (the editable install above runs from there)
if not os.path.exists("setup.py"):
    os.chdir('../')
print("Current working directory is {}".format(os.getcwd()))

Current working directory is /content/FARM


# 1) How a Processor works

### Architecture
The Processor converts a <b>raw input (e.g. a file) into a PyTorch dataset</b>.   
To use your own dataset, we need to adapt this Processor.

<img src="https://raw.githubusercontent.com/deepset-ai/FARM/master/docs/img/data_silo_no_bg.jpg" width="400" height="400" align="left"/>
<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>

​
### Main Conversion Stages 
1. Read from file / raw input 
2. Create samples
3. Featurize samples
4. Create PyTorch Dataset

### Functions to implement
1. file\_to_dicts()
2. \_dict_to_samples()
3. \_sample_to_features()  

## Example: TextClassificationProcessor

In [3]:
from farm.data_handler.processor import *
from farm.data_handler.samples import Sample
from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata

import os

class TextClassificationProcessor(Processor):
    """
    Processor for text classification datasets stored in tabular files (CSV, TSV, etc.).

    It implements the three conversion steps used in this tutorial:
    file -> dicts (``file_to_dicts``), dict -> samples (``_dict_to_samples``)
    and sample -> features (``_sample_to_features``).
    """
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        label_list=None,
        metric=None,
        train_filename="train.tsv",
        dev_filename=None,
        test_filename="test.tsv",
        dev_split=0.1,
        delimiter="\t",
        quote_char="'",
        skiprows=None,
        label_column_name="label",
        multilabel=False,
        header=0,
        proxies=None,
        max_samples=None,
        **kwargs
    ):
        """
        Store the file-parsing options, initialise the base Processor and, if both
        ``metric`` and ``label_list`` are given, register the default
        "text_classification" task.

        :param tokenizer: tokenizer forwarded to the base class and used for tokenization
        :param max_seq_len: maximum sequence length; longer token sequences are truncated
        :param data_dir: directory of the data files (forwarded to the base class)
        :param label_list: labels of the default task
        :param metric: metric of the default task
        :param train_filename: forwarded to the base class
        :param dev_filename: forwarded to the base class
        :param test_filename: forwarded to the base class
        :param dev_split: forwarded to the base class
        :param delimiter: column separator handed to ``read_tsv``
        :param quote_char: quote character handed to ``read_tsv``
        :param skiprows: handed to ``read_tsv``
        :param label_column_name: name of the file column that holds the labels of the default task
        :param multilabel: if truthy, the default task is a "multilabel_classification"
                           task, otherwise a "classification" task
        :param header: handed to ``read_tsv``
        :param proxies: forwarded to the base class
        :param max_samples: handed to ``read_tsv``
        :param kwargs: accepted but not used by this constructor
        """
        # TODO: a misspelt argument (e.g. `metrics`) ends up in kwargs and is ignored silently

        # Options that only this processor knows about; they are set before the
        # base class constructor runs, exactly as in the original ordering.
        self.delimiter, self.quote_char = delimiter, quote_char
        self.skiprows, self.header = skiprows, header
        self.max_samples = max_samples

        super().__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )

        # Without both a metric and a label list there is no default task to register.
        if not (metric and label_list):
            logger.info("Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
                        "using the default task or add a custom task later via processor.add_task()")
            return

        self.add_task(
            name="text_classification",
            metric=metric,
            label_list=label_list,
            label_column_name=label_column_name,
            task_type="multilabel_classification" if multilabel else "classification",
        )

    def file_to_dicts(self, file: str) -> [dict]:
        """
        Read ``file`` with ``read_tsv`` and return one dict per row.

        The label column of every registered task is renamed to that task's label name.
        """
        column_mapping = {}
        for task in self.tasks.values():
            column_mapping[task["label_column_name"]] = task["label_name"]

        return read_tsv(
            filename=file,
            rename_columns=column_mapping,
            delimiter=self.delimiter,
            quotechar=self.quote_char,
            skiprows=self.skiprows,
            header=self.header,
            proxies=self.proxies,
            max_samples=self.max_samples,
        )

    def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
        """
        Tokenize the "text" entry of ``dictionary`` and wrap the result in a single Sample.

        Returns an empty list (after logging a warning) when tokenization yields no tokens.
        """
        text = dictionary["text"]
        # besides the tokens, this tokenization also stores offsets and a start_of_word mask
        tokenized = tokenize_with_metadata(text, self.tokenizer)
        tokens = tokenized["tokens"]
        if len(tokens) == 0:
            logger.warning(f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}")
            return []

        # shorten every stored sequence (tokens, offsets, start_of_word) to what the model can handle
        for seq_name in list(tokenized):
            truncated, _, _ = truncate_sequences(
                seq_a=tokenized[seq_name],
                seq_b=None,
                tokenizer=self.tokenizer,
                max_seq_len=self.max_seq_len,
            )
            tokenized[seq_name] = truncated

        sample = Sample(id=None, clear_text=dictionary, tokenized=tokenized)
        return [sample]

    def _sample_to_features(self, sample) -> dict:
        """Convert ``sample`` into model features via ``sample_to_features_text``."""
        return sample_to_features_text(
            sample=sample,
            tasks=self.tasks,
            max_seq_len=self.max_seq_len,
            tokenizer=self.tokenizer,
        )
      
      
# Helper
def read_tsv(filename, rename_columns, quotechar='"', delimiter="\t", skiprows=None, header=0, proxies=None, max_samples=None):
    """
    Reads a delimiter separated value file into a list of dicts (one dict per row).
    Tries to download the data if filename is not found locally.

    Only the "text" column and the columns named in ``rename_columns`` are kept.
    All values are read as strings.

    :param filename: path of the file to read
    :param rename_columns: mapping {column name in the file: column name in the result},
                           e.g. {"label": "text_classification_label"}
    :param quotechar: quote character passed to ``pd.read_csv``
    :param delimiter: column separator passed to ``pd.read_csv``
    :param skiprows: passed to ``pd.read_csv``
    :param header: passed to ``pd.read_csv``
    :param proxies: accepted for interface compatibility; not used by this helper
    :param max_samples: if not None, only the first ``max_samples`` rows are returned
    :return: list of dicts with the key "text" plus the renamed label columns
    :raises KeyError: if "text" or one of the ``rename_columns`` keys is missing in the file
    """

    # get remote dataset if needed
    if not os.path.exists(filename):
        logger.info(f" Couldn't find {filename} locally. Trying to download ...")
        _download_extract_downstream_data(filename)

    # read file into df
    df = pd.read_csv(
        filename,
        sep=delimiter,
        encoding="utf-8",
        quotechar=quotechar,
        dtype=str,
        skiprows=skiprows,
        header=header
    )

    # let's rename our target columns to the default names FARM expects:
    # "text": contains the text
    # "text_classification_label": contains a label for text classification
    #
    # rename() is used instead of "copy the column, then drop the source": the latter
    # deletes the data whenever a column is mapped to its own name, and it assigns into
    # a slice of the original frame.
    columns = ["text"] + list(rename_columns.keys())
    df = df[columns].rename(columns=rename_columns)

    if "unused" in df.columns:
        df = df.drop(columns=["unused"])

    # honour the row limit (deterministically: the first rows of the file)
    if max_samples is not None:
        df = df.head(max_samples)

    return df.to_dict(orient="records")

05/05/2020 09:28:44 - INFO - transformers.file_utils -   PyTorch version 1.4.0 available.
05/05/2020 09:28:45 - INFO - transformers.file_utils -   TensorFlow version 2.2.0-rc3 available.


In [4]:
# The default format is:
# - tab separated
# - column "text"
# - column "label"

import pandas as pd

df = pd.DataFrame({
    "text": ["The concerts supercaliphractisch was great!", "I hate people ignoring climate change."],
    "label": ["positive", "negative"],
})
print(df)
# index=False keeps the file to exactly the two documented columns;
# without it pandas prepends an unnamed index column.
df.to_csv("train.tsv", sep="\t", index=False)

                                          text     label
0  The concerts supercaliphractisch was great!  positive
1       I hate people ignoring climate change.  negative


In [5]:
# Load the tokenizer that belongs to the pretrained "bert-base-uncased" model
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-uncased")

# Create a processor for the toy dataset.
# Because both `metric` and `label_list` are supplied, the constructor registers the
# default "text_classification" task, reading its labels from the "label" column.
processor = TextClassificationProcessor(data_dir = "", 
                                        tokenizer=tokenizer,
                                        max_seq_len=64,
                                        label_list=["positive","negative"],
                                        label_column_name="label",
                                        metric="acc",
                                       )

05/05/2020 09:28:53 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'
05/05/2020 09:28:53 - INFO - filelock -   Lock 140031920012984 acquired on /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
05/05/2020 09:28:53 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpi1hgpklj


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…

05/05/2020 09:28:53 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt in cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
05/05/2020 09:28:53 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
05/05/2020 09:28:53 - INFO - filelock -   Lock 140031920012984 released on /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
05/05/2020 09:28:53 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/torch/tr




In [6]:
#  1. One file -> one or more dictionaries with "raw data"
#     (one dictionary per row; the label column is renamed to the task's label name)
dicts = processor.file_to_dicts(file="train.tsv")
print(dicts)

[{'text': 'The concerts supercaliphractisch was great!', 'text_classification_label': 'positive'}, {'text': 'I hate people ignoring climate change.', 'text_classification_label': 'negative'}]


In [7]:
#  2. One Dictionary -> Sample(s)
#     (Sample = "clear text" model input + meta information)
#     _dict_to_samples returns a list, so we inspect its first element below.
samples = processor._dict_to_samples(dictionary=dicts[0])
# Print each attribute of the sample: the raw dictionary, the tokenization result
# and the features (not computed yet at this stage, so None is printed).
print(samples[0].clear_text)
print(samples[0].tokenized)
print(samples[0].features)
print("----------------------------------\n\n\n")
# or in a nicer, formatted style
print(samples[0])

{'text': 'The concerts supercaliphractisch was great!', 'text_classification_label': 'positive'}
{'tokens': ['the', 'concerts', 'super', '##cal', '##ip', '##hra', '##ct', '##isch', 'was', 'great', '!'], 'offsets': [0, 4, 13, 18, 21, 23, 26, 28, 33, 37, 42], 'start_of_word': [True, True, True, False, False, False, False, False, True, True, False]}
None
----------------------------------





      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: None
Clear Text: 
 	text: The concerts supercaliphractisch was great!
 	text_classification_label: positive
Tokenized: 
 	tokens: ['the', 'concerts'

In [8]:
# 3. One Sample -> Features
#    (Features = "vectorized" model input)
features = processor._sample_to_features(samples[0])
# Show the first entry of the returned features
print(features[0])

{'input_ids': [101, 1996, 6759, 3565, 9289, 11514, 13492, 6593, 19946, 2001, 2307, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'padding_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'segment_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'text_classification_label_ids': [0]}


# 2) Hands-On: Adjust it to your dataset

## Task 1: Use an existing Processor

This works if you have:
- standard tasks
- common file formats 

**Example: Text classification on CSV with multiple columns**

Dataset: GermEval18 (Hatespeech detection)  
Format: TSV  
Columns: `text coarse_label fine_label`

In [9]:
# Download dataset
from farm.data_handler import utils
utils._download_extract_downstream_data("germeval18/train.tsv")
# Peek at the first lines of the file to see its columns
!head -n 10 germeval18/train.tsv

# TODO: Initialize a processor for the above file by passing the right arguments

# The file has two label columns; here we use "coarse_label" (values OTHER / OFFENSE)
# as the label column of the default text classification task.
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="germeval18",
                                        train_filename="train.tsv",
                                        label_list=["OTHER","OFFENSE"],
                                        metric="acc",
                                        label_column_name="coarse_label"
                                        )

05/05/2020 09:29:08 - INFO - farm.data_handler.utils -   downloading and extracting file germeval18 to dir /content/FARM
100%|██████████| 525101/525101 [00:00<00:00, 1154163.28B/s]


text	coarse_label	fine_label
@corinnamilborn Liebe Corinna, wir würden dich gerne als Moderatorin für uns gewinnen! Wärst du begeisterbar?	OTHER	OTHER
@Martin28a Sie haben ja auch Recht. Unser Tweet war etwas missverständlich. Dass das BVerfG Sachleistungen nicht ausschließt, kritisieren wir.	OTHER	OTHER
@ahrens_theo fröhlicher gruß aus der schönsten stadt der welt theo ⚓️	OTHER	OTHER
@dushanwegner Amis hätten alles und jeden gewählt...nur Hillary wollten sie nicht und eine Fortsetzung von Obama-Politik erst recht nicht..!	OTHER	OTHER
@spdde kein verläßlicher Verhandlungspartner. Nachkarteln nach den Sondierzngsgesprächen - schickt diese Stümper #SPD in die Versenkung.	OFFENSE	INSULT
@Dirki_M Ja, aber wo widersprechen die Zahlen denn denen, die im von uns verlinkten Artikel stehen? In unserem Tweet geht es rein um subs. Geschützte. 2017 ist der gesamte Familiennachzug im Vergleich zu 2016 - die Zahlen, die Hr. Brandner bemüht - übrigens leicht rückläufig gewesen.	OTHER	OTHER
@milenahan

In [10]:
# test it
# The first record must contain the text and the label under the default
# key "text_classification_label"; all other columns must be gone.
dicts = processor.file_to_dicts(file="germeval18/train.tsv")
print(dicts[0])
assert dicts[0] == {'text': '@corinnamilborn Liebe Corinna, wir würden dich gerne als Moderatorin für uns gewinnen! Wärst du begeisterbar?', 'text_classification_label': 'OTHER'}

{'text': '@corinnamilborn Liebe Corinna, wir würden dich gerne als Moderatorin für uns gewinnen! Wärst du begeisterbar?', 'text_classification_label': 'OTHER'}


## Task 2: Build your own Processor
This works best for:
- custom input files
- special preprocessing steps
- advanced multitask learning 

**Example: Text classification with JSON as input file** 

Dataset: [100k Yelp reviews](https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/yelp_reviews_100k.json) ([full dataset](https://www.yelp.com/dataset/download), [documentation](https://www.yelp.com/dataset/documentation/main))

Format: 

``` 
{
...
    // integer, star rating
    "stars": 4,

    // string, the review itself
    "text": "Great place to hang out after work: the prices are decent, and the ambience is fun. It's a bit loud, but very lively. The staff is friendly, and the food is good. They have a good selection of drinks.",
...
}
```

In [11]:
# Download dataset
!wget https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/yelp_reviews_100k.json
# Show the first five lines of the file (each line holds one JSON record)
!head -5 yelp_reviews_100k.json

--2020-05-05 09:29:21--  https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/yelp_reviews_100k.json
Resolving s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)... 52.219.73.135
Connecting to s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)|52.219.73.135|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78383948 (75M) [application/json]
Saving to: ‘yelp_reviews_100k.json’


2020-05-05 09:29:24 (26.9 MB/s) - ‘yelp_reviews_100k.json’ saved [78383948/78383948]

{"review_id":"Q1sbwvVQXV2734tPgoKj4Q","user_id":"hG7b0MtEbXx5QzbzE6C_VA","business_id":"ujmEBvifdJM6h6RLv4wQIg","stars":1.0,"useful":6,"funny":1,"cool":0,"text":"Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.","date":"2013-05-07 04:34:36"}
{"review_id":"GJXCdrto3ASJOqKeVWPi6Q","user_id":"yXQM5uF2jS6es16SJzNHfg","busi

In [0]:
import pandas as pd

# TODO: Create a new Processor class and overwrite the function that reads from the file
# The dicts created should look like this to comply with the default TextClassificationProcessor:
# {'text': 'Total bill for this horrible service? ...',
#  'text_classification_label': '1'}


class CustomTextClassificationProcessor(TextClassificationProcessor):
    """TextClassificationProcessor variant that reads its records from a JSON-lines file."""

    # Only the file reading differs from the parent class, so this is the only method we override.
    def file_to_dicts(self, file: str) -> [dict]:
        """
        Read ``file`` (one JSON record per line) and return one dict per record.

        Each dict holds the review under "text" and the star rating, converted to a
        string, under "text_classification_label".
        """
        reviews = pd.read_json(file, lines=True)
        # expose the rating under the label name the default task expects
        reviews["text_classification_label"] = reviews["stars"].astype(str)
        # keep only the two columns that are needed and convert the rows to dicts
        return reviews[["text_classification_label", "text"]].to_dict(orient="records")

In [0]:
# Instantiate the custom processor. The star ratings are the class labels; they are
# given as strings because file_to_dicts converts the ratings to strings.
processor = CustomTextClassificationProcessor(tokenizer=tokenizer,
                                              max_seq_len=128,
                                              data_dir="",
                                              label_list=["1","2","3","4","5"],
                                              metric="acc",
                                              )

In [14]:
# test it
# The first record must consist of exactly the review text and its rating as a string label.

dicts = processor.file_to_dicts(file="yelp_reviews_100k.json")
print(dicts[0])

assert dicts[0] == {'text_classification_label': '1', 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'}

{'text_classification_label': '1', 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'}
