## FARM: Use your own dataset
    
In Tutorial 1 you already learned about the major building blocks.
In this tutorial, you will see how to use FARM with your own dataset.

In [None]:
# Let's start by adjusting the working directory so that it is the root of the repository.
# Run this cell only once per kernel session: os.chdir is relative, so every
# additional run moves one more directory up.

import os

os.chdir('../')
print(f"Current working directory is {os.getcwd()}")

# 1) How a Processor works

### Architecture
The Processor converts a <b>raw input (e.g. a file) into a PyTorch dataset</b>.   
To use your own dataset, we need to adapt this Processor.

<img src="https://raw.githubusercontent.com/deepset-ai/FARM/master/docs/img/data_silo_no_bg.jpg" width="400" height="400" align="left"/>
<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>

​
### Main Conversion Stages 
1. Read from file / raw input 
2. Create samples
3. Featurize samples
4. Create PyTorch Dataset

### Functions to implement
1. `file_to_dicts()`
2. `_dict_to_samples()`
3. `_sample_to_features()`  

## Example: TextClassificationProcessor

In [None]:
from farm.data_handler.processor import *
from farm.data_handler.samples import Sample
from farm.modeling.tokenization import BertTokenizer
#from farm.modeling.tokenization import tokenize_with_metadata

import os

class TextClassificationProcessor(Processor):
    """
    Used to handle the text classification datasets that come in tabular format (CSV, TSV, etc.)

    The three conversion stages are implemented by:
      * file_to_dicts():       one file       -> list of dicts (via read_tsv)
      * _dict_to_samples():    one dict       -> list with a single Sample
      * _sample_to_features(): one Sample     -> features (via sample_to_features_text)
    """
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        label_list=None,
        metric=None,
        train_filename="train.tsv",
        dev_filename=None,
        test_filename="test.tsv",
        dev_split=0.1,
        delimiter="\t",
        quote_char="'",
        skiprows=None,
        label_column_name="label",
        multilabel=False,
        header=0,
        **kwargs,
    ):
        """
        :param tokenizer: forwarded to Processor.__init__; also used when tokenizing and featurizing.
        :param max_seq_len: forwarded to Processor.__init__; also used when tokenizing and featurizing.
        :param data_dir: forwarded to Processor.__init__.
        :param label_list: labels handed to add_task(). The "text_classification" task is only
                           registered when both `label_list` and `metric` are truthy.
        :param metric: metric handed to add_task() (see `label_list`).
        :param train_filename: forwarded to Processor.__init__.
        :param dev_filename: forwarded to Processor.__init__.
        :param test_filename: forwarded to Processor.__init__.
        :param dev_split: forwarded to Processor.__init__.
        :param delimiter: column separator passed to read_tsv().
        :param quote_char: quote character passed to read_tsv().
        :param skiprows: passed to read_tsv().
        :param label_column_name: name of the label column in the input file; handed to add_task().
        :param multilabel: if truthy the task type is "multilabel_classification",
                           otherwise "classification".
        :param header: passed to read_tsv().
        :param kwargs: accepted but not used by this class; a warning is logged when any are given.
        """
        # Extra keyword arguments are not forwarded anywhere. Surface them so that a
        # misspelt parameter (e.g. "metrics" instead of "metric") does not go unnoticed.
        if kwargs:
            logger.warning(
                f"Ignoring unexpected keyword argument(s) {sorted(kwargs)} in "
                f"{type(self).__name__}. Please check for misspelt parameter names."
            )

        # Custom processor attributes
        self.delimiter = delimiter
        self.quote_char = quote_char
        self.skiprows = skiprows
        self.header = header

        super().__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
        )

        if metric and label_list:
            task_type = "multilabel_classification" if multilabel else "classification"
            self.add_task(name="text_classification",
                          metric=metric,
                          label_list=label_list,
                          label_column_name=label_column_name,
                          task_type=task_type)
        else:
            # Without a task, file_to_dicts() has no label column to map.
            logger.info(
                "No task was added to the processor because `metric` and/or `label_list` "
                "is missing. Pass both arguments or call add_task() manually."
            )

    def file_to_dicts(self, file: str) -> [dict]:
        """Read `file` with read_tsv() and return one dict per row.

        Each task's label column is renamed from its "label_column_name" to its "label_name".
        """
        column_mapping = {task["label_column_name"]: task["label_name"] for task in self.tasks.values()}
        dicts = read_tsv(
            filename=file,
            delimiter=self.delimiter,
            skiprows=self.skiprows,
            quotechar=self.quote_char,
            rename_columns=column_mapping,
            header=self.header
            )

        return dicts

    def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:
        """Tokenize dict["text"] and wrap the result in a single-element list of Sample.

        The parameter name shadows the builtin `dict`; it is kept because callers pass it
        by keyword (`dict=...`).
        """
        # this tokenization also stores offsets
        tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len)
        return [Sample(id=None, clear_text=dict, tokenized=tokenized)]

    def _sample_to_features(self, sample) -> dict:
        """Convert one Sample into features by delegating to sample_to_features_text()."""
        return sample_to_features_text(
            sample=sample,
            tasks=self.tasks,
            max_seq_len=self.max_seq_len,
            tokenizer=self.tokenizer,
        )
      
      
# Helper
def read_tsv(filename, rename_columns, quotechar='"', delimiter="\t", skiprows=None, header=0):
    """Reads a tab separated value file. Tries to download the data if filename is not found

    :param filename: path of the file to read.
    :param rename_columns: dict mapping source column names in the file to the column
                           names the returned dicts should use.
    :param quotechar: quote character passed to pandas.read_csv.
    :param delimiter: column separator passed to pandas.read_csv as `sep`.
    :param skiprows: passed to pandas.read_csv.
    :param header: passed to pandas.read_csv.
    :return: list with one dict per row, containing "text" plus the renamed columns.
             All values are read as strings (dtype=str).
    :raises KeyError: if "text" or one of the source columns is missing from the file.
    """

    # get remote dataset if needed
    if not os.path.exists(filename):
        logger.info(f" Couldn't find {filename} locally. Trying to download ...")
        _download_extract_downstream_data(filename)

    # read file into df
    df = pd.read_csv(
        filename,
        sep=delimiter,
        encoding="utf-8",
        quotechar=quotechar,
        dtype=str,
        skiprows=skiprows,
        header=header
    )

    # let's rename our target columns to the default names FARM expects:
    # "text": contains the text
    # "text_classification_label": contains a label for text classification
    #
    # A single rename() (instead of copying each column and dropping the source)
    # keeps a column whose source and target name are identical and avoids
    # assigning into a slice of the original frame.
    columns = ["text"] + list(rename_columns.keys())
    df = df[columns].rename(columns=rename_columns)

    if "unused" in df.columns:
        df = df.drop(columns=["unused"])
    return df.to_dict(orient="records")

In [None]:
# The default format is:
# - tab separated
# - column "text"
# - column "label"

import pandas as pd

df = pd.DataFrame({"text": ["The concerts supercaliphractisch was great!", "I hate people ignoring climate change."],
                  "label": ["positive","negative"]
                  })
print(df)
# index=False: write only the "text" and "label" columns. Without it pandas adds the
# row index as an extra, unnamed first column, which is not part of the format above.
df.to_csv("train.tsv", sep="\t", index=False)

In [None]:
# Load the tokenizer that belongs to the pretrained "bert-base-uncased" model.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased")

# Create the processor. `metric` and `label_list` must both be given, otherwise
# TextClassificationProcessor does not register its "text_classification" task.
# label_column_name="label" matches the column name used in train.tsv.
processor = TextClassificationProcessor(data_dir = "", 
                                        tokenizer=tokenizer,
                                        max_seq_len=64,
                                        label_list=["positive","negative"],
                                        label_column_name="label",
                                        metric="acc",
                                       )

In [None]:
#  1. One file -> list of dictionaries with "raw data" (one dictionary per row)
dicts = processor.file_to_dicts(file="train.tsv")
print(dicts)

In [None]:
#  2. One dictionary -> list of Sample(s)
#     (Sample = "clear text" model input + meta information)
samples = processor._dict_to_samples(dict=dicts[0])

# show the individual attributes of the first sample, one after the other
for attribute in ("clear_text", "tokenized", "features"):
    print(getattr(samples[0], attribute))
print("----------------------------------\n\n\n")

# or in a nicer, formatted style: print the sample itself
print(samples[0])

In [None]:
# 3. One Sample -> Features
#    (Features = "vectorized" model input)
features = processor._sample_to_features(samples[0])
# the result is indexed here, so only its first entry is shown
print(features[0])

# 2) Hands-On: Adjust it to your dataset

## Task 1: Use an existing Processor

This works if you have:
- standard tasks
- common file formats 

**Example: Text classification on CSV with multiple columns**

Dataset: GermEval18 (Hatespeech detection)  
Format: TSV  
Columns: `text coarse_label fine_label`

In [None]:
# Download the dataset and show the first 10 lines of the training file
from farm.data_handler import utils
utils._download_extract_downstream_data("germeval18/train.tsv")
!head -n 10 germeval18/train.tsv

# TODO: Initialize a processor for the above file by passing the right arguments

# Solution: the file has two label columns (coarse_label, fine_label); here the
# coarse one is selected via label_column_name.
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="germeval18",
                                        train_filename="train.tsv",
                                        label_list=["OTHER","OFFENSE"],
                                        metric="acc",
                                        label_column_name="coarse_label"
                                        )

In [None]:
# test it: the first record must hold only the text and the renamed label column
expected_first_record = {
    'text': '@corinnamilborn Liebe Corinna, wir würden dich gerne als Moderatorin für uns gewinnen! Wärst du begeisterbar?',
    'text_classification_label': 'OTHER',
}

dicts = processor.file_to_dicts(file="germeval18/train.tsv")
print(dicts[0])
assert dicts[0] == expected_first_record

## Task 2: Build your own Processor
This works best for:
- custom input files
- special preprocessing steps
- advanced multitask learning 

**Example: Text classification with JSON as input file** 

Dataset: [100k Yelp reviews](https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/yelp_reviews_100k.json) ([full dataset](https://www.yelp.com/dataset/download), [documentation](https://www.yelp.com/dataset/documentation/main))

Format: 

``` 
{
...
    // integer, star rating
    "stars": 4,

    // string, the review itself
    "text": "Great place to hang out after work: the prices are decent, and the ambience is fun. It's a bit loud, but very lively. The staff is friendly, and the food is good. They have a good selection of drinks.",
...
}
```

In [None]:
# Download dataset
!wget https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/yelp_reviews_100k.json
!head -5 yelp_reviews_100k.json

In [None]:
import pandas as pd

# TODO: Create a new Processor class and overwrite the function that reads from the file
# The dicts created should look like this to comply with the default TextClassificationProcessor.
#{'text': 'Total bill for this horrible service? ...',
# 'text_classification_label': '4'}


class CustomTextClassificationProcessor(TextClassificationProcessor):
    """TextClassificationProcessor that reads its raw data from a JSON file with one record per line."""

    # we need to overwrite this function from the parent class
    def file_to_dicts(self, file: str) -> [dict]:
        """
        Read `file` and return one dict per record with the keys
        "text_classification_label" (the "stars" value as a string) and "text".
        """
        wanted_columns = ["text_classification_label", "text"]

        reviews = pd.read_json(file, lines=True)
        # expose the star rating under the label name used by the text classification task
        reviews["text_classification_label"] = reviews["stars"].astype(str)
        # keep only the two wanted columns and emit one dict per row
        return reviews[wanted_columns].to_dict(orient="records")

In [None]:
# Instantiate the custom processor. The star ratings are converted to strings in
# file_to_dicts(), hence the labels are given as strings here as well.
processor = CustomTextClassificationProcessor(tokenizer=tokenizer,
                                              max_seq_len=128,
                                              data_dir="",
                                              label_list=["1","2","3","4","5"],
                                              metric="acc",
                                              )

In [None]:
# test it: the first record must hold the stringified star rating and the review text
expected_first_record = {
    'text_classification_label': '1',
    'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.',
}

dicts = processor.file_to_dicts(file="yelp_reviews_100k.json")
print(dicts[0])

assert dicts[0] == expected_first_record