## Prepare Dataset

This notebook prepares the [10k German News Articles Dataset](https://tblock.github.io/10kGNAD/) (10kGNAD) for text classification experiments.

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import hashlib

In [3]:
from sklearn.model_selection import train_test_split

class TextDatasetWithLabels:
    """A data container that simplifies data preprocessing for NLP models.

    Holds up to three pandas DataFrames (the "train", "dev" and "test" splits)
    together with the names of the column containing the text and the column
    containing the label. Only the training split is mandatory.

    The duplicate handling in :meth:`clean` addresses rows by their index
    label, so each split is expected to have a unique index.
    """

    def __init__(self, train: pd.DataFrame, dev: pd.DataFrame = None, test: pd.DataFrame = None, text_column: str= None, label_column: str=None):
        """Create a dataset from already loaded DataFrames.

        Args:
            train: Training split (required).
            dev: Optional validation split.
            test: Optional test split.
            text_column: Name of the column that holds the text.
            label_column: Name of the column that holds the label.

        Raises:
            ValueError: If ``train`` is None.
        """
        if train is None:
            raise ValueError("need at least a training set!")
        # The split dict is created per instance. Keeping it on the class
        # would make every instance share (and overwrite) the same splits.
        self._datasets = {"train": train, "dev": dev, "test": test}
        self._text_column = text_column
        self._label_column = label_column
        # TODO check that text_column and label_column are present in datasets

    @classmethod
    def read_csv(cls, train, dev=None, test=None, sep=",", quotechar='"', columns: list=None, text_column: str="text", label_column: str="labels"):
        """Read data from csv files.

        Args:
            train: Source of the training split (anything ``pd.read_csv`` accepts).
            dev: Optional source of the validation split.
            test: Optional source of the test split.
            sep: Field separator passed to ``pd.read_csv``.
            quotechar: Quote character passed to ``pd.read_csv``.
            columns: Column names passed to ``pd.read_csv`` as ``names``.
            text_column: Name of the column that holds the text.
            label_column: Name of the column that holds the label.

        Returns:
            A new instance of the class this method was called on.
        """
        def load(source):
            # Splits that were not given stay None.
            if source is None:
                return None
            return pd.read_csv(source, sep=sep, quotechar=quotechar, names=columns)

        # Use cls (not the hard-coded class name) so subclasses get their own type.
        return cls(load(train), load(dev), load(test), text_column, label_column)

    def _output_columns(self, label_first):
        """Return the text and label column names in the requested output order."""
        if label_first:
            return [self._label_column, self._text_column]
        return [self._text_column, self._label_column]

    def to_csv(self, train_file, dev_file=None, test_file=None, label_first=True, header=True):
        """Write each split to its own csv file (text and label columns only).

        A split is written only if a target is given for it; the corresponding
        dataset must exist in that case.

        Args:
            train_file: Target for the training split, or None to skip it.
            dev_file: Target for the validation split, or None to skip it.
            test_file: Target for the test split, or None to skip it.
            label_first: If True the label column precedes the text column.
            header: Passed to ``DataFrame.to_csv`` (write a header row or not).
        """
        columns = self._output_columns(label_first)
        targets = (("train", train_file), ("dev", dev_file), ("test", test_file))
        for name, target in targets:
            if target is not None:
                self._datasets[name][columns].to_csv(target, header=header, index=False)

    def to_single_csv(self, file, ml_use=("training", "validation", "test"), mluse_first=True, label_first=True, header=True):
        """Concatenate datasets in single file and mark them properly in extra column 'ml_use'

        Args:
            file: Target passed to ``DataFrame.to_csv``.
            ml_use: Three markers used for the train, dev and test rows (in
                that order).
            mluse_first: If True the 'ml_use' column is the first column,
                otherwise the last one.
            label_first: If True the label column precedes the text column.
            header: Passed to ``DataFrame.to_csv`` (write a header row or not).
        """
        columns = self._output_columns(label_first)
        if mluse_first:
            columns = ["ml_use"] + columns
        else:
            columns.append("ml_use")

        # The training split always exists; dev and test are added when present.
        sets = [self._datasets["train"].assign(ml_use=ml_use[0])]
        if self._datasets["dev"] is not None:
            sets.append(self._datasets["dev"].assign(ml_use=ml_use[1]))
        if self._datasets["test"] is not None:
            sets.append(self._datasets["test"].assign(ml_use=ml_use[2]))
        pd.concat(sets).reset_index(drop=True)[columns].to_csv(file, header=header, index=False)

    def info(self):
        """Prints basic information about the data.

        One line per split with its row count and its share of all rows.
        Splits that are missing or empty are shown as 'n/a'.
        """
        counts = []
        for name in ["train", "dev", "test"]:
            df = self._datasets[name]
            counts.append((name, 0 if df is None else df.shape[0]))
        total = sum(count for _, count in counts)
        for name, count in counts:
            percent = '' if count == 0 else f"({count/total:.1%})"
            count = 'n/a' if count == 0 else f"{count:,}"
            print(f"{name:<5}: {count:>8} {percent:>5}")

    def clean(self, keep_testdata=True):
        """Find texts that occur more than once and remove the surplus rows.

        Splits are scanned in the order train, dev, test. Texts are compared
        via the MD5 digest of their UTF-8 encoding. For every repeated text:

        * different labels: both rows are removed, and so is every further
          occurrence of that text, since its label is ambiguous;
        * same label, the later row is in the test split, the earlier one is
          not, and ``keep_testdata`` is True: the earlier row is removed;
        * otherwise: the later row is removed.

        Findings are printed; the affected splits are replaced by copies
        without the removed rows.

        Args:
            keep_testdata: Prefer the test row when the same text with the
                same label also occurs in another split.
        """
        text_col = self._text_column
        label_col = self._label_column

        seen = {}            # digest -> (split name, index label, label) of the kept row
        conflicting = set()  # digests of texts already seen with contradicting labels
        remove = []          # (split name, index label) of rows to drop

        for name in ["train", "dev", "test"]:
            df = self._datasets[name]
            if df is None:
                continue
            for idx, text, label in zip(df.index, df[text_col], df[label_col]):
                digest = hashlib.md5(text.encode('utf-8')).hexdigest()

                if digest in conflicting:
                    # Without this check a third copy of a text whose first two
                    # copies were removed for conflicting labels would survive.
                    print(f"found duplicate of text with conflicting labels - ({name}, {idx}, {label})")
                    print("text was already removed because of different labels - removing")
                    remove.append((name, idx))
                    continue

                if digest not in seen:
                    seen[digest] = (name, idx, label)
                    continue

                prev_name, prev_idx, prev_label = seen[digest]
                print(f"found duplicate ({prev_name}, {prev_idx}, {prev_label}) - ({name}, {idx}, {label})")
                if prev_label != label:
                    print("same text but different labels - removing both")
                    remove.append((prev_name, prev_idx))
                    remove.append((name, idx))
                    del seen[digest]
                    conflicting.add(digest)
                elif prev_name != name and name == "test" and keep_testdata:
                    print("duplicates in different datasets - keeping test data, removing other")
                    remove.append((prev_name, prev_idx))
                    seen[digest] = (name, idx, label)
                else:
                    print("duplicate entries - removing later one")
                    remove.append((name, idx))

        print("removing ", remove)
        # Group the index labels per split so each split is rebuilt only once.
        ids_by_split = {}
        for name, idx in remove:
            ids_by_split.setdefault(name, []).append(idx)
        for name, ids in ids_by_split.items():
            self._datasets[name] = self._datasets[name].drop(ids)

    def create_dev_set(self, random_state: int = 42, ratio="auto"):
        """Split a validation ("dev") set off the training set.

        The split is stratified by the label column. If a dev set already
        exists a message is printed and nothing is changed.

        Args:
            random_state: Seed passed to ``train_test_split``.
            ratio: Passed to ``train_test_split`` as ``test_size``. With
                "auto" the dev set gets the same size as the test set, or
                10% of the training set if there is no test set.
        """
        if self._datasets["dev"] is not None:
            print("dev set already exists!")
            return

        label_col = self._label_column
        train = self._datasets["train"]
        X = train.drop(label_col, axis=1)
        y = train[label_col]

        if ratio == "auto":
            if self._datasets["test"] is not None:
                # make same size as test dataset
                ratio = self._datasets["test"].shape[0] / X.shape[0]
            else:
                ratio = 0.1

        X_train, X_dev, y_train, y_dev = train_test_split(X, y, stratify=y, test_size=ratio, random_state=random_state)

        # Re-attach the labels and restore the original column order.
        column_order = train.columns
        self._datasets["train"] = pd.concat([X_train, y_train], axis=1)[column_order]
        self._datasets["dev"] = pd.concat([X_dev, y_dev], axis=1)[column_order]

## Create Dataset

In [4]:
# Load the 10kGNAD train and test splits directly from GitHub. The files are
# read with ';' as separator and "'" as quote character; the column names are
# supplied explicitly (label first, then text). No dev split is loaded here.
text_ds = TextDatasetWithLabels.read_csv(train="https://github.com/tblock/10kGNAD/blob/master/train.csv?raw=true",
                                         test="https://github.com/tblock/10kGNAD/blob/master/test.csv?raw=true",
                                         sep=";", quotechar="'",
                                         columns=["labels", "text"]
                                        )
# Print the number of rows per split.
text_ds.info()

train:    9,245 (90.0%)
dev  :      n/a      
test :    1,028 (10.0%)


In [5]:
# Remove duplicate texts within and across the splits (test rows are preferred
# over identical rows in other splits, since keep_testdata defaults to True).
text_ds.clean()

found duplicate (train, 1187, Panorama) - (train, 6411, International)
same text but different labels - removing both
found duplicate (train, 2785, Inland) - (test, 728, Inland)
duplicates in different datasets - keeping test data, removing other
removing  [('train', 1187), ('train', 6411), ('train', 2785)]


In [6]:
# Print the split sizes again to see the effect of the duplicate removal.
text_ds.info()

train:    9,242 (90.0%)
dev  :      n/a      
test :    1,028 (10.0%)


## Save Training and Test Data for AWS Comprehend

* AWS Comprehend does not support a validation set, so only the training and test sets are exported here (before a dev set is split off).

In [7]:
# Save the train and test set for AWS Comprehend: label column first, then the
# text column, without a header row. No dev file is written.
text_ds.to_csv(train_file="data/10kgnad_aws_comprehend_train.csv",
               test_file="data/10kgnad_aws_comprehend_test.csv",
               label_first=True, header=False)

## Create a Dev Set (Validation Set)

In [8]:
# Split a stratified dev set off the training set. With the default
# ratio="auto" and a test set present, the dev set is sized like the test set.
text_ds.create_dev_set()
# Print the split sizes after the dev set was created.
text_ds.info()

train:    8,214 (80.0%)
dev  :    1,028 (10.0%)
test :    1,028 (10.0%)


## Save Training, Dev and Test Set

In [9]:
# Write each split to its own csv file: text column first, then the label
# column, with a header row.
text_ds.to_csv(train_file="data/10kgnad_train.csv",
               dev_file="data/10kgnad_valid.csv",
               test_file="data/10kgnad_test.csv",
               label_first=False, header=True)

## Save combined Training, Dev and Test Set for GCP Auto ML

In [10]:
# Write all splits into one csv file without a header row. Each row starts with
# the 'ml_use' marker (default markers: training / validation / test), followed
# by the text and then the label.
text_ds.to_single_csv("data/10kgnad_gcp_auto_ml.csv", mluse_first=True, label_first=False, header=False)