In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

## Get your data

Clone the [fraud_detection](https://github.com/fursovia/fraud_detection) repository, then run its data preparation script:

```bash
cd fraud_detection
python data_prep.py -dd data/adversarial
```

In [2]:
# Read the prepared CSV into a DataFrame.
full_csv_path = '/Users/fursovia/Desktop/fraud/data/adversarial/full.csv'
data = pd.read_csv(full_csv_path)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,amount,age,sex,ins_type,speciality,id,treatments,types,target
0,330.97,53.570385,0,1,1,ID_1,a_178 a_1884 a_1 a_168 a_172 a_174,aa_10 aa_2 aa_2 aa_10 aa_10 aa_10,0
1,455.2,83.38272,1,1,1,ID_2,a_765 a_764 a_1257 a_777 a_802 a_797 a_1191 a...,aa_13 aa_13 aa_3 aa_13 aa_13 aa_13 aa_3 aa_13...,0
2,199.53,69.56751,1,1,1,ID_3,a_1978 a_710 a_1677 a_1701 a_1 a_585 a_375 a_...,aa_2 aa_12 aa_5 aa_5 aa_2 aa_3 aa_3 aa_12 aa_...,0
3,142.85,69.55633,1,1,1,ID_4,a_1257 a_545 a_1128 a_1 a_1191 a_2001 a_1978 ...,aa_3 aa_3 aa_3 aa_2 aa_3 aa_2 aa_2 aa_2,0
4,168.87,18.873434,0,1,1,ID_5,a_737 a_20 a_1257 a_1191 a_642 a_1 a_733 a_11...,aa_13 aa_8 aa_3 aa_3 aa_3 aa_2 aa_13 aa_3 aa_...,0


In [4]:
# Keep only the `treatments` and `target` columns. `.copy()` makes the selection
# an independent frame, so later column assignments on `data` do not raise
# pandas' SettingWithCopyWarning.
data = data[['treatments', 'target']].copy()

In [5]:
# Preview the first five rows of the reduced frame.
data.head(n=5)

Unnamed: 0,treatments,target
0,a_178 a_1884 a_1 a_168 a_172 a_174,0
1,a_765 a_764 a_1257 a_777 a_802 a_797 a_1191 a...,0
2,a_1978 a_710 a_1677 a_1701 a_1 a_585 a_375 a_...,0
3,a_1257 a_545 a_1128 a_1 a_1191 a_2001 a_1978 ...,0
4,a_737 a_20 a_1257 a_1191 a_642 a_1 a_733 a_11...,0


In [8]:
# Count whitespace-separated tokens in every row, then report the longest sequence.
tokens_per_row = data['treatments'].apply(lambda sequence: len(sequence.split()))
max_sequence_len = max(tokens_per_row)
print(f'Maxiumum sequence length = {max_sequence_len}')

Maxiumum sequence length = 168


In [12]:
# The texar model has a tough time handling long sequences, so keep only the
# first `max_allowed_length` tokens of every treatment sequence.

max_allowed_length = 20

# `assign` builds a new frame rather than writing a column into `data` in place,
# so no SettingWithCopyWarning is raised even when `data` is a selection of
# another frame, and re-running the cell gives the same result.
data = data.assign(
    treatments=data['treatments'].apply(
        lambda x: ' '.join(x.split()[:max_allowed_length])
    )
)

In [13]:
# Stratified 70/15/15 split: hold out 30% of the rows first, then cut that
# hold-out in half to obtain the dev and test sets. Both splits use the same seed.
train, intermediate_data = train_test_split(
    data,
    test_size=0.3,
    random_state=24,
    stratify=data['target'],
)
dev, test = train_test_split(
    intermediate_data,
    test_size=0.5,
    random_state=24,
    stratify=intermediate_data['target'],
)

In [14]:
# Show the (rows, columns) shape of each split in train, dev, test order.
tuple(split.shape for split in (train, dev, test))

((266051, 2), (57011, 2), (57012, 2))

In [15]:
data_path = '/Users/fursovia/Documents/texar/examples/text_style_transfer/data/insurance_cropped'

# makedirs also creates any missing parent directories, and exist_ok=True makes
# the cell safe to re-run without a separate (racy) existence check.
os.makedirs(data_path, exist_ok=True)

In [16]:
# Destination of the vocabulary inside the texar data folder.
new_vocab_path = os.path.join(data_path, 'vocab')
# Source vocabulary file that gets copied to the destination.
vocab_path = '/Users/fursovia/Desktop/fraud/data/adversarial/treatments.txt'

In [17]:
! cp {vocab_path} {new_vocab_path}

We also need to delete the first two tokens from the vocab.

In [18]:
# Drop the first two lines (tokens) of the copied vocab file.
# Done in Python because `sed -i -e` is not portable: BSD/macOS sed takes `-e`
# as the backup suffix and leaves a stray `vocab-e` file next to the vocab.
# Binary mode keeps the remaining bytes exactly as they were.
# NOTE: like the sed command, this removes two more lines every time it runs;
# re-run the copy cell first if this cell has to be executed again.
with open(new_vocab_path, 'rb') as vocab_file:
    vocab_lines = vocab_file.readlines()

with open(new_vocab_path, 'wb') as vocab_file:
    vocab_file.writelines(vocab_lines[2:])

In [19]:
# For every split, write one treatment sequence per line to
# `insurance.<split>.text` and the matching target per line to
# `insurance.<split>.labels`.
splits = (('train', train), ('dev', dev), ('test', test))
outputs = (('treatments', 'text'), ('target', 'labels'))

for name, df in splits:
    for column, suffix in outputs:
        out_path = os.path.join(data_path, f'insurance.{name}.{suffix}')
        with open(out_path, 'w') as file:
            file.writelines(f'{value}\n' for value in df[column].tolist())

In [20]:
# List the contents of the output folder to check which files are there.
! ls {data_path}

insurance.dev.labels   insurance.test.text    vocab
insurance.dev.text     insurance.train.labels vocab-e
insurance.test.labels  insurance.train.text


You can download these files from [here](https://yadi.sk/d/YfllAjLD3H1_Kw)

## Update texar config

We need to point the config at the new data folders and save the result as a new config file. You can use the code below to replace the relevant Python variables, or just open `config.py` and edit it yourself.

In [14]:
# Both config files live in the texar text_style_transfer example folder.
texar_example_dir = '/Users/fursovia/Documents/texar/examples/text_style_transfer'
initial_config = f'{texar_example_dir}/config.py'
new_config = f'{texar_example_dir}/insurance_config.py'

In [None]:
# Create the new config by applying literal text substitutions to the original.
# Done in Python because `sed -i 's/.../'` without a backup-suffix argument is
# rejected by BSD/macOS sed. The patterns contain no regex metacharacters, so
# plain `str.replace` applied in the same order gives the same result.
replacements = (
    ('yelp', 'insurance'),
    ('sentiment', 'insurance'),
    ('samples', 'insurance_samples'),
    ('checkpoints', 'insurance_checkpoints'),
)

with open(initial_config) as config_file:
    config_text = config_file.read()

for old, new in replacements:
    config_text = config_text.replace(old, new)

# The original config is left untouched; only `new_config` is written.
with open(new_config, 'w') as config_file:
    config_file.write(config_text)

In [None]:
# Print the generated config so the substitutions can be checked by eye.
! cat {new_config}