# ABOUT:
- we create a question answering dataset 
- given the address "toko dita, kertosono", the dataset should train a question answering model to predict the answer span "toko dita"

In [1]:
import pandas as pd
# Load the training CSV (indexed by `id`), then split the combined "POI/street"
# label on "/" into separate `POI` and `street` columns.
df = pd.read_csv(
    r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\datasets\train.csv\train.csv",
    index_col='id',
)
df[['POI', 'street']] = df['POI/street'].str.split('/', expand=True)
df = df.drop(columns=["POI/street"])
df

Unnamed: 0_level_0,raw_address,POI,street
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,jl kapuk timur delta sili iii lippo cika 11 a ...,,jl kapuk timur delta sili iii lippo cika
1,"aye, jati sampurna",,
2,setu siung 119 rt 5 1 13880 cipayung,,siung
3,"toko dita, kertosono",toko dita,
4,jl. orde baru,,jl. orde baru
...,...,...,...
299995,jend ahmad yani 331 kertasari ciamis,,jend ahmad yani
299996,"raya cila kko, cilandak timur kel.",,raya cila kko
299997,tanjung gusta jl. yaya 2 no 17,,
299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri,


In [2]:
# Token budget for each tokenized (question + context) feature: passed to the
# tokenizer as `max_length` in prepare_train_features, which pads to and
# truncates at this length.
max_length = 50

## SQUAD format
- **answer_start** refers to the character level position of the answer to the question 
    - take note that contexts can have multiple answers in other datasets
    - we can assume our shopee dataset only has 1 answer_start in the context
- **context** refers to the raw_address in our case
- **question** - we will have two questions:
    - what's the point of interest? == apa gunanya minat?
    - What is the name of the street? == Siapa nama jalannya?
- **text** refers to the answer to the question

In [3]:
# A single SQuAD-style record, used as the reference for the format built below:
# `answers` holds parallel lists of character offsets (`answer_start`) and answer strings (`text`).
squad_sample = {'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f41900661182',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'title': 'University_of_Notre_Dame'}

In [4]:
# Locate the answer text inside the context; the result should equal the stored answer_start.
squad_sample['context'].find(squad_sample['answers']['text'][0])

515

- (above) we can see that answer_start is indeed the character-level position of the answer within the context

## Create POI dataset

In [5]:
# Question string attached to every POI example.
# NOTE(review): this looks like a literal machine translation of "what's the point of
# interest?" (it reads closer to "what is the use of interest?") — confirm the wording
# with a native Indonesian speaker.
poiQuestion = 'apa gunanya minat?'

In [6]:
def create_temp_df(dataframe, target_colname):
    """Keep only the rows whose label can be located verbatim in the address.

    A row is kept when the value in ``target_colname`` is a non-empty string
    that occurs as a substring of that row's ``raw_address``. Rows with an
    empty label, a missing (non-string) label or address, or a label that does
    not appear verbatim in the address are dropped, because no answer span can
    be derived from them.

    Args:
        dataframe: Frame with a ``raw_address`` column and a ``target_colname`` column.
        target_colname: Name of the label column to check (e.g. "POI" or "street").

    Returns:
        The matching rows of ``dataframe`` with the original index and columns.
    """
    keep = pd.Series(
        [
            # The isinstance guards matter: a NaN/None label or address would
            # make the substring test raise TypeError.
            isinstance(target, str) and isinstance(address, str)
            and target != "" and target in address
            for target, address in zip(dataframe[target_colname], dataframe['raw_address'])
        ],
        index=dataframe.index,
        # Explicit dtype keeps the mask boolean even when the frame is empty.
        dtype=bool,
    )
    return dataframe[keep]

In [7]:
# Rows of df that pass create_temp_df's filter for the POI column.
temp_df = create_temp_df(df,"POI")
temp_df.head()

Unnamed: 0_level_0,raw_address,POI,street
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,"toko dita, kertosono",toko dita,
5,"raya samb gede, 299 toko bb kids",toko bb kids,raya samb gede
15,"kampung.gudang areng,desa:anyer, kecamatan:any...",gudang areng,
17,pangkalan lareh kel.ikurkoto koto panjang.kec ...,pangkalan lareh,
21,"stadion kobelete,",stadion kobelete,


- (above) we don't want answers that are empty
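- we also don't want rows whose answer does not appear verbatim in raw_address (e.g. because the address is abbreviated), since no answer span can be located for them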

In [8]:
def get_answers_dict(row, target_colname='POI'):
    """Build the SQuAD-style ``answers`` entry for one row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a ``raw_address``
            entry and a ``target_colname`` entry, both strings.
        target_colname: Label column holding the answer text. Defaults to
            ``'POI'`` so existing single-argument calls keep working.

    Returns:
        ``{'answer_start': [i], 'text': [answer]}`` where ``i`` is the character
        offset of the first occurrence of the answer in ``raw_address``
        (``-1`` if it does not occur, as returned by ``str.find``).
    """
    answer_text = row[target_colname]
    answer_start = row['raw_address'].find(answer_text)
    return {'answer_start': [answer_start], 'text': [answer_text]}

In [9]:
# One SQuAD-style answers dict per row of temp_df.
answers_column = temp_df.apply(get_answers_dict, axis=1)

In [10]:
# Assemble the POI question-answering examples: answers / context / question.
# The scalar question string is broadcast to every row.
POI_df = pd.DataFrame(
    {
        "answers": answers_column,
        "context": temp_df["raw_address"],
        "question": poiQuestion,
    }
)
POI_df.sample(10)

Unnamed: 0_level_0,answers,context,question
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
223077,"{'answer_start': [6], 'text': ['selep wangi']}","kali, selep wangi, kedungwungu",apa gunanya minat?
289004,"{'answer_start': [0], 'text': ['pom mini']}","pom mini gabus raya, bekasi selatan",apa gunanya minat?
33555,"{'answer_start': [13], 'text': ['batam center']}","teluk tering batam center, ahm ya batam kota",apa gunanya minat?
225899,"{'answer_start': [0], 'text': ['jaya service']}","jaya service,",apa gunanya minat?
214827,"{'answer_start': [0], 'text': ['toko gopar']}",toko gopar baros,apa gunanya minat?
268175,"{'answer_start': [0], 'text': ['kfc']}","kfc, cile raya, a 97",apa gunanya minat?
245089,"{'answer_start': [0], 'text': ['sunset ave ap3']}","sunset ave ap3, jl. grand wisata no.32, lamban...",apa gunanya minat?
211222,"{'answer_start': [17], 'text': ['toko apollo']}","purwokerto kulon toko apollo, may d i panja, n...",apa gunanya minat?
261126,"{'answer_start': [14], 'text': ['balines cellu...","ling sela, 67 balines cellular,",apa gunanya minat?
204918,"{'answer_start': [0], 'text': ['toko cat duko']}","toko cat duko, pasundan,",apa gunanya minat?


## Create street name dataset

In [11]:
# Question string attached to every street-name example.
streetQuestion = 'Siapa nama jalannya?'

In [12]:
# Rows of df that pass create_temp_df's filter for the street column.
# Note: this rebinds temp_df, replacing the POI subset built earlier.
temp_df = create_temp_df(df,"street")
temp_df.head()

Unnamed: 0_level_0,raw_address,POI,street
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,jl kapuk timur delta sili iii lippo cika 11 a ...,,jl kapuk timur delta sili iii lippo cika
2,setu siung 119 rt 5 1 13880 cipayung,,siung
4,jl. orde baru,,jl. orde baru
5,"raya samb gede, 299 toko bb kids",toko bb kids,raya samb gede
6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra...",,kem mel raya


- (above) we don't want answers that are empty
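- we also don't want rows whose answer does not appear verbatim in raw_address (e.g. because the address is abbreviated), since no answer span can be located for them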

In [13]:
def get_answers_dict(row, target_colname='street'):
    """Build the SQuAD-style ``answers`` entry for one row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a ``raw_address``
            entry and a ``target_colname`` entry, both strings.
        target_colname: Label column holding the answer text. Defaults to
            ``'street'`` so existing single-argument calls keep working.

    Returns:
        ``{'answer_start': [i], 'text': [answer]}`` where ``i`` is the character
        offset of the first occurrence of the answer in ``raw_address``
        (``-1`` if it does not occur, as returned by ``str.find``).
    """
    answer_text = row[target_colname]
    answer_start = row['raw_address'].find(answer_text)
    return {'answer_start': [answer_start], 'text': [answer_text]}

In [14]:
# One SQuAD-style answers dict per row of temp_df.
answers_column = temp_df.apply(get_answers_dict, axis=1)

In [15]:
# Assemble the street-name question-answering examples: answers / context / question.
# The scalar question string is broadcast to every row.
street_df = pd.DataFrame(
    {
        "answers": answers_column,
        "context": temp_df["raw_address"],
        "question": streetQuestion,
    }
)
street_df.sample(10)

Unnamed: 0_level_0,answers,context,question
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
232209,"{'answer_start': [9], 'text': ['pahl ser']}","jm cell, pahl ser,",Siapa nama jalannya?
277310,"{'answer_start': [5], 'text': ['belim ivc']}",wage belim ivc 130 61257 taman,Siapa nama jalannya?
73261,"{'answer_start': [10], 'text': ['kup jaya']}",simomulyo kup jaya 92 3 60189 sukomanunggal,Siapa nama jalannya?
150045,"{'answer_start': [10], 'text': ['pejag']}","gun pasi, pejag, mangga dua selatan",Siapa nama jalannya?
137273,"{'answer_start': [21], 'text': ['perum bumi ra...","muti bani sarfan cv, perum bumi rak asri cilegon",Siapa nama jalannya?
262798,"{'answer_start': [0], 'text': ['mur']}",mur,Siapa nama jalannya?
210726,"{'answer_start': [0], 'text': ['jl. lin timur']}",jl. lin timur 36554 sekernan,Siapa nama jalannya?
103017,"{'answer_start': [0], 'text': ['yos suda']}",yos suda sumbawa,Siapa nama jalannya?
225190,"{'answer_start': [0], 'text': ['(de sd haur gg...",(de sd haur gg. ii lebakgede coblong,Siapa nama jalannya?
286347,"{'answer_start': [10], 'text': ['jln ekad putr...",neglasari jln ekad putra banjar,Siapa nama jalannya?


In [16]:
# Number of street examples vs. number of POI examples.
len(street_df),len(POI_df)

(212470, 75351)

## final shopee dataset in SQUAD format
- the dataset only has two types of questions:
    1. apa gunanya minat?
    2. Siapa nama jalannya? 
       - these questions are meant to ask "what is the point of interest?" and "what is the name of the street?" respectively

In [17]:
# Combine both question types, shuffle the rows, and renumber the index from 0.
# A fixed random_state makes the shuffle reproducible across kernel restarts.
output_df = pd.concat([street_df,POI_df]).sample(frac = 1, random_state = 42).reset_index(drop=True)
output_df.head(10)

Unnamed: 0,answers,context,question
0,"{'answer_start': [0], 'text': ['kp. tanah ungk...",kp. tanah ungkuk rt.001 002 desa. sarimukti,apa gunanya minat?
1,"{'answer_start': [0], 'text': ['aksa']}",aksa 57 sambongjaya,Siapa nama jalannya?
2,"{'answer_start': [0], 'text': ['gg. 2']}",gg. 2 no 64 surodinawan prajurit kulon,Siapa nama jalannya?
3,"{'answer_start': [0], 'text': ['damai putra de...","damai putra development, bamb sug mertoyudan",apa gunanya minat?
4,"{'answer_start': [22], 'text': ['jalan wiroto ...","griya kos noor laily, jalan wiroto dalam ii, k...",Siapa nama jalannya?
5,"{'answer_start': [9], 'text': ['jl. kali']}",pakunden jl. kali 5 sukorejo,Siapa nama jalannya?
6,"{'answer_start': [0], 'text': ['vete']}",vete magelang magelang tengah,Siapa nama jalannya?
7,"{'answer_start': [0], 'text': ['dewa anom']}",dewa anom 126 rendang rendang,Siapa nama jalannya?
8,"{'answer_start': [0], 'text': ['kampung jogoy']}",kampung jogoy 555 55233 jetis,Siapa nama jalannya?
9,"{'answer_start': [0], 'text': ['bant baru vii']}",bant baru vii,Siapa nama jalannya?


## convert to huggingface Dataset

In [18]:
from datasets import Dataset
# Wrap the shuffled pandas frame in a Hugging Face Dataset.
shopee_QA_Dataset = Dataset.from_pandas(output_df)

In [19]:
from datasets import DatasetDict
# Hold out 5% of the examples as the test split; the fixed seed keeps the split reproducible.
# NOTE: this rebinds shopee_QA_Dataset from the Dataset built in the previous cell to the
# train/test result, so re-run the previous cell before re-running this one.
shopee_QA_Dataset = shopee_QA_Dataset.train_test_split(test_size = 0.05, seed = 42)
shopee_QA_Dataset

DatasetDict({
    train: Dataset({
        features: ['answers', 'context', 'question'],
        num_rows: 273429
    })
    test: Dataset({
        features: ['answers', 'context', 'question'],
        num_rows: 14392
    })
})

## Tokenize

In [21]:
# Checkpoint identifier passed to AutoTokenizer.from_pretrained below.
model_checkpoint = "cahya/bert-base-indonesian-tydiqa"

In [22]:
# Instantiate the tokenizer that matches the chosen checkpoint.
# Note that different models require different tokenizers.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [23]:
# Check that the tokenizer we instantiated is a fast tokenizer: prepare_train_features
# relies on features such as offset mappings and sequence_ids().
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [24]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and attach answer start/end token labels.

    Reads the module-level ``tokenizer`` and ``max_length``.

    Args:
        examples: Batch with "question", "context" and "answers" entries. Each
            answers item holds "answer_start" (character offsets) and "text"
            lists; only the first answer of each item is used.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed, plus "start_positions" and "end_positions"
        lists (one entry per feature). A feature with no answer, or whose
        context window does not fully contain the answer, is labeled with the
        index of the CLS token for both positions.
    """
    # Tokenize our examples with truncation and padding, keeping the overflowing tokens. This results
    # in one example possibly giving several features when a context is long.
    # NOTE(review): `stride` is commented out below, so the tokenizer's default applies —
    # presumably consecutive features of one example do not overlap; confirm.
    pad_on_right = tokenizer.padding_side == "right"
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        # stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",                                                                                    
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text (end is exclusive).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text: first token that belongs to the context.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text: last token that belongs to the context
            # (scanning backwards skips padding and special tokens).
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                # The start loop overshoots by one token, hence the `- 1` when recording the label.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                # Likewise the end loop steps one token too far left, hence the `+ 1`.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [25]:
# Tokenize both splits in batches and drop the original columns, leaving only the model inputs and labels.
# Overflowing contexts would yield extra features (more rows than examples); in the output shown
# below the row counts equal the split sizes, so no example overflowed in that run.
# The `datasets` library caches the result of map, so subsequent runs reuse the cached data.
tokenized_datasets = shopee_QA_Dataset.map(prepare_train_features, batched=True, remove_columns=shopee_QA_Dataset["train"].column_names)
tokenized_datasets

HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions', 'token_type_ids'],
        num_rows: 273429
    })
    test: Dataset({
        features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions', 'token_type_ids'],
        num_rows: 14392
    })
})

In [26]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples = 10):
    """Render ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Columns typed as ClassLabel (or as a Sequence of ClassLabel) are displayed
    by label name instead of integer id.

    Raises:
        AssertionError: if ``num_examples`` exceeds ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    chosen = []
    while len(chosen) < num_examples:
        candidate = random.randint(0, len(dataset)-1)
        # Redraw on duplicates so every displayed row is distinct.
        if candidate not in chosen:
            chosen.append(candidate)

    table = pd.DataFrame(dataset[chosen])
    for name, feature in dataset.features.items():
        if isinstance(feature, ClassLabel):
            table[name] = table[name].transform(lambda idx: feature.names[idx])
            continue
        if isinstance(feature, Sequence) and isinstance(feature.feature, ClassLabel):
            table[name] = table[name].transform(lambda ids: [feature.feature.names[i] for i in ids])
    display(HTML(table.to_html()))

In [27]:
# Spot-check 10 random tokenized training features (inputs plus start/end labels).
show_random_elements(tokenized_datasets['train'])

Unnamed: 0,attention_mask,end_positions,input_ids,start_positions,token_type_ids
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",26,"[3, 6186, 1769, 15324, 32, 1, 2177, 9207, 15299, 4947, 1501, 11532, 2978, 1630, 16, 6812, 12018, 10079, 8692, 13371, 2845, 40, 1050, 22674, 17, 4053, 1008, 4462, 1635, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",23,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",11,"[3, 6186, 1769, 15324, 32, 1, 6318, 1661, 26589, 2980, 12898, 21, 6253, 10413, 2102, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",6,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",9,"[3, 3397, 6834, 1538, 9118, 32, 1, 7732, 1697, 5282, 15, 3823, 1007, 15456, 9807, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",7,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",16,"[3, 3397, 6834, 1538, 9118, 32, 1, 31057, 1538, 13791, 1544, 14795, 12186, 15, 8010, 8640, 9040, 15, 7448, 5596, 1028, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",14,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",12,"[3, 6186, 1769, 15324, 32, 1, 3411, 63, 26505, 10132, 9790, 15, 28259, 15, 6506, 11511, 1066, 4219, 10795, 1010, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",12,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",8,"[3, 6186, 1769, 15324, 32, 1, 5536, 1029, 7207, 6518, 8059, 6518, 8059, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",6,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",12,"[3, 3397, 6834, 1538, 9118, 32, 1, 1849, 4934, 15, 18585, 3980, 1015, 15, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",10,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7,"[3, 6186, 1769, 15324, 32, 1, 9671, 9887, 15, 4462, 14878, 5679, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",6,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
8,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7,"[3, 6186, 1769, 15324, 32, 1, 28002, 10729, 2321, 20565, 14391, 10745, 1608, 28012, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",6,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",8,"[3, 6186, 1769, 15324, 32, 1, 31815, 22, 21, 15, 20, 6181, 27, 24, 19745, 1488, 2070, 1661, 17, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",6,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


## export

In [28]:
# Persist the tokenized train/test splits to disk.
# NOTE(review): hard-coded absolute local path — consider a configurable data directory.
tokenized_datasets.save_to_disk(r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\datasets\shopee QA dataset")