In [49]:
import pandas as pd
import ast

In [63]:
# Load the annotated sentences; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset/mountain_dataset_with_markup.csv")

## Convert markers from str to list 

In [64]:
# Parse the stringified marker lists into real Python lists. Values that are
# already parsed are passed through unchanged, so re-running this cell does not
# raise (ast.literal_eval rejects anything that is not a string or AST node).
df['marker'] = df['marker'].apply(
    lambda marker: ast.literal_eval(marker) if isinstance(marker, str) else marker
)

## Rebuild the table so that labels refer to words, not to character offsets

#### Artifacts in the dataset: rows naming the Patagonian Andes carry overlapping markers (because this dataset was generated by GPT)

In [55]:
# Show only the rows whose marker list holds more than one entry.
marker_counts = df['marker'].map(len)
df.loc[marker_counts > 1]

Unnamed: 0,text,marker
128,"I climbed Mount Eiger in the Swiss Alps, a cha...","[(10, 21), (35, 39)]"
480,Mount St. Helens is an active volcano in the C...,"[(45, 58), (0, 16)]"
531,The Patagonian Andes in Argentina and Chile of...,"[(15, 20), (4, 20)]"
698,Mount Cook is a prominent feature of the South...,"[(0, 10), (50, 54)]"
975,"I explored the Patagonian Andes in Argentina, ...","[(26, 31), (15, 31)]"
1292,I trekked through the Patagonian Andes in Arge...,"[(33, 38), (22, 38)]"
1335,Mount Cook is the highest peak in New Zealand ...,"[(0, 10), (71, 75)]"
1428,Mount Cook in New Zealand is a prominent featu...,"[(0, 10), (65, 69)]"
1447,I trekked through the Patagonian Andes in Chile.,"[(33, 38), (22, 38)]"


### Apply function

In [99]:
def markup_generator(markup: tuple):
    """Lazily yield each entry of ``markup`` in its original order."""
    yield from markup

def _merge_spans(markup):
    """Return the character spans sorted by start, with overlapping spans merged."""
    merged = []
    for start, end in sorted(tuple(span) for span in markup):
        if end <= start:
            # Empty or inverted spans cannot cover any word.
            continue
        if merged and start < merged[-1][1]:
            # Overlaps the previous span: extend it instead of keeping a duplicate.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged


def sentence_markup(row):
    """Convert character-offset markers into one BIO tag per whitespace-separated word.

    Parameters
    ----------
    row : mapping
        Must provide ``row['text']`` (the sentence) and ``row['marker']``
        (an iterable of ``(start, end)`` character offsets, end exclusive).

    Returns
    -------
    list of str
        One tag per element of ``text.split()``: ``"B-LOC"`` for the first word
        of a marked span, ``"I-LOC"`` for the following words of the same span,
        and ``"O"`` for every other word.

    Notes
    -----
    * Markers may arrive in any order and may overlap; they are sorted and
      overlapping ones are merged before tagging.
    * A word is tagged when it overlaps a span at all, so attached punctuation
      (``"(Alps)"``, ``"Range."``) does not hide the entity.
    * Word offsets are tracked from the end of the previous word, so repeated
      whitespace in the text does not shift the alignment.
    """
    text = row['text']
    words = text.split()
    # Every word starts out as "outside any entity".
    res = ['O'] * len(words)

    markup = row['marker']
    if not markup:
        return res
    spans = _merge_spans(markup)

    cursor = 0        # character offset just past the previous word
    span_idx = 0      # first span that can still overlap the current word
    prev_span = None  # span index of the most recently tagged word
    for i, word in enumerate(words):
        # Searching from the end of the previous word always lands on this
        # word's own occurrence, never on a match inside an earlier word.
        start_index = text.find(word, cursor)
        stop_index = start_index + len(word)
        cursor = stop_index

        # Drop spans that end at or before this word; they cannot match again.
        while span_idx < len(spans) and spans[span_idx][1] <= start_index:
            span_idx += 1
        if span_idx == len(spans):
            break

        # The span ends after the word starts (guaranteed by the loop above),
        # so the two overlap exactly when the span starts before the word ends.
        if spans[span_idx][0] < stop_index:
            # A new span always opens with B-LOC, even right after another entity.
            res[i] = "I-LOC" if prev_span == span_idx else "B-LOC"
            prev_span = span_idx
    return res


### Test

In [97]:
# Spot-check the tagger on a single row (positional index 30).
sentence_markup(df.iloc[30])

0 The 4
4 Blue 9
9 Ridge 15
15 Mountains 25
25 in 28


['O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

### Applying

In [100]:
# Tag every row: axis=1 passes each row to sentence_markup, and the returned
# list of tags is stored in the new 'markup' column.
df['markup'] = df.apply(sentence_markup, axis=1)

In [101]:
# Display the frame to inspect the new 'markup' column.
df

Unnamed: 0,text,marker,markup
0,A visit to a science museum for hands-on learn...,[],"[O, O, O, O, O, O, O, O, O]"
1,Voice surface coach set democratic time year. ...,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Parent according maybe activity activity finis...,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,A visit to a sculpture garden with intriguing ...,[],"[O, O, O, O, O, O, O, O, O]"
4,The Julian Alps in Slovenia offer pristine lak...,"[(11, 15)]","[O, O, B-LOC, O, O, O, O, O, O, O, O]"
...,...,...,...
1579,They never audience meet. Appear region allow ...,[],"[O, O, O, O, O, O, O, O, O, O, O]"
1580,Witnessing the mesmerizing Northern Lights dan...,"[(75, 97)]","[O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC..."
1581,Consumer join stage. Best likely center they p...,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1582,Hospital real school cover hotel over. Any tra...,[],"[O, O, O, O, O, O, O, O, O, O, O]"


### Dropping unnecessary column

In [102]:
# The character-offset markers are not needed once the per-word tags exist.
# errors='ignore' keeps this cell re-runnable after the column is already gone.
df = df.drop(columns='marker', errors='ignore')

In [103]:
# Display the frame to confirm the 'marker' column was removed.
df

Unnamed: 0,text,markup
0,A visit to a science museum for hands-on learn...,"[O, O, O, O, O, O, O, O, O]"
1,Voice surface coach set democratic time year. ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Parent according maybe activity activity finis...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,A visit to a sculpture garden with intriguing ...,"[O, O, O, O, O, O, O, O, O]"
4,The Julian Alps in Slovenia offer pristine lak...,"[O, O, B-LOC, O, O, O, O, O, O, O, O]"
...,...,...
1579,They never audience meet. Appear region allow ...,"[O, O, O, O, O, O, O, O, O, O, O]"
1580,Witnessing the mesmerizing Northern Lights dan...,"[O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC..."
1581,Consumer join stage. Best likely center they p...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1582,Hospital real school cover hotel over. Any tra...,"[O, O, O, O, O, O, O, O, O, O, O]"


### Export as pickle to preserve column data types

In [105]:
# Pickle keeps the 'markup' lists as Python objects instead of flattening them to text.
df.to_pickle('dataset/ready_dataset.pkl')