In [1]:
import pandas as pd
import ast

In [2]:
df = pd.read_csv("dataset/mountain_dataset_with_markup.csv")

## Convert markers from str to list 

In [3]:
df['marker'] = df['marker'].apply(ast.literal_eval)

## Redefine table to contain index of word, not the index of letter 

#### Artifacts in dataset with name Patagonian Andes (cause this dataset create GPT)

In [None]:
df[df['marker'].apply(len) > 1]

### Balance dataset

In [17]:
df_with_markup = df[df['marker'].apply(len) > 0]

In [19]:
df_without_markup = df[df['marker'].apply(len) < 1]

In [23]:
df_combined = pd.concat([df_with_markup, df_without_markup.iloc[:len(df_with_markup)]])

In [26]:
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [28]:
df = df_combined

### Apply function

In [29]:
def markup_generator(markup: tuple):
    for value in markup:
        yield value

def sentence_markup(row):
    text = row['text']
    markup = row['marker']
    words = text.split()
    # base res
    res = ['O'] * len(words)
    
    if markup:
        markups = markup_generator(markup)
        temp = next(markups)
        stop_index = 0
        
        for i, d in enumerate(words):
            start_index = text.find(d, stop_index)
            stop_index += len(d) + 1
            # print(start_index, d, stop_index)
            
            # check to pick next item
            if start_index > temp[1]:
                try:
                    temp = next(markups)
                except:
                    break
            
            # change value
            if temp[0] <= start_index < temp[1]:
                if i == 0:
                    res[i] = "B-LOC"
                else:
                    if res[i - 1] == 'O':
                        res[i] = "B-LOC"
                    else:
                        res[i] = "I-LOC"
    return res


### Test

In [30]:
sentence_markup(df.iloc[30])

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

### Applying

In [31]:
df['markup'] = df.apply(sentence_markup, axis=1)

In [32]:
df

Unnamed: 0,text,marker,markup
0,A visit to a riverside park with scenic views.,[],"[O, O, O, O, O, O, O, O, O]"
1,The Brooks Range is the northernmost mountain ...,"[(4, 16)]","[O, B-LOC, I-LOC, O, O, O, O, O, O, O, O]"
2,I enjoyed a relaxing vacation by the beach.,[],"[O, O, O, O, O, O, O, O]"
3,Red north money star prevent box truth. Agains...,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"I explored the Patagonian Andes in Argentina, ...","[(26, 31), (15, 31)]","[O, O, O, O, B-LOC, O, O, O, O, O, O, O]"
...,...,...,...
447,Feel the spiritual connection to the sacred mo...,"[(61, 70)]","[O, O, O, O, O, O, O, O, O, O, B-LOC, O, O, O,..."
448,Sell card son toward fund. Wall side place pro...,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
449,Particularly tell not section southern suffer....,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O]"
450,Camping under the stars is a peaceful retreat.,[],"[O, O, O, O, O, O, O, O]"


### Dropping unnecessary column

In [33]:
df = df.drop(columns='marker')

In [34]:
df

Unnamed: 0,text,markup
0,A visit to a riverside park with scenic views.,"[O, O, O, O, O, O, O, O, O]"
1,The Brooks Range is the northernmost mountain ...,"[O, B-LOC, I-LOC, O, O, O, O, O, O, O, O]"
2,I enjoyed a relaxing vacation by the beach.,"[O, O, O, O, O, O, O, O]"
3,Red north money star prevent box truth. Agains...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"I explored the Patagonian Andes in Argentina, ...","[O, O, O, O, B-LOC, O, O, O, O, O, O, O]"
...,...,...
447,Feel the spiritual connection to the sacred mo...,"[O, O, O, O, O, O, O, O, O, O, B-LOC, O, O, O,..."
448,Sell card son toward fund. Wall side place pro...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
449,Particularly tell not section southern suffer....,"[O, O, O, O, O, O, O, O, O, O, O, O, O]"
450,Camping under the stars is a peaceful retreat.,"[O, O, O, O, O, O, O, O]"


### Export with data type saving

In [35]:
df.to_pickle('dataset/ready_dataset.pkl')

In [36]:
df[df['text'].apply(len) == 177]

Unnamed: 0,text,markup
333,Immerse yourself in the diverse ecosystems of ...,"[O, O, O, O, O, O, O, O, B-LOC, O, O, O, O, O,..."
