In [4]:
import openai
from gpt4_config import GPT4_API_KEY

# Model identifier used by the chat-completion calls further down.
model = 'gpt-4-1106-preview'
# Hand the key imported from gpt4_config to the openai client.
openai.api_key = GPT4_API_KEY

In [1]:
import json
import pandas as pd
import time

In [53]:
def generate_sentences(n_sentences=50):
    """Ask the chat model for one batch of sentences about mountains.

    The generation checklist is sent as the system prompt, followed by a
    request for ``n_sentences`` sentences. The reply is requested in JSON
    mode and parsed before being returned.

    Args:
        n_sentences: Number of sentences to request in this call. The
            default of 50 reproduces the previously hard-coded request.

    Returns:
        dict: The parsed JSON reply; its ``'sentences'`` key holds a list.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
        ValueError: If the parsed reply has no ``'sentences'`` list.
    """
    messages = [
        {"role": "system",
         "content": """Here you have the checklist for generating sentences with mountain names:
                    Geographical Diversity: Include mountains from different continents and regions.
                    Contextual Variety: Use sentences from various contexts like travel, history, geography, and news.
                    Sentence Complexity: Mix simple and complex sentences, varying in length and structure.
                    Formal and Informal References: Incorporate both official and colloquial names of mountains.
                    Metaphorical Use: Include sentences where mountain names are used metaphorically.
                    Entity Co-occurrence: Use sentences where mountains are mentioned alongside other entities like rivers, cities, etc.
                    Temporal References: Have historical and current references to mountains.
                    Multiple Entities: Occasionally use sentences with multiple mountain names.
                    Negative Samples: Include sentences with mountain-related words but no mountain name.
                    Multilingual and Transliteration Aspects: If applicable, include names in different languages and transliterations.
                    Cultural and Mythological References: Add references from cultural or mythological texts.
                    Varied Data Sources: Gather sentences from diverse sources to ensure different writing styles.
                    JSON format: {'sentences':[s1,s2,...]} without newline characters"""},
        # The request goes in the "user" turn; the "assistant" role is meant
        # for the model's own earlier replies, not for instructions to it.
        {"role": "user",
         "content": f"generate {n_sentences} sentences. Return them in json without any other notes or comments on the task"}
    ]
    response = openai.ChatCompletion.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages,
    )
    sentences = response['choices'][0]['message']['content']
    sentences_dict = json.loads(sentences)

    # Fail here with a clear message rather than later with a bare KeyError
    # when the reply does not follow the requested {'sentences': [...]} shape.
    if not isinstance(sentences_dict, dict) or not isinstance(sentences_dict.get('sentences'), list):
        raise ValueError(f"Reply does not contain a 'sentences' list: {sentences!r}")

    return sentences_dict

In [None]:
#sentences_dict = generate_sentences()

In [16]:
#print(sentences_dict)

{'sentences': ["Mount Everest, the highest peak on Earth, straddles the border between Nepal and China's autonomous region of Tibet.", "The hikers embarked on an arduous journey to ascend Mount Kilimanjaro, Africa's tallest mountain, which is also a dormant volcano.", "Denali, formerly known as Mount McKinley, is North America's highest mountain peak and is renowned for its stunning beauty and challenging terrain.", 'The Inca citadel of Machu Picchu is nestled high in the Andes mountains, revealing the architectural ingenuity of ancient civilizations.', 'In Greek mythology, Mount Olympus was regarded as the abode of the gods, a place where Zeus and his pantheon resided.', "Mount Fuji's symmetrical cone, a celebrated symbol of Japan, is a frequent subject in art and literature due to its cultural significance.", 'During the climbing season, base camp at Everest becomes a bustling community of mountaineers, guides, and sherpas.', 'The Rocky Mountains stretch across North America, influen

In [None]:
# Build the corpus: 20 requests, each appended to sentences.txt with one
# sentence per line. The file is opened in append mode, so re-running this
# cell adds to whatever is already there.
# A named loop variable is used instead of `_`, which is IPython's
# "last output" shortcut and conventionally marks an unused value.
for batch in range(20):
    print(f'{batch}: generating 50.')
    sentences_dict = generate_sentences()
    with open('sentences.txt', 'a', encoding="utf-8") as file:
        for sentence in sentences_dict['sentences']:
            # The tagging cells read this file line by line, so any line
            # break inside a sentence is replaced by a space to keep the
            # one-sentence-per-line layout intact.
            file.write(' '.join(sentence.splitlines()) + '\n')
    print(f'{batch}: done.')

In [5]:
def tag_sentence(sentence):
    """Ask the chat model to tokenize one sentence and IOB-tag its tokens.

    The tag inventory and the required JSON layout are sent as the system
    prompt; the sentence itself is embedded in the request message. The
    reply is requested in JSON mode.

    Args:
        sentence: The sentence text to tokenize and tag.

    Returns:
        The content of the first reply choice, unparsed (the caller is
        expected to run ``json.loads`` on it).
    """
    messages = [
        {"role": "system",
         "content": """IOB (Inside, Outside, Beginning) tagging scheme:
                    MNT: For names of mountains.
                    MNTREL: General mountain-related term (e.g., summit, peak, ridge).
                    LOC: For broader geographical locations.
                    COUNTRY/CITY: For countries and cities related to mountains.
                    GEO: For other geographical features like rivers, forests.
                    PARK: For national parks or reserves including mountains.
                    HIST: For historical events related to mountains.
                    CULT: For cultural or religious sites associated with mountains.
                    PER: For names of people associated with mountains.
                    ORG: For organizations related to mountains.
                    DATE: For dates or times relevant to mountain-related events.
                    O: For tokens that don't belong to any specific category.
                    In the IOB scheme, "B-" indicates the beginning of an entity, and "I-" indicates that the token is inside an entity but not the first token.
                    (do not add any other tags!)
                    Required json format:
                    {"sentence_id": 0,
                    "tokens": [
                    {"token": "Mount", "pos": "NNP", "tag": "B-MNT"},
                    {"token": "Everest", "pos": "NNP", "tag": "I-MNT"},...]}"""},
        # The instruction goes in the "user" turn; the "assistant" role is
        # meant for the model's own earlier replies, not for requests to it.
        {"role": "user",
         "content": f'separate the next sentence into tokens and tag them with given tags(if applicable), add for each token its part of speech(pos) in abbreviation. Return content in json format. Do not add to the response any other your comments or notes! Sentence: {sentence}'}
    ]
    response = openai.ChatCompletion.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages,
    )
    tagged = response['choices'][0]['message']['content']
    return tagged

In [None]:
# Tag every sentence in sentences.txt and collect the parsed replies.
json_list = []
# sentences.txt is written as UTF-8, so it is read back with the same
# encoding rather than the platform default, which can fail or garble
# non-ASCII mountain names.
with open('sentences.txt', 'r', encoding='utf-8') as file:
    for sentence_number, sentence in enumerate(file, start=1):
        sentence = sentence.strip()
        if not sentence:
            # Nothing to tag on a blank line; skip the API call entirely.
            continue
        if sentence_number % 20 == 0:
            # Pause for a minute on every 20th line to space out API calls.
            print('---Short break---')
            time.sleep(60)
        print(f'{sentence_number}: loading...\t')
        tagged = tag_sentence(sentence)
        json_tagged = json.loads(tagged)
        # Overwrite whatever id the model returned with the 1-based line number.
        json_tagged['sentence_id'] = sentence_number
        json_list.append(json_tagged)
        print(f'{sentence_number}: done.\t')

In [None]:
# Resume tagging after an interrupted run. Instead of a hard-coded line
# number, skip every line whose id is already present in json_list, so
# re-running this cell never appends the same sentence twice.
already_tagged = {item['sentence_id'] for item in json_list}
with open('sentences.txt', 'r', encoding='utf-8') as file:
    for sentence_number, sentence in enumerate(file, start=1):
        if sentence_number in already_tagged:
            continue
        sentence = sentence.strip()
        if not sentence:
            # Nothing to tag on a blank line; skip the API call entirely.
            continue
        if sentence_number % 20 == 0:
            # Pause for a minute on every 20th line to space out API calls.
            print('---Short break---')
            time.sleep(60)
        print(f'{sentence_number}: loading...\t')
        tagged = tag_sentence(sentence)
        json_tagged = json.loads(tagged)
        # Overwrite whatever id the model returned with the 1-based line number.
        json_tagged['sentence_id'] = sentence_number
        json_list.append(json_tagged)
        print(f'{sentence_number}: done.\t')

In [22]:
# Sanity check: how many tagged sentences have been collected in json_list.
print(len(json_list))

490


In [23]:
# Persist all tagged sentences to disk as a single JSON array.
with open('data-tagged.json', 'w') as out_file:
    out_file.write(json.dumps(json_list))

In [24]:
# Read the tagged sentences back from disk.
with open('data-tagged.json', 'r') as in_file:
    data = json.loads(in_file.read())

# Expand each sentence's 'tokens' list into one row per token, carry the
# parent 'sentence_id' along as a column, and put the columns in a fixed order.
df = pd.json_normalize(
    data,
    record_path='tokens',
    meta=['sentence_id'],
).loc[:, ['sentence_id', 'token', 'pos', 'tag']]

In [25]:
# Show the flattened token table (one row per token).
df

Unnamed: 0,sentence_id,token,pos,tag
0,1,Mount,NNP,B-MNT
1,1,Everest,NNP,I-MNT
2,1,is,VBZ,O
3,1,known,VBN,O
4,1,as,IN,O
...,...,...,...,...
8265,490,a,DT,O
8266,490,mountain,NN,B-MNTREL
8267,490,range,NN,I-MNTREL
8268,490,in,IN,O


In [26]:
# List the distinct values in the tag column.
# NOTE(review): the recorded output includes tags with no B-/I- prefix
# ('HIST', 'MNTREL', 'LOC', 'DATE') and a combined 'B-COUNTRY/CITY' tag, so
# the labels probably need normalizing before they are used — confirm.
df.tag.unique()

array(['B-MNT', 'I-MNT', 'O', 'B-MNTREL', 'B-LOC', 'I-LOC', 'B-CITY',
       'I-CITY', 'B-COUNTRY', 'I-COUNTRY', 'B-PARK', 'I-PARK', 'I-MNTREL',
       'B-CULT', 'I-CULT', 'HIST', 'B-GEO', 'I-GEO', 'B-DATE', 'B-HIST',
       'MNTREL', 'I-DATE', 'B-PER', 'I-HIST', 'B-COUNTRY/CITY', 'B-ORG',
       'I-ORG', 'LOC', 'I-PER', 'DATE'], dtype=object)