In [1]:
import os
import requests
import zipfile
import pandas as pd
import numpy as np

In [3]:
# Read the filtered gazetteer table and isolate its ASCII place-name column.
df = pd.read_csv("gazetteer/filtered_gazetteer.csv")
location_df = df[['asciiname']]

# Discard missing values first, then keep only entries that are genuine
# strings so later string handling cannot hit an unexpected type.
location_names = list(
    filter(
        lambda candidate: isinstance(candidate, str),
        location_df['asciiname'].dropna().tolist(),
    )
)

print(f"Extracted {len(location_names)} location names.")



Extracted 80888 location names.


In [11]:
# Echo the extracted names as this cell's output for a quick visual check.
location_names

['Yotsuhama',
 'Tahara-mura',
 'Tatsuno-shi',
 'Shimo-funaka-mura',
 'Shingu',
 'Naha Air Base',
 'Naegi-cho',
 'Mimitsu',
 'Katayamazu',
 'Kadena Air Base',
 'Gonotao',
 'Futenma Marine Corps Air Station',
 'Atsugi',
 'Akiaga-eki',
 'Akashi',
 'Zushi',
 'Zukeran',
 'Zozo',
 'Zoshuku',
 'Awazucho',
 'Zeze-eki',
 'Zeze',
 'Zentsuji Shi',
 'Zentsujicho',
 'Ikawadanicho-zenkai',
 'Zengi',
 'Zenda',
 'Zeidani',
 'Zazare',
 'Zaomachi',
 'Zama Shi',
 'Zamami Son',
 'Zamami',
 'Azumacho-zama',
 'Zama',
 'Zakimi',
 'Zaisho-mura',
 'Zaisho',
 'Kamitakaracho-zaike',
 'Zahana',
 'Zaha',
 'Yuzuriha',
 'Yuzawa',
 'Yuza',
 'Yuyama',
 'Yuyama',
 'Amagasemachi-yuyama',
 'Yuwan',
 'Yutorino',
 'Yuto-cho',
 'Yutamaminami',
 'Yutaka-mura',
 'Yusuhara',
 'Yushima-mura',
 'Yurugi',
 'Yurakucho',
 'Yura Cho',
 'Yura',
 'Yura',
 'Yura',
 'Yuracho-yura',
 'Yunoyama',
 'Yunotsuru-onsen',
 'Yunotsucho-kohama',
 'Gero',
 'Yunoo',
 'Yunoo',
 'Katsumotocho-yunomotoura',
 'Yunomae Machi',
 'Yunomae',
 'Yunokuchi',


In [4]:
def prepare_spacy_training_data(location_names):
    """Build templated sentences that tag each location name as a GPE entity.

    Every usable name is placed at the very start of a fixed sentence and
    paired with one ``(start, end, "GPE")`` character span that covers
    exactly the name.

    Args:
        location_names: Iterable of candidate place names. Entries that are
            not strings, or that are empty once surrounding whitespace is
            trimmed, are skipped.

    Returns:
        A list of ``(sentence, {"entities": [(0, len(name), "GPE")]})``
        tuples, one per usable name, in input order.
    """
    print("Preparing SpaCy training data...")
    training_data = []

    for raw_name in location_names:
        if not isinstance(raw_name, str):
            continue

        # Trim first: an entity span that starts/ends on whitespace (or has
        # zero length) is not a usable NER annotation.
        name = raw_name.strip()
        if not name:
            continue

        # Template spelling corrected ("severly" -> "severely") so the typo
        # is not baked into every generated training sentence.
        sentence = f"{name} was affected severely by earthquake."
        entities = [(0, len(name), "GPE")]
        training_data.append((sentence, {"entities": entities}))

    print(f"Prepared {len(training_data)} sentences for training.")
    return training_data

In [5]:
# Turn every extracted gazetteer name into a templated, GPE-annotated sentence.
training_data = prepare_spacy_training_data(location_names)

Preparing SpaCy training data...
Prepared 80888 sentences for training.


In [9]:
# Join the directory and file name as separate components; passing a single
# pre-joined string to os.path.join does nothing.
training_data_file = os.path.join("train", "train_data_1.jsonl")

In [10]:
import json

# Create the destination folder if needed so a fresh checkout does not fail
# with FileNotFoundError when the file is opened for writing.
train_output_dir = os.path.dirname(training_data_file)
if train_output_dir:
    os.makedirs(train_output_dir, exist_ok=True)

# JSONL: one JSON document per line. The encoding is pinned so the file does
# not depend on the machine's locale default.
with open(training_data_file, 'w', encoding='utf-8') as file:
    for entry in training_data:
        json.dump(entry, file)
        file.write('\n')

print(f"Training data saved to {training_data_file}")

Training data saved to train/train_data_1.jsonl


In [13]:
import pandas as pd
import json
import unicodedata
import re
from io import StringIO


# Load the annotated samples; the code below reads its `text`, `start_loc`
# and `end_loc` columns. Note: this rebinds `df`, which earlier held the
# gazetteer table.
df = pd.read_csv("data2.csv")

def convert_to_ascii(text):
    """Return ``text`` reduced to plain ASCII.

    The string is NFKD-normalized (so e.g. an accented letter splits into a
    base letter plus a combining mark) and every character that still has no
    ASCII encoding is dropped. The result may therefore differ in length
    from the input.
    """
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')


# Lower-case keywords tagged as DISASTER when they occur as whole words.
_DISASTER_KEYWORDS = (
    "earthquake", "death", "died", "shake", "collapse", "collapses",
    "collapsed", "shaking", "aftershocks", "tremors", "disturbances",
    "disaster", "casualty",
)

# Compiled once at definition time instead of once per row and keyword.
_DISASTER_PATTERNS = tuple(
    re.compile(r'\b' + re.escape(keyword) + r'\b') for keyword in _DISASTER_KEYWORDS
)


def create_custom_format(df):
    """Convert annotated rows into ``[text, {"entities": [...]}]`` records.

    Each row contributes one record: the ASCII-folded text, one ``GPE`` span
    taken from the row's offsets, and one ``DISASTER`` span per whole-word
    keyword hit (case-insensitive), listed in keyword order.

    Args:
        df: Table exposing ``iterrows()`` whose rows carry ``text``,
            ``start_loc`` and ``end_loc``. The offsets are assumed to index
            the original (pre-ASCII) ``text`` -- TODO confirm against the
            data source.

    Returns:
        A list of ``[text, {"entities": [[start, end, label], ...]}]``
        records with plain ``int`` offsets.

    Raises:
        ValueError: If an offset cannot be converted to ``int`` (e.g. NaN).
    """
    records = []

    for _, row in df.iterrows():
        raw_text = row['text']
        text = convert_to_ascii(raw_text)

        # Plain ints keep slicing and JSON serialization well-defined even if
        # the frame hands back numpy or float values.
        raw_start = int(row['start_loc'])
        raw_end = int(row['end_loc'])

        # ASCII folding can drop or expand characters, which shifts positions.
        # Folding works character by character, so the folded length of the
        # prefix is the new offset in the folded text.
        gpe_start = len(convert_to_ascii(raw_text[:raw_start]))
        gpe_end = len(convert_to_ascii(raw_text[:raw_end]))

        entities = [[gpe_start, gpe_end, "GPE"]]

        # Safe to lower-case once: the text is ASCII, so offsets are preserved.
        lowered = text.lower()
        for pattern in _DISASTER_PATTERNS:
            for match in pattern.finditer(lowered):
                start, end = match.span()
                # Skip hits inside/overlapping the location span; overlapping
                # entity annotations conflict with each other.
                if start < gpe_end and end > gpe_start:
                    continue
                entities.append([start, end, "DISASTER"])

        records.append([text, {"entities": entities}])

    return records

custom_records = create_custom_format(df)

output_file = 'train/train_data_2.jsonl'

# Create the destination folder if needed so a fresh checkout does not fail
# with FileNotFoundError when the file is opened for writing.
output_parent = os.path.dirname(output_file)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)

# One JSON record per line; the encoding is pinned so the file does not
# depend on the machine's locale default.
with open(output_file, 'w', encoding='utf-8') as f:
    for record in custom_records:
        f.write(json.dumps(record) + '\n')

print(f"JSONL data has been saved to {output_file}.")

JSONL data has been saved to train/train_data_2.jsonl.
