In [27]:
import pandas as pd
import re
import emoji
import json
from tqdm import tqdm

In [28]:
df = pd.read_csv("../datasets/jptweets.csv")

In [29]:
df = df.drop_duplicates(subset='text', keep='first')

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 978 entries, 0 to 1082
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          978 non-null    object
 1   text        977 non-null    object
 2   time stamp  978 non-null    object
dtypes: object(3)
memory usage: 30.6+ KB


### Pre-processing

In [31]:
def clean_tweet(tweet):
    if not isinstance(tweet, str):
        tweet = ""
    tweet = re.sub(r"http\S+|www\S+|https\S+", " ", tweet, flags=re.MULTILINE)
    tweet = re.sub(r"@|#|:|_", " ", tweet)
    tweet = re.sub(r",", ", ", tweet)
    tweet = re.sub(r"!", "! ", tweet)
    tweet = re.sub(r"\?", "? ", tweet)
    tweet = re.sub(r";", "; ", tweet)
    tweet = re.sub(r"[^\w\s.,!?;]", " ", tweet)
    tweet = emoji.replace_emoji(tweet, " ")  
    tweet = re.sub(r"(?<!\d)\.|(?<=\d)\.(?!\d)|(?<!\d)\.(?!\d)", ". ", tweet)
    tweet = re.sub(r"(?<=\d)\. (?=\d)", ".", tweet)
    tweet = re.sub(r"\s+", " ", tweet).strip()
    return tweet

def is_ascii(text):
    return all(ord(char) < 128 for char in text)


df['text'] = df['text'].apply(clean_tweet)
df = df[df['text'].apply(is_ascii)]
df = df[df['text'].str.strip() != '']
df = df.reset_index(drop=True)

df


Unnamed: 0,id,text,time stamp
0,@EN_NERV,"Major Tsunami Warning 1 1, 4 22pm The Tsunami ...",2024-01-01
1,@diar_esthetic,According to mathematical and tectonophysical ...,2024-10-07
2,@LiveStormChaser,Earlier video showing smaller wave caused by t...,2024-01-01
3,@EN_NERV,"Tsunami Advisory 8 8, 4 44pm A Tsunami Advisor...",2024-08-08
4,@volcaholic1,UNCONFIRMED video of considerable damage in No...,2024-01-01
...,...,...,...
750,@SallySueIam,Replying to mattgaetz,2024-10-06
751,@NathanNoonan,Replying to Terumi4416 and fema,2024-10-07
752,@mntomorii,Replying to ceriseaustralis,2024-10-02
753,@mrsverypicky,Japan s NotoEarthquake thousands of survivors ...,2024-02-14


In [32]:
def find_non_ascii_chars(tweet):
    return set(char for char in tweet if ord(char) >= 128)

non_ascii_chars = set()
for tweet in df['text']:
    non_ascii_chars.update(find_non_ascii_chars(tweet))

print("Unique non-ASCII characters found in the tweets:")
print(non_ascii_chars)


Unique non-ASCII characters found in the tweets:
set()


In [33]:
jp = pd.read_csv('../datasets/jp.csv')
city = pd.read_csv('../datasets/city.csv')
countries = pd.read_csv('../datasets/countries.csv')
added_locs = pd.read_csv('../datasets/added_locs.csv')

  jp = pd.read_csv('../datasets/jp.csv')


In [34]:
jp = jp[jp['feature class'].isin(['A', 'P'])]

In [35]:
jp = jp[jp['population']>0]

In [36]:
jp

Unnamed: 0.1,Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
2,4766202,1847945,Tatsuno-shi,Tatsuno-shi,Tatsuno;Tatsuno-cho;Tatsuno-chō;Tatsuno-machi;...,34.88804,134.51910,A,ADM2,JP,,13.0,1847945.0,,,79870,,250,Asia/Tokyo,2017-07-21
4,4766204,1847947,Shingū,Shingu,Schingu;Shingu;Shingui;Shingū;Sing;Singu;Singu...,33.73333,135.98333,P,PPLA2,JP,,43.0,1852105.0,,,31619,,7,Asia/Tokyo,2017-07-22
19,4766219,1847963,Atsugi,Atsugi,Acugi;Atsugi;Atsugicho;Atsugichō;Atsuki;Atugi;...,35.44272,139.36931,P,PPLA2,JP,,19.0,1864928.0,,,229199,,26,Asia/Tokyo,2017-07-22
22,4766222,1847966,Akashi,Akashi,Akashi;Akasi;Akasi-chhi;Akasi-chhī;Akasis;Akas...,34.65524,135.00687,P,PPLA2,JP,,13.0,1865470.0,,,297279,,27,Asia/Tokyo,2017-07-28
24,4766224,1847968,Zushi,Zushi,Zushi;dou zi;zushi;ずし;ズシ;逗子,35.29483,139.57812,P,PPLA2,JP,,19.0,7470888.0,,,60055,,9,Asia/Tokyo,2017-07-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89018,4855218,11497708,Masaki-chō,Masaki-cho,,33.79242,132.72446,A,ADM3,JP,,5.0,1861141.0,38401.0,,31168,,13,Asia/Tokyo,2017-04-07
89021,4855221,11523579,Yonedacho Sendo,Yonedacho Sendo,,34.77560,134.82551,P,PPLX,JP,,13.0,1860702.0,,,6000,,54,Asia/Tokyo,2017-04-14
89022,4855222,11524139,Hokuto,Hokuto,bei du;bei du shi;北杜;北杜市,35.83458,138.39606,P,PPLA2,JP,,46.0,7415915.0,,,45000,,731,Asia/Tokyo,2017-07-21
90711,4856911,11668516,Aioi Shi,Aioi Shi,xiang sheng shi;相生市,34.82719,134.46699,A,ADM2,JP,,13.0,11668516.0,,,30931,,100,Asia/Tokyo,2017-08-07


In [37]:
city

Unnamed: 0.1,Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,37927,1149361,Islamic Republic of Afghanistan,Islamic Republic of Afghanistan,'Afikanisitani;'Apekanikana;A Phu Han (Afghani...,33.00000,66.00000,A,PCLI,AF,,0,,,,29121286,,2260,Asia/Kabul,2015-10-07
1,131289,2240444,Luanda Province,Luanda Province,Distrito de Luanda;Loanda;Luanda;Luanda Provin...,-8.91667,13.33924,A,ADM1,AO,,20.0,,,,6542942,,103,Africa/Luanda,2016-05-04
2,141795,3351879,Republic of Angola,Republic of Angola,'Enikola;Africa Ocidental Portuguesa;An'nkola;...,-12.50000,18.50000,A,PCLI,AO,,0.0,,,,13068161,,1355,Africa/Luanda,2013-04-03
3,202632,3435907,Provincia de Buenos Aires,Provincia de Buenos Aires,BA;Buenos Aires;Provincia de Buenos Aires,-36.00000,-60.00000,A,ADM1,AR,,1.0,,,,13827203,,54,America/Argentina/Buenos_Aires,2014-06-11
4,236675,3865483,Argentine Republic,Argentine Republic,'Asenitina;A Can GJinh (Argentina);Ac-hen-ti-n...,-34.00000,-64.00000,A,PCLI,AR,,00,,,,41343201,,189,,2016-02-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,10987267,1085593,Eastern Cape,Eastern Cape,Austrumkapa;Austrumkāpa;Cabo Oriental;Cap Orie...,-32.00000,26.00000,A,ADM1,ZA,,5.0,,,,6490714,,1125,Africa/Johannesburg,2014-07-30
367,10987268,1085594,Gauteng,Gauteng,Chrysotopia;Gauteng;Gauteng Gobol;Gauteng-sen;...,-26.08333,28.25000,A,ADM1,ZA,,6.0,,,,10058121,,1652,Africa/Johannesburg,2016-11-04
368,10987271,1085597,Limpopo,Limpopo,IPhondo yaLimpopo;Kgaolo ya Limpopo;Limpopas;L...,-24.00000,29.50000,A,ADM1,ZA,,9.0,,,,5494928,,1373,Africa/Johannesburg,2016-11-09
369,11005424,895949,Republic of Zambia,Republic of Zambia,An tSaimbia;Dam-bi-a;Dam-bi-a (Zambia);Dăm-bi-...,-14.33333,28.50000,A,PCLI,ZM,,0.0,,,,13460305,,1156,Africa/Lusaka,2016-09-12


In [38]:
countries

Unnamed: 0.1,Unnamed: 0,Country,Density(P/Km2),Abbreviation,Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Calling Code,Capital/Major City,...,Out of pocket health expenditure,Physicians per thousand,Population,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,Latitude,Longitude
0,0,Afghanistan,60,AF,58.10%,652230,323000,32.49,93.0,Kabul,...,78.40%,0.28,38041754,48.90%,9.30%,71.40%,11.12%,9797273,33.939110,67.709953
1,2,Algeria,18,DZ,17.40%,2381741,317000,24.28,213.0,Algiers,...,28.10%,1.72,43053054,41.20%,37.20%,66.10%,11.70%,31510100,28.033886,1.659626
2,4,Angola,26,AO,47.50%,1246700,117000,40.73,244.0,Luanda,...,33.40%,0.21,31825295,77.50%,9.20%,49.10%,6.89%,21061025,-11.202692,17.873887
3,6,Argentina,17,AR,54.30%,2780400,105000,17.02,54.0,Buenos Aires,...,17.60%,3.96,44938712,61.30%,10.10%,106.30%,9.79%,41339571,-38.416097,-63.616672
4,8,Australia,3,AU,48.20%,7741220,58000,12.60,61.0,Canberra,...,19.60%,3.68,25766605,65.50%,23.00%,47.40%,5.27%,21844756,-25.274398,133.775136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,190,Venezuela,32,VE,24.50%,912050,343000,17.88,58.0,Caracas,...,45.80%,1.92,28515829,59.70%,,73.30%,8.80%,25162368,6.423750,-66.589730
107,191,Vietnam,314,VN,39.30%,331210,522000,16.75,84.0,Hanoi,...,43.50%,0.82,96462106,77.40%,19.10%,37.60%,2.01%,35332140,14.058324,108.277199
108,192,Yemen,56,YE,44.60%,527968,40000,30.45,967.0,Sanaa,...,81.00%,0.31,29161922,38.00%,,26.60%,12.91%,10869523,15.552727,48.516388
109,193,Zambia,25,ZM,32.10%,752618,16000,36.19,260.0,Lusaka,...,27.50%,1.19,17861030,74.60%,16.20%,15.60%,11.43%,7871713,-13.133897,27.849332


In [39]:
a = list(jp['asciiname']) + list(city['name']) + list(countries['Country']) + list(added_locs['location'])

city_list = list(
    set(re.sub(r'[^a-zA-Z]', ' ', str(element)).strip() for element in a if element is not None)
)

city_list.sort()

In [40]:
locations = pd.DataFrame(city_list, columns=['location'])

In [41]:
locations

Unnamed: 0,location
0,Abashiri
1,Abashiri shi
2,Abiko
3,Abiko shi
4,Abira Cho
...,...
2958,Zushi
2959,Zushi Shi
2960,le de France
2961,saka fu


In [42]:
locations.to_csv('../datasets/jpgpe1.csv', index=False)

In [43]:
len(city_list)

2963

In [44]:
jsonl_data = []

for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    content = row["text"]
    hashtags = city_list

    entities = []
    
    for hashtag in hashtags:
        if isinstance(hashtag, str):
            lowercase_hashtag = hashtag.lower()
        else:
            lowercase_hashtag = str(hashtag).lower()
        
        pattern = r'\b' + re.escape(lowercase_hashtag) + r'\b'
        matches = re.finditer(pattern, content.lower())
        for match in matches:
            start_idx = match.start()
            end_idx = match.end()
            if [start_idx, end_idx, "GPE"] not in entities:
                entities.append([start_idx, end_idx, "GPE"])
    
    jsonl_data.append([f"{content} ", {"entities": entities}])

with open("../datasets/test/1xjp.jsonl", "w", encoding="utf-8") as jsonl_file:
    for entry in jsonl_data:
        json.dump(entry, jsonl_file)
        jsonl_file.write("\n")

Processing rows: 100%|██████████| 755/755 [01:11<00:00, 10.49it/s]


In [45]:
disaster_keywords = [
    "earthquake", "tremor", "aftershock", "seismic", "fault", "epicenter",
    "magnitude", "Richter scale", "shaking", "ground", "foreshock",
    "tectonic", "plate", "shockwave", "aftermath", "felt", "feel",
    "strong", "massive", "devastating", "violent", "powerful", "intense",
    "mild", "deep", "surface", "shallow", "damage", "collapse", "ruins", "wreckage",
    "destroyed", "cracks", "crumbling", "impact", "disaster", "displaced",
    "homeless", "injury", "injuries", "fatalities", "debris", "rubble", "casualties",
    "trapped", "died", "alert", "warning", "evacuation", "rescue",
    "search", "emergency", "relief", "assistance", "volunteers", "preparedness",
    "shelter", "efforts", "response team", "seismograph", "seismology",
    "intensity", "measurement", "USGS", "depth", "geological", "seismometer",
    "tsunami", "landslide", "fire", "eruption", "volcano", "flood", "pray", "thoughts",
    "fear", "panic", "trauma", "loss", "tragedy", "devastation", "solidarity", "support"
]

In [46]:
input_file = "../datasets/test/1xjp.jsonl"  
jsonl_data = []

with open(input_file, "r", encoding="utf-8") as file:
    for line in file:
        jsonl_data.append(json.loads(line))

updated_jsonl_data = []

for entry in tqdm(jsonl_data, desc="Processing rows"):
    content = entry[0]
    entities = entry[1]["entities"]

    for keyword in disaster_keywords:
        if isinstance(keyword, str):
            lowercase_keyword = keyword.lower()
        else:
            lowercase_keyword = str(keyword).lower()
        
        pattern = r'\b' + re.escape(lowercase_keyword) + r'\b'
        matches = re.finditer(pattern, content.lower())
        for match in matches:
            start_idx = match.start()
            end_idx = match.end()
            if [start_idx, end_idx, "DISASTER"] not in entities:
                entities.append([start_idx, end_idx, "DISASTER"])
    
    updated_jsonl_data.append([content, {"entities": entities}])

output_file = "../datasets/test/2xjp.jsonl"

with open(output_file, "w") as jsonl_file:
    for entry in updated_jsonl_data:
        json.dump(entry, jsonl_file)
        jsonl_file.write("\n")

Processing rows:   0%|          | 0/755 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 755/755 [00:00<00:00, 1851.73it/s]


In [47]:
def has_overlap(entities):
    for i, (start1, end1, _) in enumerate(entities):
        if start1 >= end1: 
            raise ValueError("Invalid entity: start must be less than end")
        for j, (start2, end2, _) in enumerate(entities):
            if start2 >= end2:
                raise ValueError("Invalid entity: start must be less than end")
            if i != j and (start1 < end2 and start2 < end1):  
                return True
    return False

def keep_largest_entity(entities):
    
    entities.sort(key=lambda x: (x[0], x[1] - x[0]), reverse=True)
    
    non_overlapping_entities = []
    
    for entity in entities:
        overlap_found = False
        for existing_entity in non_overlapping_entities:
            if (entity[0] < existing_entity[1] and entity[1] > existing_entity[0]):
                overlap_found = True
                break
        if not overlap_found:
            non_overlapping_entities.append(entity)
    
    return non_overlapping_entities

def filter_non_overlapping_lines(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            data = json.loads(line)
            content = data[0]
            entities = data[1].get("entities", [])
            
            adjusted_entities = keep_largest_entity(entities)
            
            if not has_overlap(adjusted_entities):  
                data[1]['entities'] = adjusted_entities 
                json.dump(data, outfile)
                outfile.write("\n")

input_file = "../datasets/test/2xjp.jsonl"
output_file = "../datasets/test/3xjp.jsonl"
filter_non_overlapping_lines(input_file, output_file)


In [48]:
def remove_empty_entities(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            tweet = json.loads(line)

            if tweet[1]["entities"]:
                json.dump(tweet, outfile, ensure_ascii=False)
                outfile.write("\n")

input_file = "../datasets/test/3xjp.jsonl"  
output_file = "../datasets/test/4xjp.jsonl"  

remove_empty_entities(input_file, output_file)


In [49]:
from langdetect import detect

def filter_english_lines(input_file, output_file):
    filtered_data = []
    
    with open(input_file, "r", encoding="utf-8") as infile:
        for line in infile:
            text, annotations = json.loads(line)
            try:
                if detect(text) == "en":
                    filtered_data.append((text, annotations))
            except Exception as e:
                print(f"Error detecting language for line: {line.strip()}\n{e}")

    with open(output_file, "w", encoding="utf-8") as outfile:
        for item in filtered_data:
            outfile.write(json.dumps(item) + "\n")
    
    print(f"Filtered data saved to {output_file}")

input_file = "../datasets/test/4xjp.jsonl"  
output_file = "../datasets/test/5xjp.jsonl"  
filter_english_lines(input_file, output_file)


Filtered data saved to ../datasets/test/5xjp.jsonl
