In [1]:
import pandas as pd
import sys
import re
from date_measurement import DateMeasurement as DM 
from date_tool import dt_search as dt_search
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Disaster category -> regex alternation of keywords for that category.
# get_re_string() picks an entry by checking whether the category key occurs
# as a substring of a record id, so insertion order decides ties.
re_disaster = dict(
    gempa=r'gempa|tektonik',
    tsunami=r'tsunami',
    erupsi=r'vulkanik|erupsi|letusan|awan panas|lava',
    kekeringan=r'kekeringan',
    banjir=r'banjir',
    angin_topan=r'badai|puting beliung|angin topan|tornado|angin kencang',
    longsor=r'longsor',
    karhutla=r'kebakaran hutan|kebakaran lahan|titik panas',
)


In [3]:
def refine_content_date(content_date, pub_date):
    """Complete and de-duplicate the dates extracted from an article body.

    Args:
        content_date: '|'-separated date strings found in the content; may be
            empty.
        pub_date: Publication date string. Its first four characters are used
            as the year for incomplete entries.

    Returns:
        A '|'-joined string of unique dates in first-seen order. When
        ``content_date`` is empty the result is ``pub_date`` itself.
    """
    if content_date:
        y_pub = pub_date[:4]
        # Entries shorter than 10 characters are completed with the
        # publication year (presumably month-day fragments -- confirm against
        # dt_search.find_dates).
        l_date = [
            val if len(val) >= 10 else y_pub + "-" + val
            for val in content_date.split('|')
        ]
    else:
        # Nothing found in the content: fall back to the publication date.
        l_date = [pub_date]

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the joined string vary between interpreter runs.
    return '|'.join(dict.fromkeys(l_date))

In [4]:
def extract_date(pub_date_text, content_text):
    """Extract the publication date and the dates mentioned in the content.

    Args:
        pub_date_text: Text searched for the publication date
            (``dt_search.find_dates`` is called with ``mmdd=False``).
        content_text: Text searched for content dates (default options).

    Returns:
        A ``(pub_date, content_date)`` tuple, where ``content_date`` has been
        post-processed by ``refine_content_date`` using ``pub_date``.
    """
    pub_date = dt_search.find_dates(pub_date_text, mmdd=False)
    raw_content_dates = dt_search.find_dates(content_text)
    return pub_date, refine_content_date(raw_content_dates, pub_date)

In [5]:
def get_re_string(text_id):
    """Return the keyword regex of the first disaster category named in ``text_id``.

    Categories are tried in ``re_disaster`` insertion order; a category matches
    when its key is a substring of the lower-cased ``text_id``. An empty
    pattern is returned when no category matches.
    """
    matching_patterns = (
        pattern
        for category, pattern in re_disaster.items()
        if category in text_id.lower()
    )
    return next(matching_patterns, r'')

In [6]:
def filter_context_text(text_id, text):
    """Keep only the sentences of ``text`` that mention the record's disaster type.

    The text is split with ``sent_tokenize`` and each sentence is tested against
    the pattern from ``get_re_string(text_id)``. When that pattern is empty,
    every sentence matches and the text is kept whole (re-joined with spaces).

    Returns:
        The retained sentences joined by a single space.
    """
    sentences = sent_tokenize(text)
    pattern = get_re_string(text_id)

    kept = []
    for sentence in sentences:
        if re.search(pattern, sentence.lower(), re.I):
            kept.append(sentence)

    return ' '.join(kept)


In [7]:
# Directory holding the annotated workbooks. Edit this single constant to run
# the notebook against another checkout.
DATA_DIR = 'C:\\Users\\dharmapu\\Documents\\personal\\ui\\KA-AMSD_src\\paper-submission\\anotated_data\\date-time'

# Kept as plain strings: the export cell derives the output name from f_in
# with str methods.
f_in = DATA_DIR + '\\date-time_text.xlsx'
f_ref = DATA_DIR + '\\date-time.xlsx'

df = pd.read_excel(f_in)
df_ref = pd.read_excel(f_ref)

# The reference dates are attached by index alignment. If the two workbooks
# differ in size, the unmatched rows would silently get NaN, which the scoring
# loop interprets as "no annotated date" -- fail loudly instead.
if len(df) != len(df_ref):
    raise ValueError(
        "Row count mismatch: {} has {} rows but {} has {} rows".format(
            f_in, len(df), f_ref, len(df_ref)
        )
    )

df['date_text'] = df_ref['date']

In [8]:
# Score every article: extract the publication date and the dates mentioned in
# the disaster-related sentences, then compare the predicted content dates
# with the annotated reference dates.
result_columns = ['pred_date_pub', 'pred_date_text', 'ref_date', 'jc_sim', 'pre', 'rec', 'f1']
records = []

for index, row in df.iterrows():
    text_id = row['id']
    pub_date_text = row['date']
    # Join title and body, skipping missing (NaN) cells so an article without
    # one of them does not abort the whole run with a TypeError.
    content_text = ' '.join(
        str(part) for part in (row['title'], row['content']) if pd.notna(part)
    )
    content_text = filter_context_text(text_id, content_text)

    pub_date, content_date = extract_date(pub_date_text, content_text)

    # No annotated date for this row: use the extracted publication date as
    # the reference.
    if pd.isna(row['date_text']):
        ref_date = pub_date
    else:
        ref_date = row['date_text']

    pred_date = content_date

    # Both sides are '|'-separated date lists; drop stray single quotes
    # before comparing.
    l_true = [e.replace('\'', '') for e in ref_date.split('|')]
    l_pred = [e.replace('\'', '') for e in pred_date.split('|')]

    dm = DM(l_true, l_pred)
    records.append({
        'pred_date_pub': pub_date,
        'pred_date_text': content_date,
        'ref_date': ref_date,
        'jc_sim': dm.jaccard_simmilarity(),
        'pre': dm.precision(),
        'rec': dm.recall(),
        'f1': dm.f1_score(),
    })

# Attach all results at once instead of writing cell by cell inside the loop.
# Records are in row order, so positional assignment keeps them aligned; passing
# `columns` guarantees the columns exist even when df has no rows.
results = pd.DataFrame(records, columns=result_columns)
for column in result_columns:
    df[column] = results[column].values

In [9]:
# Metric columns that get averaged, and the report frame: identifying/date
# columns first, then the metrics.
columns = ['jc_sim', 'pre', 'rec', 'f1']
df_out = df[['id', 'date_text', 'ref_date', 'pred_date_pub', 'pred_date_text'] + columns]

# Mean of every metric, rendered with six decimals.
means_list = list(map("{:.6f}".format, (df_out[c].mean() for c in columns)))

# Summary row: a label, placeholders for the four date columns, then the means.
lst = ['MEAN'] + ['N/A'] * 4 + means_list
print(lst)

# Append the summary row to a copy so df_out keeps only per-article rows.
df_tmp = df_out.copy()
df_tmp.loc[len(df_tmp)] = lst
df_tmp

['MEAN', 'N/A', 'N/A', 'N/A', 'N/A', '0.662537', '0.683484', '0.716944', '0.685879']


Unnamed: 0,id,date_text,ref_date,pred_date_pub,pred_date_text,jc_sim,pre,rec,f1
0,angin_topan_0001,2018-09-30,2018-09-30,2018-10-01,2018-09-30,1,1,1,1
1,angin_topan_0002,2016-11-04,2016-11-04,2016-11-11,2016-11-04,1,1,1,1
2,angin_topan_0003,2018-01-03,2018-01-03,2018-01-04,2018-01-04,0,0,0,0
3,angin_topan_0005,2017-09-04,2017-09-04,2017-04-10,2017-04-09,0,0,0,0
4,angin_topan_0006,2019-08-19|2019-08-18,2019-08-19|2019-08-18,2019-08-20,2019-08-19,0.5,1,0.5,0.666667
...,...,...,...,...,...,...,...,...,...
2208,tsunami_1154,2018-09-28,2018-09-28,2018-09-29,2018-09-28,1,1,1,1
2209,tsunami_1158,,2018-10-03,2018-10-03,2018-10-03,1,1,1,1
2210,tsunami_1160,2018-12-22,2018-12-22,2018-12-25,2018-12-22|2018-12-24,0.5,0.5,1,0.666667
2211,tsunami_1163,2018-09-28,2018-09-28,2018-09-29,2018-09-29,0,0,0,0


In [10]:

# Write the report next to the input workbook. Only a trailing '.xlsx'
# extension is swapped for the result suffix; if the input name has no such
# extension the suffix is appended, so f_out can never equal f_in and
# overwrite the input.
out_suffix = '_rslt_spec_st.xlsx'
if f_in.lower().endswith('.xlsx'):
    f_out = f_in[:-len('.xlsx')] + out_suffix
else:
    f_out = f_in + out_suffix
df_tmp.to_excel(f_out, index=False)