In [2]:
import pandas as pd
import numpy as np
import json
import glob
import ast

In [1]:
# Numeric score for each priority label; the priority cell further down
# looks labels up here before averaging them per tweet.
priority_mapping = dict(
    Critical=10,
    High=7.5,
    Medium=5,
    Low=2.5,
    Unknown=0,
)

# Integer id for each 2020 event. The ids are consecutive, starting at 35 for
# the first event listed below and ending at 49 for the last.
event_int_map = {
    event_id: number
    for number, event_id in enumerate(
        (
            "athensEarthquake2020",
            "baltimoreFlashFlood2020",
            "brooklynBlockPartyShooting2020",
            "daytonOhioShooting2020",
            "elPasoWalmartShooting2020",
            "gilroygarlicShooting2020",
            "hurricaneBarry2020",
            "indonesiaEarthquake2020",
            "keralaFloods2020",
            "myanmarFloods2020",
            "papuaNewguineaEarthquake2020",
            "siberianWildfires2020",
            "typhoonKrosa2020",
            "typhoonLekima2020",
            "whaleyBridgeCollapse2020",
        ),
        start=35,
    )
}

# Load the training data

- Tweets
    - A `Ground Truth Set` of tweets. 
- Labels
    - `TRECIS-2018-2020A-labels.json`
- Information Types
    - `TRECIS-2020-ITypes-Task1.json`

> TREC-IS provides multiple Twitter datasets collected from a range of past wildfire, earthquake, flood, typhoon/hurricane, bombing and shooting events. Human annotators manually label this data into 25 information types based on the information each tweet contains, such as 'contains location' or is a 'search and rescue request'. 

Each tweet is also assigned a ***priority* label**, that indicates how critical the information within that tweet is for a response officer to see. 



# Labels
`TRECIS-2018-2020A-labels.json` -> `labels_df`

```json
[
  {
    "eventID": "joplinTornado2011",
    "eventName": "2011 Joplin Tornado",
    "eventDescription": "The 2011 Joplin tornado was a catastrophic EF5-rated multiple-vortex tornado that struck Joplin, Missouri, late in the afternoon of Sunday, May 22, 2011. The user is a response officer in the Missouri command and control center responsible for impact to the state. <a href='https://en.wikipedia.org/wiki/2011_Joplin_tornado' target='_blank'>Wikipedia Page<a>",
    "eventType": "Unknown",
    "postID": "72676276212731904",
    "postCategories": [
      "Factoid",
      "Hashtags",
      "News"
    ],
    "postPriority": "Low"
  },
 ```

In [5]:
# Read the assessor labels file into a DataFrame and show it as the cell output.
labels_path = "../../../0-data/raw/data/2020/2020-A/labels/TRECIS-2018-2020A-labels.json"
labels_df = pd.read_json(labels_path)
labels_df

Unnamed: 0,eventID,eventName,eventDescription,eventType,postID,postCategories,postPriority
0,joplinTornado2011,2011 Joplin Tornado,The 2011 Joplin tornado was a catastrophic EF5...,Unknown,72676276212731904,"[Factoid, Hashtags, News]",Low
1,joplinTornado2011,2011 Joplin Tornado,The 2011 Joplin tornado was a catastrophic EF5...,Unknown,72678400833228800,"[ServiceAvailable, Official, Hashtags, News]",Critical
2,joplinTornado2011,2011 Joplin Tornado,The 2011 Joplin tornado was a catastrophic EF5...,Unknown,72682396750848000,"[Sentiment, Irrelevant]",Low
3,joplinTornado2011,2011 Joplin Tornado,The 2011 Joplin tornado was a catastrophic EF5...,Unknown,72693931619528704,"[ThirdPartyObservation, Hashtags, News]",Medium
4,joplinTornado2011,2011 Joplin Tornado,The 2011 Joplin tornado was a catastrophic EF5...,Unknown,72698562223407104,"[ThirdPartyObservation, Hashtags, Irrelevant]",Low
...,...,...,...,...,...,...,...
42946,typhoonKrosa2020,typhoonKrosa2020,Placeholder,Unknown,1161999740080291840,[Irrelevant],Low
42947,typhoonKrosa2020,typhoonKrosa2020,Placeholder,Unknown,1162004768904163328,"[Location, MultimediaShare, ContextualInformat...",Low
42948,typhoonKrosa2020,typhoonKrosa2020,Placeholder,Unknown,1162005174468132864,"[Location, MultimediaShare]",Low
42949,typhoonKrosa2020,typhoonKrosa2020,Placeholder,Unknown,1162005861075750912,"[Location, MultimediaShare, Hashtags]",Low


In [6]:
# Map the categories to numeric values
mymap = {'Advice':1, 'CleanUp':2, 'ContextualInformation':3, 'Discussion':4, 'Donations':5,
        'EmergingThreats':6, 'Factoid':7, 'FirstPartyObservation':8, 'GoodsServices':9, 'Hashtags':10,
        'InformationWanted':11,'Irrelevant':12, 'Location':13, 'MovePeople':14,
         'MultimediaShare':15, 'NewSubEvent':16, 'News':17,
        'Official':18, 'OriginalEvent':19, 'SearchAndRescue':20, 'Sentiment':21, 'ServiceAvailable':22,
         'ThirdPartyObservation':23,'Volunteer':24, 'Weather':25}

# One output column per category slot: cat1 .. cat10.
cat_columns = ["cat%d" % slot for slot in range(1, 11)]

# Encode every post's category list as ids and right-pad it to ten slots.
# - Names missing from mymap are kept unchanged rather than dropped.
# - Padding uses the integer 0 (not the string "0"), so a column never mixes
#   int ids with string placeholders.
# - Padding each row explicitly means the frame builds even when no post has
#   exactly ten categories; a post with more than ten still raises.
encoded_rows = [
    [mymap.get(name, name) for name in categories]
    + [0] * (len(cat_columns) - len(categories))
    for categories in labels_df["postCategories"]
]
df2 = pd.DataFrame(encoded_rows, columns=cat_columns, index=labels_df.index)

# labels_df is rebound to the joined frame below, so on a second run of this
# cell it already carries cat1..cat10. Drop them first; otherwise join()
# fails with overlapping column names.
df = labels_df.drop(columns=cat_columns, errors="ignore").join(df2)

labels_df = df


## Ground Truth

`TRECIS2020A-t12-assr*.json`

Read the responses for each assessor and append them to an array of assessor data


```json
"events": [
{"eventid": "siberianWildfires2020",
"tweets": [
{
  "postID" : "1157446798564306945",
  "timestamp" : "3 Mar 2020 12:28:12 GMT",
  "categories" : [ "Irrelevant" ],
  "indicatorTerms" : [ ],
  "priority" : "Low",
  "text" : "Trump offers Vladimir Putin help fighting forest fires in Siberia. (When California was consumed in biblical flames, Trump blamed the state's firefighters and slashed federal funding to stop wildfires.) https://t.co/TKTw3d0NLS"
},
```

In [7]:
training_data = []

def read_annotations(in_f_path):
    """Load one assessor annotation file.

    Parameters
    ----------
    in_f_path : str
        Path to a JSON annotation file.

    Returns
    -------
    list
        A one-element list holding the parsed JSON document, so callers can
        ``extend`` an accumulator with it.
    """
    with open(in_f_path, "rb") as in_file:
        # latin-1 maps every byte to a character, so this decode never fails.
        # NOTE(review): if the files are really UTF-8, non-ASCII tweet text
        # will be mis-decoded here -- confirm the intended encoding.
        annotator_content = in_file.read().decode("latin-1")

    return [json.loads(annotator_content)]


# 2020
# glob yields matches in arbitrary, filesystem-dependent order; sorting makes
# the order of training_data (and everything derived from it) reproducible.
for in_f_path in sorted(glob.glob("../../../0-data/raw/data/2020/2020-A/ground-truth-set/*assr*.json")):
    training_data.extend(read_annotations(in_f_path))

print("Annotations:", len(training_data))


Annotations: 5


## Map the annotated tweets

outputs 

**tweet_category_map**

```python
{'Advice': [243413475681001473,
  ```
**tweet_id_to_priority**

```python
[{'tweet_id': 243377845072715777, 'priority': 'Low'},

```

**tweet_id_to_category**
```python
{243413475681001473: 1,
```


In [8]:
# Flatten every assessor document into two record lists:
#   tweet_to_category    - one {"tweet_id", "category"} record per (tweet, category) pair
#   tweet_id_to_priority - one {"tweet_id", "priority"} record per annotated tweet
tweet_to_category = []
tweet_id_to_priority = []

for annotator in training_data:
    for event in annotator["events"]:
        for tweet in event["tweets"]:
            post_id = np.int64(tweet["postID"])

            # A tweet can carry several categories: one record for each.
            tweet_to_category.extend(
                {"tweet_id": post_id, "category": label}
                for label in tweet["categories"]
            )

            # Exactly one priority record per annotated tweet.
            tweet_id_to_priority.append(
                {"tweet_id": post_id, "priority": tweet["priority"]}
            )

print("Tweet to Category Map:", len(tweet_to_category))
print("Tweet ID to Priority Map:", len(tweet_id_to_priority))

category_df = pd.DataFrame(tweet_to_category)

# Number of distinct tweets that have at least one category record.
print("Tweets with Category:", category_df["tweet_id"].nunique())

# Export to CSV
#category_df.to_csv("../3-csv/tweet_to_category.csv", index=False)
category_df

Tweet to Category Map: 27036
Tweet ID to Priority Map: 12227
Tweets with Category: 6658


Unnamed: 0,tweet_id,category
0,1155671863911051265,Irrelevant
1,1155716993351204864,Location
2,1155716993351204864,MultimediaShare
3,1155716993351204864,News
4,1155914024690814979,InformationWanted
...,...,...
27031,1157292521376092160,Weather
27032,1157292521376092160,Location
27033,1157292521376092160,EmergingThreats
27034,1157292521376092160,MultimediaShare


## categoryMap()
- `category_df`
- `tweet_category_map`
- `tweet_id_to_category`

In [19]:
# categoryMap()
# Builds three lookups from (tweet_id, category) pairs:
#   tweet_category_map   : category name -> list of tweet ids
#   category_to_label    : category name -> 1-based integer label
#   tweet_id_to_category : tweet id      -> integer label

try:
    category_df = pd.read_csv("../3-csv/tweet_to_category.csv")
except FileNotFoundError:
    # The to_csv export that writes this file is commented out in the cell
    # that builds category_df, so the file may not exist on a fresh checkout.
    # Fall back to the frame already in memory; it has the same
    # tweet_id / category columns.
    print("../3-csv/tweet_to_category.csv not found; using the in-memory category_df")

tweet_category_map = {
    category: list(tweets["tweet_id"])
    for category, tweets in category_df.groupby("category")
}

# Labels are numbered by position among the categories present in the data.
# NOTE(review): they therefore only line up with the fixed ids in `mymap`
# when every category occurs at least once -- confirm this is intended.
category_to_label = {c: i + 1 for i, c in enumerate(tweet_category_map.keys())}

tweet_id_to_category = {}

for category, tweet_ids in tweet_category_map.items():
    for tweet_id in tweet_ids:
        # A tweet listed under several categories is assigned repeatedly, so
        # only the label written last is kept for it.
        tweet_id_to_category[np.int64(tweet_id)] = category_to_label[category]

#tweet_id_to_category
#tweet_category_map
#tweet_id_to_priority

print("Labels:", sum(len(v) for v in tweet_category_map.values()))


Maps each tweet ID in `tweet_id_to_priority` to its numerical priority value. We can then use this to calculate the error against our run.

Outputs `priority_df`. The equivalent `priority_map` dictionary is currently commented out in the cell below.

In [17]:
# Map the Priority to numeric value
# Each tweet can have several priority records; convert every label to its
# score via priority_mapping and keep the mean score per tweet.
priority_df = pd.DataFrame(tweet_id_to_priority)

temp_merged_priorities = [
    {
        "tweet_id": tweet_id,
        "priority": np.mean([priority_mapping[label] for label in group["priority"]]),
    }
    for tweet_id, group in priority_df.groupby("tweet_id")
]

priority_df = pd.DataFrame(temp_merged_priorities)

#priority_map = {row["tweet_id"]: row["priority"] for idx, row in priority_df.iterrows()}

priority_df.head()

Unnamed: 0,tweet_id,priority
0,1128285482784366592,7.5
1,1128285665186197504,2.5
2,1128285690779795459,2.5
3,1128285757624311808,2.5
4,1128285778306428934,2.5


#### ID -> EventID

Returns a map of events with all identified tweet IDs

```
{'albertaFloods2013': [347686624563429376,
  347766337344503808,
  347783236191129600,
  347793432514801664,
```

In [16]:
# Inner-join the averaged priorities with the label rows, matching
# priority_df.tweet_id against labels_df.postID.
merged_df = priority_df.merge(
    labels_df,
    how="inner",
    left_on="tweet_id",
    right_on="postID",
)

merged_df

Unnamed: 0,tweet_id,priority,eventID,eventName,eventDescription,eventType,postID,postCategories,postPriority,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10
0,1128285482784366592,7.50,papuaNewguineaEarthquake2020,papuaNewguineaEarthquake2020,Placeholder,Unknown,1128285482784366592,"[Location, Factoid]",High,13,7,0,0,0,0,0,0,0,0
1,1128285665186197504,2.50,papuaNewguineaEarthquake2020,papuaNewguineaEarthquake2020,Placeholder,Unknown,1128285665186197504,"[Location, Factoid, Hashtags]",Low,13,7,10,0,0,0,0,0,0,0
2,1128285757624311808,2.50,papuaNewguineaEarthquake2020,papuaNewguineaEarthquake2020,Placeholder,Unknown,1128285757624311808,"[Location, Factoid]",Low,13,7,0,0,0,0,0,0,0,0
3,1128286330356686848,2.50,papuaNewguineaEarthquake2020,papuaNewguineaEarthquake2020,Placeholder,Unknown,1128286330356686848,"[Location, MultimediaShare, Factoid, Hashtags]",Low,13,15,7,10,0,0,0,0,0,0
4,1128286351760265216,2.50,papuaNewguineaEarthquake2020,papuaNewguineaEarthquake2020,Placeholder,Unknown,1128286351760265216,"[Location, Factoid, Hashtags]",Low,13,7,10,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3724,1161986155442778112,3.75,typhoonKrosa2020,typhoonKrosa2020,Placeholder,Unknown,1161986155442778112,"[MovePeople, Weather, Location, Advice]",Medium,14,25,13,1,0,0,0,0,0,0
3725,1161988642698670080,2.50,typhoonKrosa2020,typhoonKrosa2020,Placeholder,Unknown,1161988642698670080,"[Location, MultimediaShare, Hashtags]",Low,13,15,10,0,0,0,0,0,0,0
3726,1161992743268343808,3.75,typhoonKrosa2020,typhoonKrosa2020,Placeholder,Unknown,1161992743268343808,"[Weather, Location]",Medium,25,13,0,0,0,0,0,0,0,0
3727,1161998036991561728,3.75,typhoonKrosa2020,typhoonKrosa2020,Placeholder,Unknown,1161998036991561728,[Irrelevant],Low,12,0,0,0,0,0,0,0,0,0


In [None]:
# Count the labels
# The category-list column is "postCategories" unless a merge added suffixes.
# Look up whichever name is present instead of assuming the "_y" suffix, which
# raises KeyError when the column was never suffixed.
categories_col = (
    "postCategories_y" if "postCategories_y" in merged_df.columns else "postCategories"
)
# .str.len() also works on list values: it gives the number of entries per row.
merged_df['num'] = merged_df[categories_col].str.len()
merged_df

In [None]:
# Map the events to numeric values
# Only the eventID column is touched; values without an entry in
# event_int_map are left as they are.
merged_df = merged_df.replace(to_replace={"eventID": event_int_map})
merged_df

In [None]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows containing NaN or +/-inf.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose values can all be cast to float.

    Returns
    -------
    pd.DataFrame
        The surviving rows, with their original index labels, cast to
        ``np.float64``. ``df`` itself is not modified.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() returns a new frame, so the caller's frame is left untouched.
    without_nan = df.dropna()
    # axis must be passed by keyword; the positional form is rejected by
    # current pandas.
    has_infinite = without_nan.isin([np.inf, -np.inf]).any(axis=1)
    return without_nan.loc[~has_infinite].astype(np.float64)

# Show the rows ordered by event; the result is not assigned, so df is unchanged.
df.sort_values(by="eventID")

In [None]:
# Drop irrelevant columns
# - The category-list column is "postCategories" when the merge added no
#   suffixes and "postCategories_x"/"postCategories_y" when it did, so all
#   three spellings are listed.
# - errors="ignore" skips names that are absent from merged_df instead of
#   raising KeyError.
columns_to_drop = [
    'postID', 'eventName', 'eventDescription',
    'postCategories', 'postCategories_y', 'postCategories_x',
    'postPriority', 'eventType', 'category',
]
df = merged_df.drop(columns=columns_to_drop, errors="ignore")
df.to_csv("../3-csv/labels.csv", index=False)
df


# Topic Statements

For each incident, we have a stream of related tweets, collected using hashtags, keyword, user, and geolocation monitoring. Each incident/event is accompanied by a brief "topic statement" in the TREC style:


`TRECIS-2018-2020A.topics`

```xml
<top>
<num>TRECIS-CTIT-H-Training-001</num>
<dataset>fireColorado2012</dataset>
<title>2012 Colorado wildfires</title>
<type>wildfire</type>
<url>https://en.wikipedia.org/wiki/2012_Colorado_wildfires</url>
<narr> The Colorado wildfires were an unusually devastating series of fires
in the US state of Colorado, which occurred throughout June, July, and
August 2012.
</narr>
</top>
```


 -> `topic_ids`
```python
{'athensEarthquake2019': 'TRECIS-CTIT-H-Test-035',
 'baltimoreFloods2019': 'TRECIS-CTIT-H-Test-036',
```

In [20]:
# NOTE: the topic-parsing code below is wrapped in a triple-quoted string, so
# none of it runs; the cell simply evaluates to that string and topic_ids is
# not defined by it. Remove the surrounding quotes to re-enable the parser.
'''
topic_ids = {}

with open("../../../0-data/raw/data/2020/2020-A/topics/TRECIS-2020-A.topics", "r") as in_file:
    topic_num = ""
    topic_id = ""
    
    for line in in_file:
        
        if line.strip() == "</top>":
            topic_ids[topic_id] = topic_num
        
        if line.startswith("<num>"):
            topic_num = line.partition(">")[-1].partition("<")[0]
              
        if line.startswith("<dataset>"):
            topic_id = line.partition(">")[-1].partition("<")[0]

topic_ids
'''

{'athensEarthquake2019': 'TRECIS-CTIT-H-Test-035',
 'baltimoreFloods2019': 'TRECIS-CTIT-H-Test-036',
 'brooklynBlockPartyShooting2019': 'TRECIS-CTIT-H-Test-037',
 'daytonOhioShooting2019': 'TRECIS-CTIT-H-Test-038',
 'elPasoWalmartShooting2019': 'TRECIS-CTIT-H-Test-039',
 'gilroygarlicShooting2019': 'TRECIS-CTIT-H-Test-040',
 'hurricaneBarry2019': 'TRECIS-CTIT-H-Test-041',
 'indonesiaEarthquake2019': 'TRECIS-CTIT-H-Test-042',
 'keralaFloods2019': 'TRECIS-CTIT-H-Test-043',
 'myanmarFloods2019': 'TRECIS-CTIT-H-Test-044',
 'papuaNewGuineaEarthquake': 'TRECIS-CTIT-H-Test-045',
 'siberianWildfires2019': 'TRECIS-CTIT-H-Test-046',
 'typhoonKrosa2019': 'TRECIS-CTIT-H-Test-047',
 'typhoonLekima2019': 'TRECIS-CTIT-H-Test-048',
 'whaleyBridgeDamCollapse2019': 'TRECIS-CTIT-H-Test-049'}

In [12]:
# Instantiate a new DataFrame to hold the categorised_tweets
#cat_df = pd.DataFrame.from_records([tweet_id_to_category])
#cat_df = cat_df.transpose()
#cat_df = cat_df.reset_index()
#cat_df.columns = ['tweet_id', 'postCategories']
#cat_df

## Merge the category and priority

In [13]:
# Join priority_df and category_df
#tweet_to_category_priority_df = priority_df.join(category_df.set_index('tweet_id'), on='tweet_id')

# Join tweet_to_category_priority_df and cat_df
#tweet_to_category_id_priority_df = tweet_to_category_priority_df.join(cat_df.set_index('tweet_id'), on='tweet_id')

#
#tweet_to_category_id_priority_df['postCategories'] = tweet_to_category_id_priority_df['postCategories']

#tweet_to_category_id_priority_df.sort_values('priority')

In [14]:
#df = tweet_to_category_id_priority_df
#df['category'] = df[['tweet_id','priority', 'category', 'postCategories']].groupby(['tweet_id','priority', 'postCategories'])['category'].transform(lambda x: ','.join(x))
#df[['tweet_id','priority', 'postCategories']].drop_duplicates()


# Low-Level Information Types
``TRECIS-2020-ITypes-Task1.json``

This loads the ontology file containing the **25** `Information Types` that we need to assign to the unlabelled tweets

In [21]:
# Load the ontology file and expand its nested "informationTypes" records
# into one column per field.
df = pd.read_json(
    "../../../0-data/raw/data/2020/2020-A/types/TRECIS-2020-ITypes-Task1.json",
    orient="columns",
)
# pop() removes the nested column from df, so the join below adds only the
# expanded fields alongside the remaining top-level columns.
type_records = pd.DataFrame(df.pop("informationTypes").tolist())
df_split = df.join(type_records)
# Drop irrelevant top-level columns (level?)
informationTypes = df_split.drop(columns=["identifier", "description", "level"])
informationTypes

Unnamed: 0,id,desc,intentType,exampleLowLevelTypes
0,Request-GoodsServices,The user is asking for a particular service or...,Request,"[PsychiatricNeed, Equipment, ShelterNeeded, Ve..."
1,Request-SearchAndRescue,The user is requesting a rescue (for themselve...,Request,"[SelfRescue, OtherRescue]"
2,Request-InformationWanted,The user is requesting information,Request,"[PersonsNews, MissingPersons, EventStatus]"
3,CallToAction-Volunteer,The user is asking people to volunteer to help...,CallToAction,[RegisterNow]
4,CallToAction-Donations,The user is asking people to donate goods/money,CallToAction,"[DonateMoney, DonateGoods, PromoteFundRaising]"
5,CallToAction-MovePeople,The user is asking people to leave an area or ...,CallToAction,"[EvacuateNow, GatherAt]"
6,Report-FirstPartyObservation,The user is giving an eye-witness account,Report,"[Group/IndividualMovement, PeopleEvacuating, D..."
7,Report-ThirdPartyObservation,The user is reporting a information that they ...,Report,"[Group/IndividualMovement, PeopleEvacuating, D..."
8,Report-Weather,The user is providing a weather report (curren...,Report,"[Current, Forecast]"
9,Report-Location,The post contains information about the user o...,Report,"[UserLocation, IncidentLocation]"
