In [1]:
import pandas as pd
import numpy as np

# Merge

This notebook merges the information types, topics and ground truth set.

Outputs:
- `df_int.csv`
- `df_char.csv`

In [2]:
def clean_dataset_char(df):
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df.dropna(inplace=True)
    return df


def clean_dataset_int(df):
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    #    df.round()
    df.replace(np.nan,0)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    df.astype(int)
    df = df.replace(r'\D+', '', regex=True)
    return df[indices_to_keep].astype(np.float64)

In [3]:
# Labelled Tweets

df_t = pd.read_csv("dataset_labelled_tweets.csv")
df_t['category'] = df_t['category'].str.lower()
df_t.head()

Unnamed: 0,tweet_id,priority,category,postCategories_x,eventID,eventType
0,211565974422425600,0.75,serviceavailable,19,fireColorado2012,Unknown
1,211654415503990784,0.5,news,15,fireColorado2012,Unknown
2,211681309368655872,0.25,news,15,fireColorado2012,Unknown
3,211685621125742592,0.25,official,16,fireColorado2012,Unknown
4,211877049147736064,0.25,factoid,7,fireColorado2012,Unknown


In [4]:
# Information Types

df_i = pd.read_csv("dataset_information_types.csv")
df_i['id'] = df_i['id'].str.lower().str.split('-')
df_i.head()

Unnamed: 0,id,desc,intentType,exampleLowLevelTypes
0,"[request, goodsservices]",The user is asking for a particular service or...,Request,"['PsychiatricNeed', 'Equipment', 'ShelterNeede..."
1,"[request, searchandrescue]",The user is requesting a rescue (for themselve...,Request,"['SelfRescue', 'OtherRescue']"
2,"[request, informationwanted]",The user is requesting information,Request,"['PersonsNews', 'MissingPersons', 'EventStatus']"
3,"[calltoaction, volunteer]",The user is asking people to volunteer to help...,CallToAction,['RegisterNow']
4,"[calltoaction, donations]",The user is asking people to donate goods/money,CallToAction,"['DonateMoney', 'DonateGoods', 'PromoteFundRai..."


In [5]:
# Ground truth set
df = pd.read_csv("before_selection.csv")
df_b = df
df_b.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,0,1.155485e+18,20,660,150,990,180,420,0,11109999656677246,11109999656677246,11109999656677246
1,1,1.155487e+18,10,390,520,2570,77090,28140,3920,0,0,0
2,2,1.155488e+18,20,630,170,980,5370,14860,200,0,0,0
3,3,1.155494e+18,40,570,630,2570,2950,21450,20,100,11109999656677246,10529999732971191
4,4,1.155495e+18,150,420,400,2170,16820,1040,510,0,0,0


## Mapping

In [6]:
#TODO: Incomplete, matching correctly but not merging fully

In [7]:
df_t['matchedName'] = df_i['id'].apply(lambda x: [item for item in x if item in df_t['category'].tolist()])
df_t

Unnamed: 0,tweet_id,priority,category,postCategories_x,eventID,eventType,matchedName
0,211565974422425600,0.75,serviceavailable,19,fireColorado2012,Unknown,[]
1,211654415503990784,0.50,news,15,fireColorado2012,Unknown,[]
2,211681309368655872,0.25,news,15,fireColorado2012,Unknown,[informationwanted]
3,211685621125742592,0.25,official,16,fireColorado2012,Unknown,[volunteer]
4,211877049147736064,0.25,factoid,7,fireColorado2012,Unknown,[donations]
...,...,...,...,...,...,...,...
938,396336012726525952,0.25,news,15,laAirportShooting2013,Unknown,
939,396336079856345088,0.25,news,15,laAirportShooting2013,Unknown,
940,396336243442589696,0.25,news,15,laAirportShooting2013,Unknown,
941,396336297968562176,0.25,factoid,7,laAirportShooting2013,Unknown,


In [8]:
# map eventID -> numeric value

In [9]:
# Drop until we match / populate these
df = df_t.drop(['category','matchedName','eventType','eventID'],axis =1)
df

Unnamed: 0,tweet_id,priority,postCategories_x
0,211565974422425600,0.75,19
1,211654415503990784,0.50,15
2,211681309368655872,0.25,15
3,211685621125742592,0.25,16
4,211877049147736064,0.25,7
...,...,...,...
938,396336012726525952,0.25,15
939,396336079856345088,0.25,15
940,396336243442589696,0.25,15
941,396336297968562176,0.25,7


In [10]:
df_char = clean_dataset_char(df)
df_char

Unnamed: 0,tweet_id,priority,postCategories_x
0,211565974422425600,0.75,19
1,211654415503990784,0.50,15
2,211681309368655872,0.25,15
3,211685621125742592,0.25,16
4,211877049147736064,0.25,7
...,...,...,...
938,396336012726525952,0.25,15
939,396336079856345088,0.25,15
940,396336243442589696,0.25,15
941,396336297968562176,0.25,7


In [11]:
#
#df.sort_values(by=['label_id'])
df_int = clean_dataset_int(df)
df_int

Unnamed: 0,tweet_id,priority,postCategories_x
0,2.115660e+17,0.75,19.0
1,2.116544e+17,0.50,15.0
2,2.116813e+17,0.25,15.0
3,2.116856e+17,0.25,16.0
4,2.118770e+17,0.25,7.0
...,...,...,...
938,3.963360e+17,0.25,15.0
939,3.963361e+17,0.25,15.0
940,3.963362e+17,0.25,15.0
941,3.963363e+17,0.25,7.0


In [12]:
df_int.to_csv("df_int.csv", index=True)
df_char.to_csv("df_char.csv", index=True)