In [6]:
#!pip3 install missing packages

import numpy as np
import re
import requests
import pandas as pd
import seaborn as sns
from functools import reduce

from igel import Igel

import matplotlib.pyplot as plt
from matplotlib import rcParams

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Helper Functions

In [7]:
def clean_dataset(df):
    #assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    #df.dropna(inplace=True)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df.replace(np.nan,0)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    df = df.replace(r'\D+', '', regex=True)
    return df
    #return df[indices_to_keep].astype(np.float64)
    

def clean_dataset_int(df):
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df.replace(np.nan,0)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    df = df.replace(r'\D+', '', regex=True)
    return df[indices_to_keep].astype(np.float64)

## Mappings

Defining various mappings we'll use when manipulating the data.

In [8]:
priority_scorer = {
    '10' : 'Critical',
    '9' : 'Critical',
    '8' : 'Critical',
    '7' : 'High',
    '6' : 'High',
    '5' : 'Medium',
    '4' : 'Medium',
    '3' : 'Low',
    '2' : 'Low',
    '1.0' : 'Low',
    '0.0' : 'Low',
}

priority_mapping = {
    "Critical" : 10,
    "High" : 7.5,
    "Medium" : 5,
    "Low" : 2.5,
    "Unknown" : 0,
}

# What we consider to be highly important categories of information
highCategoriser = {
    0.0 : 'Other-Advice',
    1.0 : 'Other-Advice',
    2.0 : 'Report-CleanUp',
    3.0 : 'ContextualInformation',
    4.0 : 'Other-ContextualInformation',
    5.0 : 'CallToAction-Donations',
    6.0 : 'Report-EmergingThreats',
    7.0 : 'Report-Factoid',
    8.0 : 'Report-FirstPartyObservation',
    9.0 : 'Request-GoodsServices',
    10.0 : 'Report-Hashtags',
    11.0 : 'Request-InformationWanted',
    12.0 : 'Other-Irrelevant',
    13.0 : 'Report-Location',
    14.0 : 'CallToAction-MovePeople',
    15.0 : 'Report-MultimediaShare',
    16.0 : 'Report-NewSubEvent',
    17.0 : 'Report-News',
    18.0 : 'Report-Official',
    19.0 : 'Report-OriginalEvent',
    20.0 : 'Request-SearchAndRescue',
    21.0 : 'Other-Sentiment',
    22.0 : 'Report-ServiceAvailable',
    23.0 : 'Report-ThirdPartyObservation',
    24.0 : 'CallToAction-Volunteer',
    25.0 : 'Report-Weather',
    26.0 : 'hmm',
}

event_int_map =	{
  "guatemalaEarthquake2012": 7,
  "joplinTornado2011": 16,
  "athensEarthquake2020": 35,
  "baltimoreFlashFlood2020": 36,
  "brooklynBlockPartyShooting2020": 37,
  "daytonOhioShooting2020": 38,
  "elPasoWalmartShooting2020": 39,
  "gilroygarlicShooting2020": 40,
  "hurricaneBarry2020": 41,
  "indonesiaEarthquake2020": 42,
  "keralaFloods2020": 43,
  "myanmarFloods2020": 44,
  "papuaNewguineaEarthquake2020": 45,
  "siberianWildfires2020": 46,
  "typhoonKrosa2020": 47,
  "typhoonLekima2020": 48,
  "whaleyBridgeCollapse2020": 49
}

mymap = {'Advice':1, 'CleanUp':2, 'ContextualInformation':3, 'Discussion':4, 'Donations':5, 
        'EmergingThreats':6, 'Factoid':7, 'FirstPartyObservation':8, 'GoodsServices':9, 'Hashtags':10, 
        'InformationWanted':11,'Irrelevant':12, 'Location':13, 'MovePeople':14, 
         'MultimediaShare':15, 'NewSubEvent':16, 'News':17,
        'Official':18, 'OriginalEvent':19, 'SearchAndRescue':20, 'Sentiment':21, 'ServiceAvailable':22, 
         'ThirdPartyObservation':23,'Volunteer':24, 'Weather':25}

## FeatureAPI

Load the feature vector in from Play

In [9]:
import requests


#url = 'http://localhost:9000/stored_tweets'
import json
def parse_json_stream(stream):
    decoder = json.JSONDecoder()
    while stream:
        obj, idx = decoder.raw_decode(stream)
        yield obj
        stream = stream[idx:].lstrip()

url = 'http://tweetminer-2336003gproject.ida.dcs.gla.ac.uk/stored_tweets'

data = requests.get(url).text  

# delete unused parameters to parse to JSON format
dataFormat = data.replace("\r\n  ","").replace("\r\n","").replace("}\n{", "},\n{").replace(",}", "}").replace("\n", "")

dataFormat = dataFormat.replace(' \\"', " ").replace('\\" ', " ")

# convert to valid structure
dataFormat='{"datas":['+dataFormat+']}'

# convert to JSON format
dataJson = json.loads(dataFormat)

dfNonFormatte= pd.DataFrame(dataJson)

dfNonFormatte

# convert our JSON to DATAFRAM
a = 0
df = []
for element in dfNonFormatte["datas"]:
  df1 = dfNonFormatte["datas"][a] 
  df2 = pd.json_normalize(df1['tweets'])
  df.append(df2)
  a = a + 1

    
# Concat result
df = pd.concat(df, axis=0, join='outer', ignore_index=True)
df

Unnamed: 0,tweet_id,tweet_text,offset,sentiment,embeddings,features.numb_of_urls,features.numb_of_hashtags,features.numb_of_personal_pronouns,features.numb_of_present_tenses,features.weighted_length,features.permillage,features.tweet_created_at,features.tweet_id_str,features.positive_sentiment,features.negative_sentiment,features.numb_of_mentions,features.numb_of_media,features.numb_of_past_tenses,features.numb_of_weird_chars,features.numb_of_questions,features.numb_of_emoticons,features.numb_of_swearing_word,features.numb_of_slang_words,features.numb_of_intensifiers,features.tweet_length,features.userFollowersCount,features.userFriendsCount,features.user_numb_of_tweets,features.user_list_count,features.dict_precision,features.dict_recall,features.dict_f_measure,features.offset,features.is_verified,tfIdf.typhoon,tfIdf.krosa,tfIdf.japans,tfIdf.storm,tfIdf.agency,tfIdf.japan,tfIdf.prefecture,tfIdf.city,tfIdf.landfall,tfIdf.thursday,tfIdf.update,tfIdf.kure,tfIdf.tropical,tfIdf.meteorological,tfIdf.hiroshima,tfIdf.severe,tfIdf.officials,tfIdf.weather,tfIdf.warning,tfIdf.western,tfIdf.news,tfIdf.canceled,tfIdf.flights,tfIdf.pleaseee,tfIdf.bless,tfIdf.god,tfIdf.braces,tfIdf.operations,tfIdf.august,tfIdf.chubu,tfIdf.irregular,tfIdf.experience,tfIdf.intl,tfIdf.airport,tfIdf.nagoya,tfIdf.joint,tfIdf.center,tfIdf.issued,tfIdf.rain,tfIdf.image,tfIdf.utc,tfIdf.analysis,tfIdf.cone,tfIdf.tracks,tfIdf.gem,tfIdf.total,tfIdf.hour,tfIdf.gfs,tfIdf.gefs,tfIdf.track,tfIdf.jma,tfIdf.pacificwest,tfIdf.lashes,tfIdf.japanese,tfIdf.powerful,tfIdf.snarls,tfIdf.travel,tfIdf.holiday,tfIdf.breaking,tfIdf.wreath,tfIdf.christmas,tfIdf.crapjokemonday,tfIdf.help,tfIdf.expected,tfIdf.jeep,tfIdf.tomorrow,tfIdf.assistance,tfIdf.storms,tfIdf.injured,tfIdf.heading,tfIdf.safe,tfIdf.typhoonkrosa,tfIdf.evacuate,tfIdf.advised,tfIdf.wallop,tfIdf.looms,tfIdf.course,tfIdf.closer,tfIdf.data,tfIdf.night,tfIdf.weatherupdate,tfIdf.type,tfIdf.local,tfIdf.cyclone,tfIdf.nagasaki,tfIdf.days,tfIdf.emc,tfIdf.speeds,tfIdf.wind,tfIdf.lekima,tfIdf.featuring,tfIdf.bulletin,tfIdf.chugoku,tfIdf.flight,tfIdf.affect,tfIdf.check,tfIdf.kyushu,tfIdf.domestic,tfIdf.miyazaki,tfIdf.ana,tfIdf.kochi,tfIdf.details,tfIdf.shikoku,tfIdf.info,tfIdf.jal,tfIdf.west,tfIdf.cancelled,tfIdf.evacuations,tfIdf.disrupts,tfIdf.ahead,tfIdf.transportation,tfIdf.eastern,tfIdf.shipping,tfIdf.web,tfIdf.asia,tfIdf.benzinga,tfIdf.ecosearch,tfIdf.troubles,tfIdf.tokyokrosa,tfIdf.heavy,tfIdf.tokyo,tfIdf.typhoons,tfIdf.threatening,tfIdf.photographed,tfIdf.asian,tfIdf.churn,tfIdf.photo,tfIdf.continued,tfIdf.countries,tfIdf.aero,tfIdf.pacific,tfIdf.space,tfIdf.destructive,tfIdf.east,tfIdf.ocean,tfIdf.winds,tfIdf.national,tfIdf.credit,tfIdf.twin,tfIdf.reuters,tfIdf.nyt,tfIdf.season,tfIdf.peak,tfIdf.leader,tfIdf.supertyphoon,tfIdf.gt,tfIdf.aug,tfIdf.attention,tfIdf.mainichi,tfIdf.bullet,tfIdf.trains,tfIdf.travelers,tfIdf.red,tfIdf.barrel,tfIdf.alert,tfIdf.beijing,tfIdf.issues,tfIdf.nears,tfIdf.urges,tfIdf.cancels,tfIdf.scores,tfIdf.res,tfIdf.russian,tfIdf.smoke,tfIdf.uwssec,tfIdf.uwcimss,tfIdf.wildland,tfIdf.version,tfIdf.snpp,tfIdf.noaa,tfIdf.view,tfIdf.composite,tfIdf.truecolor,tfIdf.jpss,tfIdf.layers,tfIdf.realearth,tfIdf.hourly,tfIdf.orbit,tfIdf.swath,tfIdf.flooding,tfIdf.rains,tfIdf.authorities,tfIdf.lashed,tfIdf.risk,tfIdf.advising,tfIdf.mph,tfIdf.nw,tfIdf.hurricane,tfIdf.tracked,tfIdf.position,tfIdf.southern,tfIdf.hitting,tfIdf.allah,tfIdf.elijah,tfIdf.china,tfIdf.messenger,tfIdf.video,tfIdf.weatherchannel,tfIdf.muhammad,tfIdf.approaches,tfIdf.prepares,tfIdf.nearing,tfIdf.hits,tfIdf.torrential,tfIdf.physorg,tfIdf.krosas,tfIdf.approach,tfIdf.nasa,tfIdf.nation,tfIdf.friday,tfIdf.influence,tfIdf.jtwc,tfIdf.bracing,tfIdf.himawari,tfIdf.ahi,tfIdf.rgb,...,tfIdf.registrationkerala,tfIdf.volunteer,tfIdf.registration,tfIdf.register,tfIdf.medical,tfIdf.woman,tfIdf.belt,tfIdf.rope,tfIdf.technique,tfIdf.team,tfIdf.crossed,tfIdf.pregnant,tfIdf.brothers,tfIdf.doforkerala,tfIdf.keralasfc,tfIdf.hatsoff,tfIdf.items,tfIdf.waterlogged,tfIdf.dcnation,tfIdf.receiving,tfIdf.battered,tfIdf.heros,tfIdf.persons,tfIdf.waterloggeded,tfIdf.kuttiady,tfIdf.paddy,tfIdf.alappuzhafloods,tfIdf.overflowing,tfIdf.arrives,tfIdf.diaster,tfIdf.held,tfIdf.taluks,tfIdf.mannarkkud,tfIdf.pinarayivijayan,tfIdf.offiice,tfIdf.cmokerala,tfIdf.emergencyresponse,tfIdf.policecontrolroom,tfIdf.ernakulampolice,tfIdf.keralapolice,tfIdf.aluva,tfIdf.measure,tfIdf.precautionary,tfIdf.cial,tfIdf.saved,tfIdf.hind,tfIdf.valor,tfIdf.congratulate,tfIdf.daughters,tfIdf.jai,tfIdf.ravaged,tfIdf.singh,tfIdf.prithviraj,tfIdf.ji,tfIdf.kasaragod,tfIdf.provisions,tfIdf.distributes,tfIdf.prevails,tfIdf.heavyrainfall,tfIdf.adoor,tfIdf.nil,tfIdf.konni,tfIdf.salute,tfIdf.extensive,tfIdf.schoolsafety,tfIdf.schoolsout,tfIdf.kalyanpar,tfIdf.tommorow,tfIdf.nemom,tfIdf.assam,tfIdf.bhakts,tfIdf.restricted,tfIdf.bollywood,tfIdf.spell,tfIdf.stateis,tfIdf.extremely,tfIdf.subside,tfIdf.tollacross,tfIdf.mathrubhumi,tfIdf.sounded,tfIdf.rainfury,tfIdf.deathtoll,tfIdf.jn,tfIdf.antyodaya,tfIdf.kochuveli,tfIdf.travelling,tfIdf.travellers,tfIdf.avoid,tfIdf.emigrants,tfIdf.sevabhaarathikeralam,tfIdf.nagambadom,tfIdf.overflows,tfIdf.temple,tfIdf.kasargode,tfIdf.imd,tfIdf.girls,tfIdf.tirur,tfIdf.standwithkerala,tfIdf.togetherforkerala,tfIdf.monsoonrain,tfIdf.overflow,tfIdf.confuse,tfIdf.driver,tfIdf.ambulance,tfIdf.boy,tfIdf.child,tfIdf.narendramodi,tfIdf.reduce,tfIdf.start,tfIdf.kozhikodefloods,tfIdf.damages,tfIdf.jobs,tfIdf.enterprises,tfIdf.deathtollrisesto,tfIdf.touches,tfIdf.traders,tfIdf.govt,tfIdf.keralavyaparivyavasayiekopanasamithi,tfIdf.failure,tfIdf.recurring,tfIdf.preserve,tfIdf.blame,tfIdf.mnm,tfIdf.closure,tfIdf.committee,tfIdf.secretary,tfIdf.sinha,tfIdf.chairs,tfIdf.decrease,tfIdf.deceased,tfIdf.spot,tfIdf.nadiad,tfIdf.gujarats,tfIdf.pond,tfIdf.crocodile,tfIdf.artificial,tfIdf.bignews,tfIdf.ahmedabadrain,tfIdf.westernrailway,tfIdf.ahmedabadrains,tfIdf.fall,tfIdf.zone,tfIdf.waybad,tfIdf.kolhapur,tfIdf.soniagandhi,tfIdf.coimbatore,tfIdf.continous,tfIdf.tamilnadu,tfIdf.coimbatorerain,tfIdf.nilgiris,tfIdf.monsoonready,tfIdf.carried,tfIdf.waded,tfIdf.gesture,tfIdf.knee,tfIdf.heartwarming,tfIdf.stuff,tfIdf.fleece,tfIdf.unchecked,tfIdf.wishers,tfIdf.crores,tfIdf.led,tfIdf.siphoned,tfIdf.thug,tfIdf.excuse,tfIdf.tragedy,tfIdf.comrades,tfIdf.floodskerala,tfIdf.spiderman,tfIdf.kerela,tfIdf.nilambur,tfIdf.underway,tfIdf.kavalappara,tfIdf.college,tfIdf.usual,tfIdf.student,tfIdf.campus,tfIdf.awkwardly,tfIdf.sacred,tfIdf.quarrying,tfIdf.repurposing,tfIdf.illegal,tfIdf.factors,tfIdf.chose,tfIdf.controllable,tfIdf.constructions,tfIdf.mkstalin,tfIdf.stalin,tfIdf.party,tfIdf.rajdhani,tfIdf.batter,tfIdf.spate,tfIdf.wreak,tfIdf.submerged,tfIdf.tourist,tfIdf.attraction,tfIdf.malapuram,tfIdf.methods,tfIdf.planning,tfIdf.construction,tfIdf.exposed,tfIdf.indian,tfIdf.floodsituation,tfIdf.molitics,tfIdf.donation,tfIdf.santhi,tfIdf.chaliyar,tfIdf.pothukalpanchayat,tfIdf.chaliyarriver,tfIdf.dropping,tfIdf.helicopter,tfIdf.marooned,tfIdf.prayforkerala,tfIdf.victorious,tfIdf.respite,tfIdf.converging,tfIdf.impacted,tfIdf.deepening,tfIdf.parties,tfIdf.effort,tfIdf.sir,tfIdf.humaneexpress,tfIdf.father,tfIdf.girl,tfIdf.adopt,tfIdf.floodcoveragewithtnie,tfIdf.mitigate,tfIdf.haunt,tfIdf.ghosts,tfIdf.continuous,tfIdf.tackling,tfIdf.assured,tfIdf.motor,tfIdf.shouldnt,tfIdf.buried,tfIdf.body,tfIdf.youth,tfIdf.bike,tfIdf.repeat,tfIdf.scenes,tfIdf.wrecked,tfIdf.sitting,tfIdf.excavated,tfIdf.eid,tfIdf.eidmubarak,tfIdf.mubarak,tfIdf.bakrid,tfIdf.sacrifice,tfIdf.improve,tfIdf.taanation
0,1161444994772656128,"[typhoon, krosa]",435876.0,"-0.13 , 0.01 , 0.15 , -0.06 , 0.05 , 0.05 , 0...","{-0.25539,-0.25723,0.13169,-0.042688,0.21817,-...",,0.0,0.0,0.0,33.0,117.0,1.565745e+12,1.161445e+18,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,16.0,11.0,58.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.441944,0.072934,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1161899515705565184,"[typhoon, update, japans, meteorological, agen...",544242.0,"-0.17 , -0.01 , 0.12 , 0.01 , 0.07 , 0.01 , 0...","{-6.4724126,-6.723974,-3.5606878,-5.038674,1.6...",,0.0,0.0,0.0,242.0,864.0,1.565853e+12,1.161900e+18,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,181.0,17497.0,14127.0,0.0,208.0,0.0,0.0,0.0,0.0,0.0,0.044194,0.007293,0.195701,0.052023,0.182583,0.027336,0.195701,0.213535,0.082095,0.14422,0.241516,0.213535,0.058018,0.182583,0.186585,0.12277,0.2559,0.151928,0.175428,0.07402,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1161933320315424768,"[typhoon, krosa, landfall, japan, flights, can...",552301.0,"0.05 , 0.04 , 0.20 , -0.13 , 0.04 , 0.09 , 0....","{-1.7787836,-2.1009903,-0.13006201,-0.39967805...",,1.0,0.0,0.0,116.0,414.0,1.565861e+12,1.161933e+18,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,58.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0,0.0,0.126270,0.020838,,,,0.078104,,,0.234556,,,,,,,,,,,,0.368431,0.440159,0.262083,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1161657339277012998,"[pleaseee, god, bless, japan, japan, braces, t...",486502.0,"-0.04 , -0.07 , 0.06 , -0.04 , 0.10 , -0.01 ,...","{-3.3788335,-3.8022702,-1.1094,-2.9061859,0.83...",,0.0,0.0,0.0,92.0,328.0,1.565796e+12,1.161657e+18,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,60.0,379.0,127.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.110486,0.018234,,,,0.136681,,,,,,,,,,,,,,,,,,0.777076,0.777076,0.777076,0.332657,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1161451155630616578,"[intl, august, flights, nagoya, chubu, airport...",437344.0,"-0.09 , 0.11 , 0.18 , -0.11 , 0.04 , 0.02 , 0...","{-3.086079,-3.0274096,1.9695101,-1.5520902,1.0...",,0.0,0.0,0.0,145.0,517.0,1.565747e+12,1.161451e+18,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,98.0,21447.0,2.0,0.0,208.0,0.0,0.0,0.0,0.0,1.0,0.080353,0.013261,,,,,,,,,,,,,,,,,,,,,0.166780,,,,,0.31896,0.223219,0.388245,0.31896,0.313092,0.388245,0.365398,0.376106,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3502,1161282602894024705,"[shocking, scenes, kavalappara, kerala, landsl...",507498.0,"-0.12 , -0.06 , 0.04 , -0.03 , 0.19 , -0.09 ,...","{-11.012937,-5.807516,-1.5672508,-2.5821385,4....",,4.0,0.0,0.0,275.0,982.0,1.565706e+12,1.161283e+18,0.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,206.0,4163.0,2589.0,0.0,106.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.230144,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.259025,0.259025,0.259025,0.259025,0.259025,0.259025,0.259025,0.259025,0.259025,0.259025,0.259025,,,,,,,
3503,1160792986818506752,"[eid, mubarak, pray, floods, kerala, india, ei...",390765.0,"0.00 , 0.11 , 0.13 , -0.12 , 0.00 , 0.06 , 0....","{-4.02581,-5.2378297,0.099971,-1.4532611,1.876...",,6.0,0.0,0.0,232.0,828.0,1.565590e+12,1.160793e+18,0.0,0.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,105.0,597.0,561.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.518051,0.518051,0.518051,0.518051,0.518051,,
3504,1162005634813849602,"[died, august, flood, incidents, kerala, injur...",679883.0,"-0.04 , -0.03 , 0.06 , -0.04 , 0.10 , 0.04 , ...","{-1.9977798,-2.4004693,0.45704,-0.91141105,0.9...",,1.0,0.0,0.0,124.0,442.0,1.565879e+12,1.162006e+18,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,72.0,65.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.379819,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.390695,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3505,1161578539608694784,"[heavy, rains, lash, kerala, flood, situation,...",578055.0,"0.01 , -0.12 , 0.05 , -0.04 , 0.21 , -0.10 , ...","{-5.519667,-7.700042,-0.49457493,-0.713958,2.6...",,6.0,0.0,0.0,209.0,746.0,1.565777e+12,1.161579e+18,0.0,0.0,0.0,0.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,161.0,63587.0,658.0,0.0,497.0,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.128897,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.223426,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.324909,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.365683,


# Normalise the sentiment and the embeddings

> Extract the sentiment and embeddings into multiple columns depending on need                                                 

In [None]:
def expand_sentiment(df):
    df['sentiment'] = df[df['sentiment'].apply(lambda x:pd.Series(x))]
    df_sentiment=pd.concat([df['sentiment'].str.split(',', expand=True)], axis=1, keys="s")
    # Join back onto the original dataframe
    df.join(df_sentiment)
    return df
    
def expand_embeddings(df):
    df['embeddings'] = df['embeddings'].str.extract('(\d+)', expand=False)
    df['embeddings'] = df['embeddings'].str[1:]
    df_embeddings = pd.concat([df['embeddings'].str.split(',', expand=True)], axis=1, keys="e")
    df_embeddings
    df.join(df_embeddings) # Join back onto the original dataframe
    df_emb_sent=df_sentiment.join(df_embeddings)
    df_emb_sent
    df.join(df_emb_sent)
    return df

In [None]:
df

In [None]:
df = expand_sentiment(df)

In [None]:
df = expand_embeddings(df)

In [None]:
# Drop the string categories we no longer need
df.drop(['tweet_text', 'embeddings', 'sentiment'], axis = 1, inplace = True)


In [None]:
# Group by tweet
feature_vector_input = df.groupby(['tweet_id']).agg('first')
feature_vector_input

## Load the labelled data

These are generated in 0_Labels.ipynb

In [None]:
# dtypes needs to be specified or long ints will change
labels_df = pd.read_json("../../../data/input/raw/data/2020/2020-A/labels/TRECIS-2018-2020A-labels.json", dtype={} )

# replace the event with a numeric value
labels_df = labels_df.replace({'eventID': event_int_map})

# Count the number of labels
labels_df['num'] = labels_df['postCategories'].str.len()

# Map the priority to a numeric value
labels_df = labels_df.replace({"postPriority": priority_mapping})

# Split categories and map to numeric values
category_list = pd.DataFrame(labels_df["postCategories"].to_list(), columns=['cat1', 'cat2', 'cat3',
                                                                   'cat4', 'cat5', 'cat6',
                                                                   'cat7', 'cat8', 'cat9', 'cat10'])


# Map the categories to numeric values
category_list = category_list.applymap(lambda s: mymap.get(s) if s in mymap else s)


# Join back onto our original list
labels = labels_df.join(category_list)

# Drop the string categories
labels.drop(['postCategories'], axis = 1, inplace = True)

# Tidy
labels = labels.drop(['eventName', 'eventDescription', 'eventType'], axis=1)

# Fill the NaN slots with 0
labels = labels.fillna("0")

# Export
labels.to_csv("../labels.csv", index=False)
labels = clean_dataset_int(labels)
#labels = clean_dataset_new(labels)

labels

# Train.csv


We merge the feature vector with the annotated data, this is used to train the classifier.

In [None]:
feature_vector_input.drop(['tweet_text'], axis = 1, inplace = True)
feature_vector_input.drop(['embeddings'], axis = 1, inplace = True)
feature_vector_input.drop(['sentiment'], axis = 1, inplace = True)

In [None]:
# Merges the input feature vector with the labels
train = pd.merge(labels, feature_vector_input, left_on = 'postID', right_on = 'tweet_id', how = 'inner')

# reset the index
train = train.reset_index()

# fill NaN and replace Infinity
train.fillna(0, inplace=True)
train.replace([np.inf, -np.inf], np.nan, inplace=True)
train = train.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

# export to csv
train.to_csv("../train.csv", index=False)

train

# Test.csv


Drops the categories, number of categories and priority so we can make our prediction

In [None]:
# Take a copy of the train dataframe
test = train

# Drop cat*
test.drop(list(test.filter(regex = 'cat\d+$')), axis = 1, inplace = True)


# Drop priority / num (of labels)
test.drop(['postPriority', 'num'], axis = 1, inplace = True)

#api_input.drop(['tweet_text_y'], axis = 1, inplace = True)
#api_input.drop(['embeddings_x'], axis = 1, inplace = True)
#api_input.drop(['sentiment_x'], axis = 1, inplace = True)

# export
test.to_csv("../test.csv", index=False)

test

# Igel

This notebook implements `Igel` and supports all sklearn's machine learning functionality.

Caution must be taken to avoid overfitting. See `docs/ml.md` for more information

Igel's supported models:

        +--------------------+----------------------------+-------------------------+
        |      regression    |        classification      |        clustering       |
        +--------------------+----------------------------+-------------------------+
        |   LinearRegression |         LogisticRegression |                  KMeans |
        |              Lasso |                      Ridge |     AffinityPropagation |
        |          LassoLars |               DecisionTree |                   Birch |
        | BayesianRegression |                  ExtraTree | AgglomerativeClustering |
        |    HuberRegression |               RandomForest |    FeatureAgglomeration |
        |              Ridge |                 ExtraTrees |                  DBSCAN |
        |  PoissonRegression |                        SVM |         MiniBatchKMeans |
        |      ARDRegression |                  LinearSVM |    SpectralBiclustering |
        |  TweedieRegression |                      NuSVM |    SpectralCoclustering |
        | TheilSenRegression |            NearestNeighbor |      SpectralClustering |
        |    GammaRegression |              NeuralNetwork |               MeanShift |
        |   RANSACRegression | PassiveAgressiveClassifier |                  OPTICS |
        |       DecisionTree |                 Perceptron |                    ---- |
        |          ExtraTree |               BernoulliRBM |                    ---- |
        |       RandomForest |           BoltzmannMachine |                    ---- |
        |         ExtraTrees |       CalibratedClassifier |                    ---- |
        |                SVM |                   Adaboost |                    ---- |
        |          LinearSVM |                    Bagging |                    ---- |
        |              NuSVM |           GradientBoosting |                    ---- |
        |    NearestNeighbor |        BernoulliNaiveBayes |                    ---- |
        |      NeuralNetwork |      CategoricalNaiveBayes |                    ---- |
        |         ElasticNet |       ComplementNaiveBayes |                    ---- |
        |       BernoulliRBM |         GaussianNaiveBayes |                    ---- |
        |   BoltzmannMachine |      MultinomialNaiveBayes |                    ---- |
        |           Adaboost |                       ---- |                    ---- |
        |            Bagging |                       ---- |                    ---- |
        |   GradientBoosting |                       ---- |                    ---- |
        +--------------------+----------------------------+-------------------------+

In [None]:
# Train the model

params = {
        'cmd': 'fit',    
        'data_path': "../train.csv",
        'yaml_path': 'yaml/multi.yaml'  # DecisionTree
}

Igel(**params)

In [None]:
# Use model to predict on missing values

params = {
        'cmd': 'predict',    
        'data_path': "../test.csv",
        'yaml_path': 'yaml/hyper.yaml'
}
Igel(**params)

# Predictions

1. View the raw predictions
2. Map the labels to their High Level Information Types
3. Merge the predictions back into the training set


In [None]:
def is_neg_predictions(predictions):
    predictions = predictions.sort_values(by=['postPriority'])
    predictions = predictions[(predictions > 0).all(1)]
    predictions.round()

    
predictions = pd.read_csv("model_results/predictions.csv")
predictions

#is_neg_predictions(predictions)

### Merge the new predictions back onto dataframe with the missing columns

In [None]:
# Map the labels to their High Level Information Types
cat_list = predictions.filter(regex='cat', axis=1).round().applymap(lambda x: highCategoriser[x])

#
predictions = cat_list.combine_first(predictions)

# Merge the predictions back into the training set
df = test.merge(predictions, left_index=True, right_index=True)


# Append the predicted categories to a list in a new column
df['predicted_categories'] = df[['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10']].values.tolist()


# Get the number of categories into something we can use to index
df['num'] = df['num'].astype(float).astype(int)

# Remove categories beyond what the tweet is predicted to have
df['categories'] = df.apply(lambda x: x['predicted_categories'][0:x['num']], axis=1)


df

## Export

Export in the TRECIS format

In [None]:
# write to .run file
with open("marks2.run" , "w") as out_file:
    for row in df.drop_duplicates(subset="postID").itertuples():
        #print("row:", row)
        content = [
            "TRECIS-CTIT-H-Test-0" + str(int(row.eventID)),
            "Q0",          
            np.int64(row.postID),   
            getattr(row, 'Index'),  #ToDo: Fix?
            #row.priority,
            str(priority_scorer[str(round(row.postPriority))[:3]]),  #ToDo: Fix
            row.categories,
            "marksrun2"
        ]
        out_file.write("\t".join([str(x) for x in content]) + "\n")

In [None]:
#

In [None]:
#df['tweets'] = df['tweets'].str[2:-1]
df


In [3]:
# JSONDecodeError: Expecting ',' delimiter: 
#json_response = json.loads('[' + q.text + '],')

#print(data)

In [4]:
#file_list = ["../output/run2.json"] #"../output/run.json", "../output/run0.json", 
#dfs = [] # an empty list to store the data frames

#for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
    dfs.append(json_data)
df = pd.concat(dfs, sort=False) # or sort=True depending on your needs
df




NameError: name 'pd' is not defined

In [5]:
#df = pd.read_json("../output/run.json", lines=True)
from pandas.io.json import json_normalize
df = pd.concat([pd.DataFrame(json_normalize(x)) for x in df['tweets']],ignore_index=True)
df

NameError: name 'pd' is not defined