In [None]:
#!pip3 install missing packages

import numpy as np
import re
import requests
import pandas as pd
import seaborn as sns
from functools import reduce

from igel import Igel

import matplotlib.pyplot as plt
from matplotlib import rcParams

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Helper Functions

In [None]:
def clean_dataset(df):
    #assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    #df.dropna(inplace=True)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df.replace(np.nan,0)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    df = df.replace(r'\D+', '', regex=True)
    return df
    #return df[indices_to_keep].astype(np.float64)
    

def clean_dataset_int(df):
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df.replace(np.nan,0)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    df = df.replace(r'\D+', '', regex=True)
    return df[indices_to_keep].astype(np.float64)

## Mappings

Defining various mappings we'll use when manipulating the data.

In [None]:
priority_scorer = {
    '10' : 'Critical',
    '9' : 'Critical',
    '8' : 'Critical',
    '7' : 'High',
    '6' : 'High',
    '5' : 'Medium',
    '4' : 'Medium',
    '3' : 'Low',
    '2' : 'Low',
    '1.0' : 'Low',
    '0.0' : 'Low',
}

priority_mapping = {
    "Critical" : 10,
    "High" : 7.5,
    "Medium" : 5,
    "Low" : 2.5,
    "Unknown" : 0,
}

# What we consider to be highly important categories of information
highCategoriser = {
    0.0 : 'Other-Advice',
    1.0 : 'Other-Advice',
    2.0 : 'Report-CleanUp',
    3.0 : 'ContextualInformation',
    4.0 : 'Other-ContextualInformation',
    5.0 : 'CallToAction-Donations',
    6.0 : 'Report-EmergingThreats',
    7.0 : 'Report-Factoid',
    8.0 : 'Report-FirstPartyObservation',
    9.0 : 'Request-GoodsServices',
    10.0 : 'Report-Hashtags',
    11.0 : 'Request-InformationWanted',
    12.0 : 'Other-Irrelevant',
    13.0 : 'Report-Location',
    14.0 : 'CallToAction-MovePeople',
    15.0 : 'Report-MultimediaShare',
    16.0 : 'Report-NewSubEvent',
    17.0 : 'Report-News',
    18.0 : 'Report-Official',
    19.0 : 'Report-OriginalEvent',
    20.0 : 'Request-SearchAndRescue',
    21.0 : 'Other-Sentiment',
    22.0 : 'Report-ServiceAvailable',
    23.0 : 'Report-ThirdPartyObservation',
    24.0 : 'CallToAction-Volunteer',
    25.0 : 'Report-Weather',
    26.0 : 'hmm',
}

event_int_map =	{
  "guatemalaEarthquake2012": 7,
  "joplinTornado2011": 16,
  "athensEarthquake2020": 35,
  "baltimoreFlashFlood2020": 36,
  "brooklynBlockPartyShooting2020": 37,
  "daytonOhioShooting2020": 38,
  "elPasoWalmartShooting2020": 39,
  "gilroygarlicShooting2020": 40,
  "hurricaneBarry2020": 41,
  "indonesiaEarthquake2020": 42,
  "keralaFloods2020": 43,
  "myanmarFloods2020": 44,
  "papuaNewguineaEarthquake2020": 45,
  "siberianWildfires2020": 46,
  "typhoonKrosa2020": 47,
  "typhoonLekima2020": 48,
  "whaleyBridgeCollapse2020": 49
}

mymap = {'Advice':1, 'CleanUp':2, 'ContextualInformation':3, 'Discussion':4, 'Donations':5, 
        'EmergingThreats':6, 'Factoid':7, 'FirstPartyObservation':8, 'GoodsServices':9, 'Hashtags':10, 
        'InformationWanted':11,'Irrelevant':12, 'Location':13, 'MovePeople':14, 
         'MultimediaShare':15, 'NewSubEvent':16, 'News':17,
        'Official':18, 'OriginalEvent':19, 'SearchAndRescue':20, 'Sentiment':21, 'ServiceAvailable':22, 
         'ThirdPartyObservation':23,'Volunteer':24, 'Weather':25}

## FeatureAPI

Load the feature vector in from Play

In [None]:
#url = 'http://localhost:9000/stored_tweets'
import json
def parse_json_stream(stream):
    decoder = json.JSONDecoder()
    while stream:
        obj, idx = decoder.raw_decode(stream)
        yield obj
        stream = stream[idx:].lstrip()

#url = 'http://tweetminer-2336003gproject.ida.dcs.gla.ac.uk/stored_tweets'

#data = requests.get(url)#.json()

#result = json.loads(data.content.decode('utf-8'))

#q = requests.get(url)

#json_response = json.loads(q.text.replace('�', ''))



In [None]:
# JSONDecodeError: Expecting ',' delimiter: 
#json_response = json.loads('[' + q.text + '],')

#print(data)

In [83]:
file_list = ["../output/run.json","../output/run0.json"] #"../output/run.json", 
dfs = [] # an empty list to store the data frames

for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
    dfs.append(json_data)
df = pd.concat(dfs, sort=False) # or sort=True depending on your needs
df

Unnamed: 0,fileName,tweets
0,data/tweets/run/brook_selected.jsonl,"[{'tweet_id': 1155531927056551936, 'tweet_text..."
1,data/tweets/run/el.jsonl,"[{'tweet_id': 1157817871835324416, 'tweet_text..."
2,data/tweets/run/gil.jsonl,"[{'tweet_id': 1155854599221329920, 'tweet_text..."
0,data/tweets/run0/indo.jsonl,"[{'tweet_id': 1157394630561808384, 'tweet_text..."
1,data/tweets/run0/indo.jsonl,"[{'tweet_id': 1157394630561808384, 'tweet_text..."
2,data/tweets/run0/barry.jsonl,"[{'tweet_id': 1149871010231070720, 'tweet_text..."
3,data/tweets/run0/barry.jsonl,"[{'tweet_id': 1149871010231070720, 'tweet_text..."
4,data/tweets/run0/dayton_selected.jsonl,"[{'tweet_id': 1158114685738266627, 'tweet_text..."
5,data/tweets/run0/dayton_selected.jsonl,"[{'tweet_id': 1158114685738266627, 'tweet_text..."
6,data/tweets/run0/baltimore_selected.jsonl,"[{'tweet_id': 1157961073921810432, 'tweet_text..."


In [84]:
#df = pd.read_json("../output/run.json", lines=True)
from pandas.io.json import json_normalize
new_df = pd.concat([pd.DataFrame(json_normalize(x)) for x in df['tweets']],ignore_index=True)
new_df

Unnamed: 0,tweet_id,tweet_text,offset,sentiment,embeddings,features.numb_of_urls,features.numb_of_hashtags,features.numb_of_personal_pronouns,features.numb_of_present_tenses,features.weighted_length,features.permillage,features.tweet_created_at,features.tweet_id_str,features.positive_sentiment,features.negative_sentiment,features.numb_of_mentions,features.numb_of_media,features.numb_of_past_tenses,features.numb_of_weird_chars,features.numb_of_questions,features.numb_of_emoticons,features.numb_of_swearing_word,features.numb_of_slang_words,features.numb_of_intensifiers,features.tweet_length,features.userFollowersCount,features.userFriendsCount,features.user_numb_of_tweets,features.user_list_count,features.dict_precision,features.dict_recall,features.dict_f_measure,features.offset,features.is_verified,tfIdf.brooklyn,tfIdf.shooting,tfIdf.everybodys,tfIdf.social,tfIdf.ahead,tfIdf.brownsville,tfIdf.claimed,tfIdf.media,tfIdf.community,tfIdf.claiming,tfIdf.gunfire,tfIdf.amazing,tfIdf.celebration,tfIdf.crazy,tfIdf.days,tfIdf.course,tfIdf.kids,tfIdf.deadly,tfIdf.southall,tfIdf.ashley,tfIdf.nyt,tfIdf.block,tfIdf.beloved,tfIdf.party,tfIdf.marred,tfIdf.gun,tfIdf.tragedy,tfIdf.low,tfIdf.suffering,tfIdf.forward,tfIdf.nyc,tfIdf.crime,tfIdf.event,tfIdf.yearly,tfIdf.constituents,tfIdf.communities,tfIdf.violence,tfIdf.wounded,tfIdf.left,tfIdf.dead,tfIdf.injured,tfIdf.jerusalem,tfIdf.leaves,tfIdf.nypd,tfIdf.park,tfIdf.shattered,tfIdf.mayor,tfIdf.york,tfIdf.blasio,tfIdf.neighborhood,tfIdf.peaceful,tfIdf.ideal,tfIdf.shooters,tfIdf.water,tfIdf.shot,tfIdf.throwing,tfIdf.police,tfIdf.wonder,tfIdf.officers,tfIdf.provide,tfIdf.person,tfIdf.officials,tfIdf.updates,tfIdf.vows,tfIdf.justice,tfIdf.killed,tfIdf.commissioner,tfIdf.gunman,tfIdf.mass,tfIdf.held,tfIdf.city,tfIdf.scene,tfIdf.attack,tfIdf.rushed,tfIdf.playground,tfIdf.paramedics,tfIdf.forget,tfIdf.night,tfIdf.talking,tfIdf.worse,tfIdf.crowded,tfIdf.seriously,tfIdf.cover,tfIdf.running,tfIdf.outdoor,tfIdf.late,tfIdf.hundreds,tfIdf.thousands,tfIdf.hurt,tfIdf.normal,tfIdf.multiple,tfIdf.stoptheviolence,tfIdf.prayers,tfIdf.damn,tfIdf.minutes,tfIdf.drove,tfIdf.spot,tfIdf.canarsie,tfIdf.happened,tfIdf.news,tfIdf.gunmen,tfIdf.nypds,tfIdf.oneill,tfIdf.todau,tfIdf.catch,tfIdf.didn,tfIdf.wasn,tfIdf.wonderful,tfIdf.talk,tfIdf.leadership,tfIdf.safe,tfIdf.ny,tfIdf.conditions,tfIdf.adverse,tfIdf.hey,tfIdf.oldtimers,tfIdf.week,tfIdf.st,tfIdf.bed,tfIdf.ave,tfIdf.stuy,tfIdf.report,tfIdf.howard,tfIdf.marked,tfIdf.pieces,tfIdf.evidence,tfIdf.condemning,tfIdf.morning,tfIdf.columbus,tfIdf.loose,tfIdf.hospitalized,tfIdf.head,tfIdf.single,tfIdf.timers,tfIdf.wound,tfIdf.erupted,tfIdf.died,tfIdf.bullet,tfIdf.fox,tfIdf.god,tfIdf.tw,tfIdf.amp,tfIdf.htmlsocsrc,tfIdf.soctrk,tfIdf.lives,tfIdf.shooter,tfIdf.devastated,tfIdf.interrupted,tfIdf.family,tfIdf.residence,tfIdf.arrests,tfIdf.update,tfIdf.fighting,tfIdf.gunwoman,tfIdf.stupid,tfIdf.lies,tfIdf.danger,tfIdf.breaking,tfIdf.trumps,tfIdf.fault,tfIdf.maga,tfIdf.president,tfIdf.guess,tfIdf.playing,tfIdf.killing,tfIdf.gathered,tfIdf.injuring,tfIdf.firing,tfIdf.annual,tfIdf.set,tfIdf.weekend,tfIdf.scendsei,tfIdf.snuck,tfIdf.tremendous,tfIdf.summer,tfIdf.believes,tfIdf.peace,tfIdf.silver,tfIdf.blessing,tfIdf.critical,tfIdf.festival,tfIdf.condition,tfIdf.leaving,tfIdf.close,tfIdf.coming,tfIdf.sadly,tfIdf.saturday,tfIdf.represents,tfIdf.democrat,tfIdf.district,tfIdf.bra,tfIdf.cue,tfIdf.music,tfIdf.woman,tfIdf.true,tfIdf.theme,tfIdf.ass,tfIdf.play,tfIdf.assault,tfIdf.period,tfIdf.medical,tfIdf.ceases,tfIdf.colors,tfIdf.america,tfIdf.disappoint,tfIdf.son,tfIdf.children,tfIdf.future,tfIdf.ground,tfIdf.charged,tfIdf.fear,tfIdf.narrative,tfIdf.interrupt,tfIdf.standard,tfIdf.london,tfIdf.evening,...,tfIdf.bus,tfIdf.scare,tfIdf.gleybers,tfIdf.translation,tfIdf.awaits,tfIdf.tonights,tfIdf.raven,tfIdf.sunset,tfIdf.loch,tfIdf.reservoir,tfIdf.filled,tfIdf.bills,tfIdf.ransomware,tfIdf.funding,tfIdf.federal,tfIdf.soared,tfIdf.sank,tfIdf.outchea,tfIdf.leoseason,tfIdf.tufff,tfIdf.echoing,tfIdf.chrome,tfIdf.fordtaurussho,tfIdf.rgfxcustoms,tfIdf.ford,tfIdf.delete,tfIdf.mchromedelete,tfIdf.stealthmode,tfIdf.taurus,tfIdf.glossblack,tfIdf.sho,tfIdf.blackedout,tfIdf.chromedelete,tfIdf.ingrates,tfIdf.phonys,tfIdf.geeking,tfIdf.rapid,tfIdf.oakland,tfIdf.yucaipa,tfIdf.wellston,tfIdf.missouri,tfIdf.outweigh,tfIdf.timbuktu,tfIdf.ravensflock,tfIdf.ravensnation,tfIdf.representing,tfIdf.edreed,tfIdf.wbal,tfIdf.organizes,tfIdf.delia,tfIdf.foley,tfIdf.looneys,tfIdf.kisslings,tfIdf.smalltimore,tfIdf.dog,tfIdf.corruption,tfIdf.richmond,tfIdf.hampton,tfIdf.michigan,tfIdf.saginaw,tfIdf.lg,tfIdf.mant,tfIdf.homicides,tfIdf.ryan,tfIdf.lemon,tfIdf.bette,tfIdf.midler,tfIdf.athletes,tfIdf.climate,tfIdf.schiff,tfIdf.squad,tfIdf.mccain,tfIdf.cummins,tfIdf.defending,tfIdf.weeks,tfIdf.dumpster,tfIdf.unaccounted,tfIdf.greatawakening,tfIdf.methods,tfIdf.thread,tfIdf.falseflag,tfIdf.bastards,tfIdf.flush,tfIdf.carolina,tfIdf.pennsylvania,tfIdf.manson,tfIdf.georgia,tfIdf.villar,tfIdf.cycle,tfIdf.woodbourne,tfIdf.mccabe,tfIdf.baltimorecity,tfIdf.lordzofwar,tfIdf.blakluck,tfIdf.trumovement,tfIdf.eastsyde,tfIdf.psa,tfIdf.waits,tfIdf.libs,tfIdf.muniland,tfIdf.horrors,tfIdf.comparatively,tfIdf.gronk,tfIdf.aaron,tfIdf.combo,tfIdf.hernandez,tfIdf.orange,tfIdf.metro,tfIdf.severe,tfIdf.thunderstorm,tfIdf.conservativestakecharge,tfIdf.hel,tfIdf.crew,tfIdf.pot,tfIdf.smokes,tfIdf.addams,tfIdf.nicori,tfIdf.ensemble,tfIdf.baltimorecleanup,tfIdf.aborted,tfIdf.babies,tfIdf.unroll,tfIdf.grounds,tfIdf.stomping,tfIdf.ignores,tfIdf.decay,tfIdf.trenton,tfIdf.robbins,tfIdf.stockton,tfIdf.jersey,tfIdf.bonjour,tfIdf.attached,tfIdf.burgled,tfIdf.goads,tfIdf.pitch,tfIdf.heartland,tfIdf.resistors,tfIdf.compton,tfIdf.overton,tfIdf.rapids,tfIdf.alexandria,tfIdf.va,tfIdf.chesapeake,tfIdf.cedar,tfIdf.cascilla,tfIdf.stoled,tfIdf.meetings,tfIdf.sir,tfIdf.deadliest,tfIdf.lordy,tfIdf.partial,tfIdf.bronycon,tfIdf.approved,tfIdf.subprime,tfIdf.series,tfIdf.bleep,tfIdf.behaving,tfIdf.admits,tfIdf.carr,tfIdf.aquille,tfIdf.crimestopper,tfIdf.oldiesbutgoodies,tfIdf.slaughtering,tfIdf.helloooo,tfIdf.solo,tfIdf.helped,tfIdf.volunteers,tfIdf.maxine,tfIdf.leagues,tfIdf.rooney,tfIdf.burglarized,tfIdf.pine,tfIdf.tenn,tfIdf.arkansas,tfIdf.birmingham,tfIdf.elizabethtown,tfIdf.bluff,tfIdf.kentucky,tfIdf.shreveport,tfIdf.covington,tfIdf.outcry,tfIdf.fransisco,tfIdf.staffers,tfIdf.sociopath,tfIdf.solon,tfIdf.tennessee,tfIdf.township,tfIdf.targeting,tfIdf.thee,tfIdf.liar,tfIdf.katy,tfIdf.rockford,tfIdf.fresno,tfIdf.persecution,tfIdf.courts,tfIdf.asylum,tfIdf.granted,tfIdf.valid,tfIdf.denied,tfIdf.preview,tfIdf.mlb,tfIdf.mets,tfIdf.picks,tfIdf.twins,tfIdf.minnesota,tfIdf.rays,tfIdf.tigers,tfIdf.productive,tfIdf.hype,tfIdf.brakes,tfIdf.sincerly,tfIdf.bo,tfIdf.dudefrom,tfIdf.drastic,tfIdf.votethemout,tfIdf.nomoredemocrats,tfIdf.stronghold,tfIdf.maltreatment,tfIdf.squeegee,tfIdf.applying,tfIdf.mailed,tfIdf.skyrocket,tfIdf.lo,tfIdf.abbeville,tfIdf.sc,tfIdf.pa,tfIdf.jolla,tfIdf.cares,tfIdf.sportsgambling,tfIdf.parlaypicks,tfIdf.marlins,tfIdf.parlay,tfIdf.cubs,tfIdf.dodgers,tfIdf.brewers,tfIdf.checkmeout,tfIdf.baseballpicks,tfIdf.rl,tfIdf.bamma,tfIdf.bammas,tfIdf.goheadmoe,tfIdf.aye,tfIdf.human,tfIdf.suprmacists,tfIdf.tewwwww,tfIdf.lmao,tfIdf.themed,tfIdf.powerplantlive,tfIdf.crawls,tfIdf.shthole
0,1155531927056551936,"[brooklyn, shooting]",149344.0,"-0.12 , 0.06 , 0.15 , -0.08 , 0.03 , 0.02 , 0...","{-0.25539,-0.25723,0.13169,-0.042688,0.21817,-...",,0.0,0.0,0.0,53.0,189.0,1.564335e+12,1.155532e+18,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,20.0,3.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178559,0.377875,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1155531839873966080,"[amazing, claimed, course, brownsville, days, ...",149324.0,"0.19 , -0.02 , -0.01 , 0.19 , 0.03 , 0.25 , 0...","{-4.7590685,-4.1655083,1.6122192,0.4315409,1.2...",,0.0,0.0,0.0,280.0,1000.0,1.564335e+12,1.155532e+18,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,135.0,8.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.544849,0.437553,0.498639,0.137417,0.498639,0.360009,0.145752,0.544849,0.204858,0.437553,0.373852,0.373852,0.345133,0.437553,0.364312,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1155531461052829696,"[beloved, brooklyn, block, party, marred, dead...",149233.0,"-0.14 , -0.11 , 0.02 , 0.01 , 0.10 , -0.06 , ...","{-1.1279607,-2.6594982,1.8382361,-0.89134943,1...",,0.0,0.0,0.0,134.0,478.0,1.564335e+12,1.155531e+18,0.0,0.0,0.0,1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,82.0,120.0,155.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.035712,0.075575,,,,,,,,,,,,,,,,0.365094,0.553367,0.553367,0.568782,0.180283,0.560778,0.169576,0.638097,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1155531083477377024,"[tragedy, constituents, forward, yearly, event...",149143.0,"0.16 , 0.11 , 0.06 , -0.18 , -0.05 , 0.15 , 0...","{-2.6689384,-1.5716823,3.0461571,-1.2470062,2....",,1.0,0.0,0.0,247.0,882.0,1.564335e+12,1.155531e+18,0.0,0.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,112.0,9925.0,1604.0,0.0,215.0,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.254561,0.423474,0.531747,0.623298,0.531747,0.244691,0.369588,0.16469,0.623298,0.623298,0.450012,0.29898,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1155530972466765824,"[shooting, brooklyn, left, dead, wounded]",149117.0,"-0.01 , 0.16 , 0.20 , -0.14 , 0.03 , 0.01 , 0...","{-1.418931,-1.6138601,0.40521997,0.005207952,0...",,0.0,0.0,0.0,91.0,325.0,1.564335e+12,1.155531e+18,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,41.0,12286.0,709.0,0.0,317.0,0.0,0.0,0.0,0.0,1.0,0.071424,0.151150,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.602735,0.796615,0.210619,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8047,1157803414119538688,"[racist, calling, baltimore, shthole]",0.0,"-0.05 , 0.04 , 0.22 , -0.13 , 0.05 , 0.05 , 0...","{-0.42823195,-0.8011701,0.72685003,0.40797597,...",,0.0,0.0,0.0,68.0,242.0,1.564877e+12,1.157803e+18,0.0,0.0,2.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,37.0,70.0,140.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.554151
8048,1158461150536880131,"[help, baltimore]",156817.0,"-0.15 , -0.00 , 0.06 , -0.03 , 0.01 , 0.10 , ...","{-0.25539,-0.25723,0.13169,-0.042688,0.21817,-...",,0.0,0.0,0.0,71.0,253.0,1.565034e+12,1.158461e+18,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,17.0,6622.0,7275.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8049,1158506469085474816,"[posted, photo, baltimore, county, maryland]",167621.0,"-0.12 , -0.02 , 0.12 , -0.07 , 0.09 , -0.02 ,...","{-2.233442,-0.67434,0.31131,-0.570872,1.288870...",,0.0,0.0,0.0,72.0,257.0,1.565044e+12,1.158506e+18,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,44.0,384.0,599.0,0.0,80.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8050,1158224772527996941,"[cummings, money, baltimore]",100460.0,"-0.11 , 0.04 , 0.19 , -0.09 , 0.05 , 0.03 , 0...","{-0.89948,-0.81822,0.41608998,0.08451401,0.454...",,0.0,0.0,0.0,83.0,296.0,1.564977e+12,1.158225e+18,0.0,0.0,1.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,28.0,2945.0,3309.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
#df = pd.DataFrame(new_df)
#df = pd.json_normalize(df['tweets'])

# Normalise the sentiment

In [None]:
df['sentiment'] = df[df['sentiment'].apply(lambda x:pd.Series(x))]
df_sentiment=pd.concat([df['sentiment'].str.split(',', expand=True)], axis=1, keys="s")
# df_sentiment

In [None]:
# Join back onto the original dataframe
df.join(df_sentiment)

# Normalise the embeddings

In [None]:
df['embeddings'] = df['embeddings'].str.extract('(\d+)', expand=False)
df['embeddings'] = df['embeddings'].str[1:]
#df['embeddings']
df_embeddings = pd.concat([df['embeddings'].str.split(',', expand=True)], axis=1, keys="e")
df_embeddings

In [None]:
# Join back onto the original dataframe
df.join(df_embeddings)

In [None]:
df_emb_sent=df_sentiment.join(df_embeddings)
df_emb_sent
df.join(df_emb_sent)

# Export
df.to_csv("temp/api_input.csv", index=False)

api_input = pd.read_csv("temp/api_input.csv")
api_input

In [None]:
# Drop the string categories
api_input.drop(['tweet_text'], axis = 1, inplace = True)
api_input.drop(['embeddings'], axis = 1, inplace = True)
api_input.drop(['sentiment'], axis = 1, inplace = True)

api_input.describe()

In [None]:
# Group by tweet
feature_vector_input = api_input.groupby(['tweet_id']).agg('first')
feature_vector_input

## Load the labelled data

These are generated in 0_Labels.ipynb

In [None]:
# dtypes needs to be specified or long ints will change
labels_df = pd.read_json("../../../data/input/raw/data/2020/2020-A/labels/TRECIS-2018-2020A-labels.json", dtype={} )

# replace the event with a numeric value
labels_df = labels_df.replace({'eventID': event_int_map})

# Count the number of labels
labels_df['num'] = labels_df['postCategories'].str.len()

# Map the priority to a numeric value
labels_df = labels_df.replace({"postPriority": priority_mapping})

# Split categories and map to numeric values
category_list = pd.DataFrame(labels_df["postCategories"].to_list(), columns=['cat1', 'cat2', 'cat3',
                                                                   'cat4', 'cat5', 'cat6',
                                                                   'cat7', 'cat8', 'cat9', 'cat10'])


# Map the categories to numeric values
category_list = category_list.applymap(lambda s: mymap.get(s) if s in mymap else s)


# Join back onto our original list
labels = labels_df.join(category_list)

# Drop the string categories
labels.drop(['postCategories'], axis = 1, inplace = True)

# Tidy
labels = labels.drop(['eventName', 'eventDescription', 'eventType'], axis=1)

# Fill the NaN slots with 0
labels = labels.fillna("0")

# Export
labels.to_csv("../labels.csv", index=False)
labels = clean_dataset_int(labels)
#labels = clean_dataset_new(labels)

labels

# Train.csv


We merge the feature vector with the annotated data, this is used to train the classifier.

In [None]:
# Merges the input feature vector with the labels
train = pd.merge(labels, feature_vector_input, left_on = 'postID', right_on = 'tweet_id', how = 'inner')

# reset the index
train = train.reset_index()

# fill NaN and replace Infinity
train.fillna(0, inplace=True)
train.replace([np.inf, -np.inf], np.nan, inplace=True)
train = train.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

# export to csv
train.to_csv("../train.csv", index=False)

train

# Test.csv


Drops the categories, number of categories and priority so we can make our prediction

In [None]:
# Take a copy of the train dataframe
test = train

# Drop cat*
test.drop(list(test.filter(regex = 'cat\d+$')), axis = 1, inplace = True)

# Drop priority / num (of labels)
test.drop(['postPriority', 'num'], axis = 1, inplace = True)

# export
test.to_csv("../test.csv", index=False)

test

# Igel

This notebook implements `Igel` and supports all sklearn's machine learning functionality.

Caution must be taken to avoid overfitting. See `docs/ml.md` for more information

Igel's supported models:

        +--------------------+----------------------------+-------------------------+
        |      regression    |        classification      |        clustering       |
        +--------------------+----------------------------+-------------------------+
        |   LinearRegression |         LogisticRegression |                  KMeans |
        |              Lasso |                      Ridge |     AffinityPropagation |
        |          LassoLars |               DecisionTree |                   Birch |
        | BayesianRegression |                  ExtraTree | AgglomerativeClustering |
        |    HuberRegression |               RandomForest |    FeatureAgglomeration |
        |              Ridge |                 ExtraTrees |                  DBSCAN |
        |  PoissonRegression |                        SVM |         MiniBatchKMeans |
        |      ARDRegression |                  LinearSVM |    SpectralBiclustering |
        |  TweedieRegression |                      NuSVM |    SpectralCoclustering |
        | TheilSenRegression |            NearestNeighbor |      SpectralClustering |
        |    GammaRegression |              NeuralNetwork |               MeanShift |
        |   RANSACRegression | PassiveAgressiveClassifier |                  OPTICS |
        |       DecisionTree |                 Perceptron |                    ---- |
        |          ExtraTree |               BernoulliRBM |                    ---- |
        |       RandomForest |           BoltzmannMachine |                    ---- |
        |         ExtraTrees |       CalibratedClassifier |                    ---- |
        |                SVM |                   Adaboost |                    ---- |
        |          LinearSVM |                    Bagging |                    ---- |
        |              NuSVM |           GradientBoosting |                    ---- |
        |    NearestNeighbor |        BernoulliNaiveBayes |                    ---- |
        |      NeuralNetwork |      CategoricalNaiveBayes |                    ---- |
        |         ElasticNet |       ComplementNaiveBayes |                    ---- |
        |       BernoulliRBM |         GaussianNaiveBayes |                    ---- |
        |   BoltzmannMachine |      MultinomialNaiveBayes |                    ---- |
        |           Adaboost |                       ---- |                    ---- |
        |            Bagging |                       ---- |                    ---- |
        |   GradientBoosting |                       ---- |                    ---- |
        +--------------------+----------------------------+-------------------------+

In [None]:
# Train the model

params = {
        'cmd': 'fit',    
        'data_path': "../train.csv",
        'yaml_path': 'yaml/multi.yaml'  # DecisionTree
}

Igel(**params)

In [None]:
# Use model to predict on missing values

params = {
        'cmd': 'predict',    
        'data_path': "../test.csv",
        'yaml_path': 'yaml/hyper.yaml'
}
Igel(**params)

# Predictions

1. View the raw predictions
2. Map the labels to their High Level Information Types
3. Merge the predictions back into the training set


In [None]:
def is_neg_predictions(predictions):
    predictions = predictions.sort_values(by=['postPriority'])
    predictions = predictions[(predictions > 0).all(1)]
    predictions.round()

    
predictions = pd.read_csv("model_results/predictions.csv")
predictions

#is_neg_predictions(predictions)

### Merge the new predictions back onto dataframe with the missing columns

In [None]:
# Map the labels to their High Level Information Types
cat_list = predictions.filter(regex='cat', axis=1).round().applymap(lambda x: highCategoriser[x])

#
predictions = cat_list.combine_first(predictions)

# Merge the predictions back into the training set
df = test.merge(predictions, left_index=True, right_index=True)


# Append the predicted categories to a list in a new column
df['predicted_categories'] = df[['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10']].values.tolist()


# Get the number of categories into something we can use to index
df['num'] = df['num'].astype(float).astype(int)

# Remove categories beyond what the tweet is predicted to have
df['categories'] = df.apply(lambda x: x['predicted_categories'][0:x['num']], axis=1)


df

## Export

Export in the TRECIS format

In [None]:
# write to .run file
with open("marks2.run" , "w") as out_file:
    for row in df.drop_duplicates(subset="postID").itertuples():
        #print("row:", row)
        content = [
            "TRECIS-CTIT-H-Test-0" + str(int(row.eventID)),
            "Q0",          
            np.int64(row.postID),   
            getattr(row, 'Index'),  #ToDo: Fix?
            #row.priority,
            str(priority_scorer[str(round(row.postPriority))[:3]]),  #ToDo: Fix
            row.categories,
            "marksrun2"
        ]
        out_file.write("\t".join([str(x) for x in content]) + "\n")

In [None]:
#

In [78]:
#df['tweets'] = df['tweets'].str[2:-1]
df


Unnamed: 0,fileName,tweets
0,data/tweets/run/brook_selected.jsonl,"[{'tweet_id': 1155531927056551936, 'tweet_text..."
1,data/tweets/run/el.jsonl,"[{'tweet_id': 1157817871835324416, 'tweet_text..."
2,data/tweets/run/gil.jsonl,"[{'tweet_id': 1155854599221329920, 'tweet_text..."
0,,"[{'tweet_id': 1157394630561808384, 'tweet_text..."
1,,"[{'tweet_id': 1157394630561808384, 'tweet_text..."
2,,"[{'tweet_id': 1149871010231070720, 'tweet_text..."
3,,"[{'tweet_id': 1149871010231070720, 'tweet_text..."
4,,"[{'tweet_id': 1158114685738266627, 'tweet_text..."
5,,"[{'tweet_id': 1158114685738266627, 'tweet_text..."
6,,"[{'tweet_id': 1157961073921810432, 'tweet_text..."
