# myrun

This notebook imports the feature vector, merges it with the labelled data, and then makes predictions on the specified columns.


In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
from functools import reduce

# Igel
from igel import Igel

# Compare Algorithms
import matplotlib.pyplot as plt
from matplotlib import rcParams

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
def clean_dataset(df):
    """Drop exported index columns and strip non-digit characters from text cells.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pd.DataFrame
        ``df`` without the columns whose name starts with ``Unnamed``, and with
        every run of non-digit characters removed from string cells. No rows
        are dropped and no dtype cast is applied.
    """
    # Columns whose label starts with "Unnamed" carry no feature data.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # The previous version also called `df.replace(np.nan, 0)` without keeping
    # the result and built an unused NaN/inf row mask with `.any(1)`, which
    # raises TypeError on pandas >= 2. Neither affected the returned frame,
    # so both were removed; NaN values are still passed through untouched.

    # Regex replacement only touches string values. Note that `\D+` also
    # strips decimal points and minus signs from those strings.
    return df.replace(r'\D+', '', regex=True)

def clean_dataset_new(df):
    """Return the rows of ``df`` that hold only finite values, cast to float64.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is left untouched (the previous version dropped
        NaN rows from the caller's frame in place).

    Returns
    -------
    pd.DataFrame
        Rows without NaN, ``inf`` or ``-inf``, with every column cast to
        ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    ValueError
        If a remaining value cannot be converted to float.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Work on a new frame so the caller's data is not mutated.
    df = df.dropna()
    # `axis` must be passed by keyword: positional `.any(1)` fails on pandas >= 2.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

def clean_dataset_int(df):
    """Build an all-numeric (float64) version of ``df`` for modelling.

    Steps, in order:

    1. drop the columns whose name starts with ``Unnamed``;
    2. mark the rows that contain NaN, ``inf`` or ``-inf``;
    3. strip every run of non-digit characters from string cells;
    4. drop the marked rows and cast everything to ``np.float64``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pd.DataFrame
        The cleaned, float64 frame.

    Raises
    ------
    ValueError
        If a string cell has no digits left after stripping (it becomes ``''``)
        or otherwise cannot be converted to float.
    """
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # The previous version called `df.replace(np.nan, 0)` without keeping the
    # result, so rows with NaN have always been dropped rather than
    # zero-filled. That effective behaviour is preserved here.

    # The mask is computed before the regex clean-up, as in the original.
    # `axis` must be passed by keyword: positional `.any(1)` fails on pandas >= 2.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)

    # Regex replacement only touches string values; `\D+` also removes
    # decimal points and minus signs from those strings.
    df = df.replace(r'\D+', '', regex=True)
    return df[indices_to_keep].astype(np.float64)

In [3]:
# Set the column names
# Hand-crafted (non-word-embedding) features, columns 0-29 of the feature vector.
_BASE_FEATURE_KEYS = [
    "timestamp",                  # 0
    "tweet_id",
    "positive_sentiment",
    "negative_sentiment",
    "numb_of_mentions",
    "numb_of_media",
    "numb_of_urls",
    "numb_of_hashtags",
    "numb_of_personal_pronouns",
    "numb_of_present_tenses",
    "numb_of_past_tenses",        # 10
    # "sent_from_web",            # disabled
    "numb_of_named_entites",
    "numb_of_weird_chars",
    "numb_of_questions",
    "numb_of_emoticons",
    "numb_of_swearing_words",
    "numb_of_slang_words",
    "numb_of_intensifiers",
    "tweet_length",
    "userFollowersCount",
    "userFriendsCount",           # 20
    "user_numb_of_tweets",
    "user_list_count",
    # "tfidf_fire",               # old
    "dict_precision",             # measures against the keyword TFIDF
    "dict_recall",
    "dict_f_measure",
    "offset",
    "weighted_length",            # the overall length of the Tweet with code points weighted per the ranges
    "permillage",                 # proportion (per thousand) of the weighted length compared to the max weighted length
    "is_verified",                # 29 - last non-word-embedding feature
]

# Three columns whose meaning is not documented in this notebook.
_UNKNOWN_KEYS = ["unknown1", "unknown2", "unknown3"]

# Word-embedding columns.
# The previous hand-written list had three defects:
#   * "wEmbed80" and "wEmbed180" each appeared twice (in the slots for 90 and
#     190), producing duplicate column labels;
#   * a missing comma fused two entries into the single name
#     "wEmbed100wEmbed101";
#   * indices 50 and 150 were absent.
# The list is assigned to the frame's columns, so its length (230 names in
# total) has to match the number of columns in the loaded file, and the last
# name has to stay "wEmbed200" because a later cell deletes that column by
# name. The same gaps are therefore kept and only the names are made unique.
# NOTE(review): presumably the numbering gaps are accidental; confirm the real
# embedding width against the exported file before renumbering.
_SKIPPED_EMBED_INDICES = (50, 101, 150)
_EMBED_KEYS = ["wEmbed%d" % i for i in range(1, 201) if i not in _SKIPPED_EMBED_INDICES]

feature_vector_keys = _BASE_FEATURE_KEYS + _UNKNOWN_KEYS + _EMBED_KEYS


# Maps a rounded priority score, given as a string key, to its priority label.
# The export cell builds the key with `str(round(score))[:3]`. `round()` on a
# Python float returns an int, so scores of 1 and 0 arrive as '1' and '0'.
# Those keys were missing and raised KeyError; the float-style keys
# '1.0' / '0.0' are kept for backward compatibility.
priority_scorer = {
    '10' : 'Critical',
    '9' : 'Critical',
    '8' : 'Critical',
    '7' : 'High',
    '6' : 'High',
    '5' : 'Medium',
    '4' : 'Medium',
    '3' : 'Low',
    '2' : 'Low',
    '1' : 'Low',
    '0' : 'Low',
    '1.0' : 'Low',
    '0.0' : 'Low',
}

# Numeric score assigned to each textual priority label when the labelled
# data is converted to numbers.
priority_mapping = dict(
    Critical=10,
    High=7.5,
    Medium=5,
    Low=2.5,
    Unknown=0,
)

# Maps a (rounded) numeric category code back to its high-level information
# type label. Codes 1-25 mirror `mymap`, which encodes the category names.
# Fixed: code 3 was missing its 'Other-' prefix and code 4 repeated
# 'ContextualInformation' although `mymap` assigns 4 to 'Discussion'.
# NOTE(review): 0 is the filler for empty category slots in the labels, yet it
# maps to 'Other-Advice'; 26 is a placeholder ('hmm') with no counterpart in
# `mymap`. Both are left unchanged - confirm the intended labels.
highCategoriser = {
    0.0 : 'Other-Advice',
    1.0 : 'Other-Advice',
    2.0 : 'Report-CleanUp',
    3.0 : 'Other-ContextualInformation',
    4.0 : 'Other-Discussion',
    5.0 : 'CallToAction-Donations',
    6.0 : 'Report-EmergingThreats',
    7.0 : 'Report-Factoid',
    8.0 : 'Report-FirstPartyObservation',
    9.0 : 'Request-GoodsServices',
    10.0 : 'Report-Hashtags',
    11.0 : 'Request-InformationWanted',
    12.0 : 'Other-Irrelevant',
    13.0 : 'Report-Location',
    14.0 : 'CallToAction-MovePeople',
    15.0 : 'Report-MultimediaShare',
    16.0 : 'Report-NewSubEvent',
    17.0 : 'Report-News',
    18.0 : 'Report-Official',
    19.0 : 'Report-OriginalEvent',
    20.0 : 'Request-SearchAndRescue',
    21.0 : 'Other-Sentiment',
    22.0 : 'Report-ServiceAvailable',
    23.0 : 'Report-ThirdPartyObservation',
    24.0 : 'CallToAction-Volunteer',
    25.0 : 'Report-Weather',
    26.0 : 'hmm',
}

# Numeric identifier for each event name; used to replace the textual
# `eventID` in the labelled data.
event_int_map = {
    "guatemalaEarthquake2012": 7,
    "joplinTornado2011": 16,
}

# The 2020 events are numbered consecutively from 35 in the order listed.
event_int_map.update(
    (event_name, event_number)
    for event_number, event_name in enumerate(
        [
            "athensEarthquake2020",
            "baltimoreFlashFlood2020",
            "brooklynBlockPartyShooting2020",
            "daytonOhioShooting2020",
            "elPasoWalmartShooting2020",
            "gilroygarlicShooting2020",
            "hurricaneBarry2020",
            "indonesiaEarthquake2020",
            "keralaFloods2020",
            "myanmarFloods2020",
            "papuaNewguineaEarthquake2020",
            "siberianWildfires2020",
            "typhoonKrosa2020",
            "typhoonLekima2020",
            "whaleyBridgeCollapse2020",
        ],
        start=35,
    )
)

# Numeric code for each category name; codes run from 1 in the order listed.
mymap = {
    category_name: category_code
    for category_code, category_name in enumerate(
        [
            'Advice', 'CleanUp', 'ContextualInformation', 'Discussion', 'Donations',
            'EmergingThreats', 'Factoid', 'FirstPartyObservation', 'GoodsServices', 'Hashtags',
            'InformationWanted', 'Irrelevant', 'Location', 'MovePeople', 'MultimediaShare',
            'NewSubEvent', 'News', 'Official', 'OriginalEvent', 'SearchAndRescue',
            'Sentiment', 'ServiceAvailable', 'ThirdPartyObservation', 'Volunteer', 'Weather',
        ],
        start=1,
    )
}

## Feature Vector

Load the feature vector in from Play

In [4]:
# Currently loading the feature vector from a .txt file exported from Play.
# The file has no header row, so the columns get integer labels.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` keeps the old behaviour (skip malformed lines and
# emit a warning for each one).
feature_vector_input = pd.read_csv(
    "../../../0-data/processed/new_with_offset.txt",
    sep=",",
    header=None,
    on_bad_lines="warn",
)

# Remove superfluous "]" (the column labelled 130)
del feature_vector_input[130]

feature_vector_input

FileNotFoundError: [Errno 2] No such file or directory: '../../../0-data/processed/new_with_offset.txt'

In [None]:
# Set the column names.
# The name list is maintained by hand, so check its length up front and fail
# with a message that states both counts (pandas would also raise ValueError,
# but without saying which side is wrong).
_n_loaded_columns = feature_vector_input.shape[1]
if _n_loaded_columns != len(feature_vector_keys):
    raise ValueError(
        "feature vector has %d columns but feature_vector_keys lists %d names"
        % (_n_loaded_columns, len(feature_vector_keys))
    )
feature_vector_input.columns = feature_vector_keys



feature_vector_input#.describe()

In [None]:
# Collapse to one row per tweet (first entry of every column within each
# tweet_id group), then turn tweet_id back into an ordinary column.
feature_vector_input = (
    feature_vector_input
    .groupby(['tweet_id'])
    .agg('first')
    .reset_index(level=0)
)

# Strip non-digit characters from the text cells so the frame can be made numeric
feature_vector_input = clean_dataset(feature_vector_input)



In [None]:
# Discard the last embedding column (it carries the superfluous "]") and
# convert what remains into an all-float frame for the model.
feature_vector_input = clean_dataset_int(
    feature_vector_input.drop(columns=["wEmbed200"])
)

feature_vector_input#.describe()

## Load the labelled data

These are generated in 0_Labels.ipynb

In [None]:
# dtypes needs to be specified or long ints will change
labels_df = pd.read_json("../../../0-data/raw/data/2020/2020-A/labels/TRECIS-2018-2020A-labels.json", dtype={} )

# replace the event with a numeric value
labels_df = labels_df.replace({'eventID': event_int_map})

# Count the number of labels
labels_df['num'] = labels_df['postCategories'].str.len()

# Map the priority to a numeric value
labels_df = labels_df.replace({"postPriority": priority_mapping})

# Split categories into one column per slot (cat1, cat2, ...).
# The previous version passed exactly ten column names to the constructor,
# which raises unless the longest category list has exactly ten entries.
# The columns are now named after the actual width and then padded so that
# cat1..cat10 always exist; any wider slots are kept after them.
category_list = pd.DataFrame(labels_df["postCategories"].to_list())
category_list.columns = ["cat%d" % (slot + 1) for slot in range(category_list.shape[1])]
_expected_cat_columns = ["cat%d" % slot for slot in range(1, 11)]
_extra_cat_columns = [col for col in category_list.columns if col not in _expected_cat_columns]
category_list = category_list.reindex(columns=_expected_cat_columns + _extra_cat_columns)

# Map the categories to numeric values; anything not in `mymap` (including
# empty slots) is left as it is. Done column by column because
# `DataFrame.applymap` is deprecated in recent pandas.
category_list = category_list.apply(lambda col: col.map(lambda s: mymap.get(s, s)))

# Join back onto our original list, then drop the string categories and the
# descriptive event columns
labels = labels_df.join(category_list).drop(
    columns=['postCategories', 'eventName', 'eventDescription', 'eventType']
)

# Fill the empty slots with the string "0" (converted to 0.0 by
# clean_dataset_int below; the exported CSV is written before that conversion)
labels = labels.fillna("0")

# Export
labels.to_csv("../labels.csv", index=False)

labels = clean_dataset_int(labels)

labels

# Train.csv


Merges the input feature vector with the labels

In [None]:
# Keep only the tweets present in both frames: a label row matches a feature
# row when its postID equals the feature vector's tweet_id.
train = labels.merge(
    feature_vector_input,
    how='inner',
    left_on='postID',
    right_on='tweet_id',
)

train.to_csv("../train.csv", index=False)

train

# Test.csv


Drops the categories, number of categories and priority so we can make our prediction

In [None]:
# Build `test` as a new frame without the prediction targets.
# The previous version did `test = train` and then dropped columns in place,
# which also removed them from `train` because both names pointed at the same
# frame.
# NOTE(review): cells that read `train` from memory after this one now see
# the full frame, label columns included.
_target_columns = list(train.filter(regex = 'cat')) + ['postPriority', 'num']
test = train.drop(columns = _target_columns)

# export
test.to_csv("../test.csv", index=False)

test

## Igel

This notebook uses `Igel` to fit, evaluate and predict with the models configured in the YAML files.

> Igel supports all of sklearn's machine learning functionality.

Igel's supported models:

        +--------------------+----------------------------+-------------------------+
        |      regression    |        classification      |        clustering       |
        +--------------------+----------------------------+-------------------------+
        |   LinearRegression |         LogisticRegression |                  KMeans |
        |              Lasso |                      Ridge |     AffinityPropagation |
        |          LassoLars |               DecisionTree |                   Birch |
        | BayesianRegression |                  ExtraTree | AgglomerativeClustering |
        |    HuberRegression |               RandomForest |    FeatureAgglomeration |
        |              Ridge |                 ExtraTrees |                  DBSCAN |
        |  PoissonRegression |                        SVM |         MiniBatchKMeans |
        |      ARDRegression |                  LinearSVM |    SpectralBiclustering |
        |  TweedieRegression |                      NuSVM |    SpectralCoclustering |
        | TheilSenRegression |            NearestNeighbor |      SpectralClustering |
        |    GammaRegression |              NeuralNetwork |               MeanShift |
        |   RANSACRegression | PassiveAgressiveClassifier |                  OPTICS |
        |       DecisionTree |                 Perceptron |                    ---- |
        |          ExtraTree |               BernoulliRBM |                    ---- |
        |       RandomForest |           BoltzmannMachine |                    ---- |
        |         ExtraTrees |       CalibratedClassifier |                    ---- |
        |                SVM |                   Adaboost |                    ---- |
        |          LinearSVM |                    Bagging |                    ---- |
        |              NuSVM |           GradientBoosting |                    ---- |
        |    NearestNeighbor |        BernoulliNaiveBayes |                    ---- |
        |      NeuralNetwork |      CategoricalNaiveBayes |                    ---- |
        |         ElasticNet |       ComplementNaiveBayes |                    ---- |
        |       BernoulliRBM |         GaussianNaiveBayes |                    ---- |
        |   BoltzmannMachine |      MultinomialNaiveBayes |                    ---- |
        |           Adaboost |                       ---- |                    ---- |
        |            Bagging |                       ---- |                    ---- |
        |   GradientBoosting |                       ---- |                    ---- |
        +--------------------+----------------------------+-------------------------+

### Fit



In [None]:
# Train the model described in the YAML file on the merged training data.
# NOTE(review): this cell uses yaml/multi.yaml while the evaluate and predict
# cells use yaml/hyper.yaml - confirm that is intended.
params = dict(
    cmd='fit',
    data_path="../train.csv",
    yaml_path='yaml/multi.yaml',  # DecisionTree
)

Igel(**params)

### Evaluate



In [None]:
# Evaluate the fitted model on the training data.
params = dict(
    cmd='evaluate',
    data_path="../train.csv",
    yaml_path='yaml/hyper.yaml',
)
Igel(**params)

### Predict



In [None]:
# Predict the dropped target columns for the rows in test.csv.
params = dict(
    cmd='predict',
    data_path="../test.csv",
    yaml_path='yaml/hyper.yaml',
)
Igel(**params)

## View the predictions

In [None]:
# Read back the predictions file from the model_results folder.
predictions_path = "model_results/predictions.csv"
predictions = pd.read_csv(predictions_path)
predictions

def is_neg_predictions(predictions):
    """Return the all-positive prediction rows, sorted by priority and rounded.

    The previous version computed this frame but discarded it (the function
    always returned ``None``), and used positional ``.all(1)``, which raises
    TypeError on pandas >= 2.

    Parameters
    ----------
    predictions : pd.DataFrame
        Numeric predictions; must contain a ``postPriority`` column.
        It is not modified.

    Returns
    -------
    pd.DataFrame
        Rows in which every value is strictly greater than 0, ordered by
        ascending ``postPriority`` and rounded to whole numbers.
    """
    predictions = predictions.sort_values(by=['postPriority'])
    predictions = predictions[(predictions > 0).all(axis=1)]
    return predictions.round()
    
#is_neg_predictions(predictions)

### Merge the new predictions back onto dataframe with the missing columns

In [None]:
# Map the predicted category codes to their High Level Information Types.
# Done column by column because `DataFrame.applymap` is deprecated in recent
# pandas. As before, a rounded value that is not a key of `highCategoriser`
# (including NaN) raises KeyError.
# NOTE: this cell overwrites `predictions`, so it cannot be re-run without
# re-loading the predictions first.
cat_list = (
    predictions
    .filter(regex='cat', axis=1)
    .round()
    .apply(lambda col: col.map(lambda x: highCategoriser[x]))
)
predictions = cat_list.combine_first(predictions)

# Attach the predictions to the test rows (matched on the row index)
df = test.merge(predictions, left_index=True, right_index=True)
df.describe()

In [None]:
# Gather the ten predicted category slots of every row into one list column
category_columns = ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10']
df['predicted_categories'] = df[category_columns].values.tolist()

# The predicted number of categories is used as a slice bound, so it has to
# be an integer (the cast truncates towards zero)
df['num'] = df['num'].astype(float).astype(int)


def _truncate_to_predicted_count(row):
    """Keep only the first `num` predicted categories of a row."""
    return row['predicted_categories'][0:row['num']]


# Remove categories beyond what the tweet is predicted to have
df['categories'] = df.apply(_truncate_to_predicted_count, axis=1)

df


## Export

Export in the TRECIS format

In [None]:
#testy = pd.read_csv("../3-csv/testy.csv")
#testy

In [None]:
# write to .run file (one tab-separated line per unique tweet)
with open("marks2.run" , "w") as out_file:
    for row in df.drop_duplicates(subset="tweet_id").itertuples():
        # Resolve the priority label. `round()` on a float yields an int, so
        # the key looks like '7', never '7.0'. The old lookup
        # `str(round(p))[:3]` therefore could not reach the '1.0' / '0.0'
        # entries and raised KeyError for scores of 0 or 1, and for
        # predictions outside 0-10. The score is now clamped to 0-10 and both
        # key spellings are tried.
        score = min(max(int(round(float(row.postPriority))), 0), 10)
        priority_label = priority_scorer.get(str(score))
        if priority_label is None:
            priority_label = priority_scorer["%.1f" % score]

        content = [
            "TRECIS-CTIT-H-Test-0" + str(int(row.eventID)),
            "Q0",
            np.int64(row.tweet_id),
            row.Index,  #ToDo: Fix? (currently the frame's row index)
            str(priority_label),
            row.categories,
            "marksrun2"
        ]
        out_file.write("\t".join([str(x) for x in content]) + "\n")

In [None]:
# Default size (width, height) for the figures drawn below
rcParams['figure.figsize'] = (20, 10)

In [None]:
# Scratch check of Python's built-in rounding: halves are rounded to the
# nearest even integer, so this evaluates to 2 (not 3).
round(2.5)

In [None]:
# Algorithm Comparison Boxplot

# load dataset
# NOTE(review): features and target are picked by column position (first
# eight columns as X, the ninth as Y) - confirm these are the intended
# columns of `train`.
array = train.values
X = array[:,0:8]
Y = array[:,8]

# prepare configuration for cross validation test harness
seed = 7

# prepare models
models = [
    ('LogisticRegression', LogisticRegression()),
    ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('GaussianNB', GaussianNB()),
    ('SVC', SVC()),
]

# `random_state` only has an effect when shuffling is enabled, and recent
# scikit-learn raises ValueError if it is set while shuffle is False.
# The splitter is seeded, so every model is scored on the same folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison
fig = plt.figure()
ax = fig.add_subplot(111)
sns.boxplot(data=results, palette="Set3", ax=ax)
ax.set_xticklabels(names)
ax.set_title('Comparison of Model by Classification Metric')

# Save before showing: once `plt.show()` has run, `plt.savefig` would create
# and save a new, empty figure.
fig.savefig('../../../0-data/screenshots/benchmark_models_performance.png',dpi=300)
plt.show()



In [None]:
# example run
# Run             & NDCG    &  AW-H     & AW-A      & CF1-H   & CF1-A   & CAcc   & PErr-H & PErr-A \\
# njit-sub01.text & 0.4632  & -0.4801   & -0.2493   & 0.0792  & 0.1582  & 0.9025 & 0.1524 & 0.2198 \\