# myrun

This notebook imports the feature vector, merges it with the labelled data, and then makes predictions for the specified columns.


In [25]:
import pandas as pd
import re
import numpy as np
from functools import reduce
import pandas as pd
import seaborn as sns
from igel import Igel

In [26]:
def clean_dataset_int(df):
    """Return a float64-only copy of ``df`` with unusable rows removed.

    Processing steps:
      1. Drop every column whose name starts with ``Unnamed``.
      2. Flag the rows that hold NaN, +inf or -inf in any remaining column.
      3. Strip every run of non-digit characters from string cells.
      4. Keep only the unflagged rows and cast all columns to ``float64``.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame in which every column has dtype ``float64``.

    Raises:
        ValueError: May be raised by the final cast if a kept string cell
            contains no digits at all (it becomes an empty string).
    """
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # The row mask is computed before the regex clean-up, so it reflects the
    # raw values. `axis` is passed by keyword: pandas 2.0+ no longer accepts
    # it positionally.
    rows_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)

    # `\D+` removes *all* non-digit characters from string cells, which
    # includes decimal points and minus signs.
    df = df.replace(r'\D+', '', regex=True)

    # NOTE(review): every column is cast to float64, so integers above 2**53
    # (e.g. long numeric identifiers) cannot be represented exactly afterwards.
    return df.loc[rows_to_keep].astype(np.float64)

def clean_final_dataset(df):
    """Return a copy of ``df`` without the ``category`` and ``eventType`` columns.

    Columns that are not present are skipped instead of raising ``KeyError``,
    so the function also works on a merged frame that never contained them.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the two columns removed where they existed.
    """
    return df.drop(columns=['category', 'eventType'], errors='ignore')

def merge_labels(df):
    """Join the module-level ``labels`` frame with ``df`` on ``tweet_id``.

    ``labels`` is the left-hand side of the merge and is looked up as a global
    when the function is called, so it must be defined before the first call.

    Args:
        df: DataFrame with a ``tweet_id`` column.

    Returns:
        The merged DataFrame (pandas' default merge type is used).
    """
    return pd.merge(labels, df, on='tweet_id')



In [27]:
# Column names for the header-less feature-vector export, in file order.
feature_vector_keys = [
    "timestamp", "tweet_id",
    "positive_sentiment", "negative_sentiment",
    "numb_of_mentions", "numb_of_urls", "numb_of_media", "numb_of_hashtags",
    "numb_of_personal_pronouns", "numb_of_present_tenses", "numb_of_past_tenses",
    "sent_from_web", "numb_of_weird_chars", "numb_of_questions", "numb_of_emoticons",
    "numb_of_swearing_words", "numb_of_slang_words", "numb_of_intensifiers",
    "tweet_length",
    "userFollowersCount", "userFriendsCount", "user_numb_of_tweets", "user_list_count",
    "tfidf_fire",
    "dict_precision", "dict_recall", "dict_f_measure",
]


# Maps the first three characters of a stringified priority score
# (e.g. '10.' for 10.0, '6.0' for 6.0) to its priority label.
priority_scorer = {
    prefix: label
    for label, prefixes in (
        ('Critical', ('10.', '9.0')),
        ('High', ('8.0', '7.0')),
        ('Medium', ('6.0', '5.0', '4.0')),
        ('Low', ('3.0', '2.0', '1.0', '0.0')),
    )
    for prefix in prefixes
}

# Numeric category code -> information-type label.
# The codes are the float values 0.0 .. 25.0 (the list position below);
# code 0.0 maps to NaN rather than to a label.
highCategoriser = {
    float(code): label
    for code, label in enumerate([
        np.nan,
        'Other-Advice',
        'Report-CleanUp',
        'ContextualInformation',
        'Other-ContextualInformation',
        'CallToAction-Donations',
        'Report-EmergingThreats',
        'Report-Factoid',
        'Report-FirstPartyObservation',
        'Request-GoodsServices',
        'Report-Hashtags',
        'Request-InformationWanted',
        'Other-Irrelevant',
        'Report-Location',
        'CallToAction-MovePeople',
        'Report-MultimediaShare',
        'Report-NewSubEvent',
        'Report-News',
        'Report-Official',
        'Report-OriginalEvent',
        'Request-SearchAndRescue',
        'Other-Sentiment',
        'Report-ServiceAvailable',
        'Report-ThirdPartyObservation',
        'CallToAction-Volunteer',
        'Report-Weather',
    ])
}

## Feature Vector

Load the feature vector in from Play

In [28]:
# The feature vector is currently read from a header-less .txt file exported
# from Play, so the column names are attached after loading.
feature_vector_input = pd.read_csv("../../../0-data/processed/all_new.txt", sep=",", header=None)
feature_vector_input.columns = feature_vector_keys

# Collapse to one row per tweet_id (the 'first' aggregate of every column),
# turn tweet_id back into a regular column, then build the all-numeric
# version used by the model.
feature_vector_input = clean_dataset_int(
    feature_vector_input
    .groupby(['tweet_id'])
    .agg('first')
    .reset_index()
)
feature_vector_input

Unnamed: 0,tweet_id,timestamp,positive_sentiment,negative_sentiment,numb_of_mentions,numb_of_urls,numb_of_media,numb_of_hashtags,numb_of_personal_pronouns,numb_of_present_tenses,...,numb_of_intensifiers,tweet_length,userFollowersCount,userFriendsCount,user_numb_of_tweets,user_list_count,tfidf_fire,dict_precision,dict_recall,dict_f_measure
0,1.128285e+18,1.557839e+11,3.0,60.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
1,1.128286e+18,1.557839e+11,2.0,66.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,139.0,2420.0,490.0,0.0,11.0,0.0,0.0,0.0,0.0
2,1.128286e+18,1.557839e+11,3.0,60.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,220.0,232.0,841.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.128286e+18,1.557839e+11,3.0,65.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
4,1.128286e+18,1.557839e+11,3.0,65.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,114.0,3179.0,3.0,0.0,64.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9702,1.162005e+18,1.565879e+09,5.0,57.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,113.0,5543.0,169.0,0.0,200.0,0.0,0.0,0.0,0.0
9703,1.162005e+18,1.565879e+11,8.0,51.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,98.0,83.0,328.0,0.0,0.0,0.0,0.0,0.0,0.0
9704,1.162006e+18,1.565879e+11,15.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,124.0,65.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0
9705,1.162006e+18,1.565879e+10,37.0,22.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,239.0,1129.0,1049.0,0.0,174.0,0.0,0.0,0.0,0.0


## Load the labelled data

These are generated in 0_Labels.ipynb

In [29]:
# Read the label table from CSV. merge_labels() looks this frame up through
# the global name `labels`, so this cell has to run before the merge.
labels = pd.read_csv(filepath_or_buffer="../3-csv/labels.csv")
labels

Unnamed: 0,tweet_id,priority,postCategories_x,eventID,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,num
0,1128285482784366592,7.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,1128285482784366592,7.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,1128285665186197504,2.5,13,45,13.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,1128285665186197504,2.5,13,45,13.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,1128285665186197504,2.5,13,45,13.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15264,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,4
15265,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,4
15266,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,4
15267,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,4


### Merge

Merges the input feature vector with the labels

In [30]:
# Join the labels with the numeric feature vector on tweet_id.
train_char = merge_labels(df=feature_vector_input)
train_char

Unnamed: 0,tweet_id,priority,postCategories_x,eventID,cat1,cat2,cat3,cat4,cat5,cat6,...,numb_of_intensifiers,tweet_length,userFollowersCount,userFriendsCount,user_numb_of_tweets,user_list_count,tfidf_fire,dict_precision,dict_recall,dict_f_measure
0,1128285482784366592,7.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
1,1128285482784366592,7.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
2,1128285757624311808,2.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
3,1128285757624311808,2.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
4,1128286351760265216,2.5,13,45,13.0,7.0,10.0,0.0,0.0,0.0,...,0.0,235.0,13.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12765,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12766,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12767,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12768,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0


### Export our Training data

In [7]:
# Final training frame: the merged data with the unused columns removed.
train = clean_final_dataset(df=train_char)

# Igel reads its input from a .csv file, so the frame is written to disk
# (without the row index) before fitting.
train.to_csv(path_or_buf="../3-csv/train.csv", index=False)
train

Unnamed: 0,tweet_id,priority,postCategories_x,eventID,cat1,cat2,cat3,cat4,cat5,cat6,...,numb_of_intensifiers,tweet_length,userFollowersCount,userFriendsCount,user_numb_of_tweets,user_list_count,tfidf_fire,dict_precision,dict_recall,dict_f_measure
0,1128285482784366592,7.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
1,1128285482784366592,7.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
2,1128285757624311808,2.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
3,1128285757624311808,2.5,13,45,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
4,1128286351760265216,2.5,13,45,13.0,7.0,10.0,0.0,0.0,0.0,...,0.0,235.0,13.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12765,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12766,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12767,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12768,1162006062867918848,2.5,17,47,13.0,15.0,10.0,17.0,0.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0


### Export our Test Data

Remove the columns you want to predict from the dataframe

In [8]:
# NOTE: no copy is made here, so `test` and `train` are the same DataFrame
# object and the in-place drops below also remove the label columns from
# `train`. The later cell that merges predictions back via `train.merge(...)`
# appears to rely on this; train.csv is unaffected because it was already
# written. Re-running this cell fails once 'priority'/'num' are gone.
test = train

# Drop the prediction targets: every column whose name contains 'cat'
# (case-sensitive), then the priority score and the category count.
test.drop(columns=test.filter(regex='cat').columns, inplace=True)
test.drop(columns=['priority', 'num'], inplace=True)

test.to_csv(path_or_buf="../3-csv/test.csv", index=False)
test

Unnamed: 0,tweet_id,postCategories_x,eventID,timestamp,positive_sentiment,negative_sentiment,numb_of_mentions,numb_of_urls,numb_of_media,numb_of_hashtags,...,numb_of_intensifiers,tweet_length,userFollowersCount,userFriendsCount,user_numb_of_tweets,user_list_count,tfidf_fire,dict_precision,dict_recall,dict_f_measure
0,1128285482784366592,13,45,1.557839e+11,3.0,60.0,0.0,0.0,2.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
1,1128285482784366592,13,45,1.557839e+11,3.0,60.0,0.0,0.0,2.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
2,1128285757624311808,13,45,1.557839e+11,3.0,65.0,0.0,0.0,1.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
3,1128285757624311808,13,45,1.557839e+11,3.0,65.0,0.0,0.0,1.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
4,1128286351760265216,13,45,1.557839e+11,3.0,54.0,0.0,0.0,1.0,0.0,...,0.0,235.0,13.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12765,1162006062867918848,17,47,1.565879e+11,5.0,61.0,0.0,0.0,2.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12766,1162006062867918848,17,47,1.565879e+11,5.0,61.0,0.0,0.0,2.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12767,1162006062867918848,17,47,1.565879e+11,5.0,61.0,0.0,0.0,2.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0
12768,1162006062867918848,17,47,1.565879e+11,5.0,61.0,0.0,0.0,2.0,0.0,...,0.0,117.0,3368.0,181.0,0.0,1548.0,0.0,0.0,0.0,0.0


# Machine Learning

In the next cell, select the machine learning configuration (YAML file) to use for each step: fit, evaluate and predict.

In [9]:
# YAML configuration passed to Igel for each step.
# Options for ML: [cluster, hyper, iris, regres, forest, multi]
# (presumably the file names available under yaml/ - verify).
algo_fit = 'yaml/multi.yaml'                    # Fit
algo_eval = algo_predict = 'yaml/hyper.yaml'    # Evaluate and Predict

### Fit



In [10]:
# Fit step: 'cmd' selects the Igel command (fit, evaluate or predict),
# 'data_path' is the CSV to read and 'yaml_path' the model configuration.
params = dict(
    cmd='fit',
    data_path="../3-csv/train.csv",
    yaml_path=algo_fit,
)

Igel(**params)

INFO - Entered CLI args: {'cmd': 'fit', 'data_path': '../3-csv/train.csv', 'yaml_path': 'yaml/multi.yaml'}
INFO - Executing command: fit ...
INFO - reading data from ../3-csv/train.csv
INFO - You passed the configurations as a yaml file.
INFO - your chosen configuration: {'dataset': {'split': {'test_size': 0.2, 'shuffle': True, 'stratify': 'default'}}, 'model': {'type': 'regression', 'algorithm': 'RandomForest'}, 'target': ['priority', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'num']}
INFO - dataset_props: {'split': {'test_size': 0.2, 'shuffle': True, 'stratify': 'default'}} 
model_props: {'type': 'regression', 'algorithm': 'RandomForest'} 
 target: ['priority', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'num'] 

INFO - dataset shape: (12770, 41)
INFO - dataset attributes: ['tweet_id', 'priority', 'postCategories_x', 'eventID', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'n

<igel.igel.Igel at 0x7fa8616d8340>

### Evaluate



In [11]:
# Evaluate step. Note that it reads the same CSV that the fit step uses
# (train.csv), with the configuration chosen in `algo_eval`.
params = dict(
    cmd='evaluate',
    data_path="../3-csv/train.csv",
    yaml_path=algo_eval,
)

Igel(**params)

INFO - Entered CLI args: {'cmd': 'evaluate', 'data_path': '../3-csv/train.csv', 'yaml_path': 'yaml/hyper.yaml'}
INFO - Executing command: evaluate ...
INFO - reading data from ../3-csv/train.csv
INFO - path of the pre-fitted model => /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results/model.sav
INFO - result path: /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results 
INFO - loading model form /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results/model.sav 
INFO - dataset shape: (12770, 41)
INFO - dataset attributes: ['tweet_id', 'priority', 'postCategories_x', 'eventID', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'num', 'timestamp', 'positive_sentiment', 'negative_sentiment', 'numb_of_mentions', 'numb_of_urls', 'numb_of_media', 'numb_of_hashtags', 'numb_of_personal_pronouns', 'numb_of_present_tenses', 'numb_of_past_tenses', 'sent_from_web', 'numb_of_weird_chars', 'numb_of_quest

<igel.igel.Igel at 0x7fa8637aabb0>

### Predict



In [12]:
# Predict step: runs on the label-free test.csv with the configuration
# chosen in `algo_predict`.
params = dict(
    cmd='predict',
    data_path="../3-csv/test.csv",
    yaml_path=algo_predict,
)

Igel(**params)

INFO - Entered CLI args: {'cmd': 'predict', 'data_path': '../3-csv/test.csv', 'yaml_path': 'yaml/hyper.yaml'}
INFO - Executing command: predict ...
INFO - reading data from ../3-csv/test.csv
INFO - path of the pre-fitted model => /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results/model.sav
INFO - loading from /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results/model.sav
INFO - dataset shape: (12770, 29)
INFO - dataset attributes: ['tweet_id', 'postCategories_x', 'eventID', 'timestamp', 'positive_sentiment', 'negative_sentiment', 'numb_of_mentions', 'numb_of_urls', 'numb_of_media', 'numb_of_hashtags', 'numb_of_personal_pronouns', 'numb_of_present_tenses', 'numb_of_past_tenses', 'sent_from_web', 'numb_of_weird_chars', 'numb_of_questions', 'numb_of_emoticons', 'numb_of_swearing_words', 'numb_of_slang_words', 'numb_of_intensifiers', 'tweet_length', 'userFollowersCount', 'userFriendsCount', 'user_numb_of_tweets', 'user_list_count', 'tfidf_f

<igel.igel.Igel at 0x7fa840db17c0>

## View the predictions

In [16]:
# Load the predictions from model_results/predictions.csv and order them by
# predicted priority. sort_values keeps the original row labels, so the rows
# can still be matched by index afterwards.
predictions = (
    pd.read_csv("model_results/predictions.csv")
    .sort_values(by=['priority'])
)
predictions.describe()

Unnamed: 0,priority,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,num
count,12770.0,12770.0,12770.0,12770.0,12770.0,12770.0,12770.0,12770.0,12770.0,12770.0,12770.0,12770.0
mean,3.46371,16.537172,12.024493,8.544548,6.480515,4.572351,2.847861,0.812077,0.066617,0.016778,0.000882,3.702751
std,1.390379,5.512681,8.601498,6.332647,6.701624,6.623162,5.998513,3.489301,0.987317,0.484147,0.029353,1.934073
min,2.5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2.5,12.01,6.0,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,2.525,13.0,13.0,9.9,6.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
75%,4.166667,23.0,15.0,13.0,14.64,10.0,0.17,0.0,0.0,0.0,0.0,5.0
max,10.0,25.0,25.0,25.0,22.0,22.0,21.0,21.0,17.0,17.0,1.0,10.0


### Merge the new predictions back onto the dataframe with the missing columns

In [14]:
# Map the rounded category codes to their High Level Information Types.
# A column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas; an unknown code still raises KeyError.
cat_list = (
    predictions.filter(regex='cat', axis=1)
    .round()
    .apply(lambda col: col.map(lambda code: highCategoriser[code]))
)

# combine_first keeps the mapped names and takes every other value from
# `predictions`. Code 0.0 maps to NaN, so those cells are back-filled with the
# raw numeric prediction and the cat columns can still contain numbers.
# NOTE: `predictions` is overwritten, so reload it before re-running this cell.
predictions = cat_list.combine_first(predictions)

# Attach the predictions, by row label, to the frame whose label columns were
# removed. `test` is used rather than `train`: both names refer to the same
# object in this notebook, but only `test` is guaranteed to be label-free.
df = test.merge(predictions, left_index=True, right_index=True)

# Append the predicted categories to a list in a new column
df['predicted_categories'] = df[['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10']].values.tolist()

# Get the number of categories into something we can use to index.
# The cast truncates toward zero (a predicted 3.7 becomes 3).
# NOTE(review): confirm that truncation rather than rounding is intended.
df['num'] = df['num'].astype(float).astype(int)

# Keep the first `num` predicted categories of each tweet and discard entries
# that are not category names (the numeric back-fill values described above),
# so only names can reach the exported run file.
df['categories'] = [
    [name for name in cats[:count] if isinstance(name, str)]
    for cats, count in zip(df['predicted_categories'], df['num'])
]

# Clean: keep the export columns only and round the numeric ones
df = df.filter(['tweet_id', 'eventID', 'priority', 'categories'], axis=1).round()

df

INFO - NumExpr defaulting to 8 threads.


Unnamed: 0,tweet_id,eventID,priority,categories
0,1128285482784366592,45,6.0,"[Report-Location, Report-Factoid]"
1,1128285482784366592,45,6.0,"[Report-Location, Report-Factoid]"
2,1128285757624311808,45,3.0,"[Report-Location, Report-Factoid]"
3,1128285757624311808,45,3.0,"[Report-Location, Report-Factoid]"
4,1128286351760265216,45,3.0,"[Report-Location, Report-Factoid]"
...,...,...,...,...
12765,1162006062867918848,47,2.0,"[Report-Location, Report-MultimediaShare, Repo..."
12766,1162006062867918848,47,2.0,"[Report-Location, Report-MultimediaShare, Repo..."
12767,1162006062867918848,47,2.0,"[Report-Location, Report-MultimediaShare, Repo..."
12768,1162006062867918848,47,2.0,"[Report-Location, Report-MultimediaShare, Repo..."


## Export

Export in the TRECIS format

In [24]:
# Write one tab-separated line per unique tweet_id to the .run file.
with open("marks2.run", "w") as out_file:
    for row in df.drop_duplicates(subset="tweet_id").itertuples():
        content = (
            "TRECIS-CTIT-H-Test-0" + str(row.eventID),   # event identifier
            "Q0",
            np.int64(row.tweet_id),
            row.Index,                                   # ToDo: Fix? (row label of the frame)
            # ToDo: Fix - the label is looked up by the first three characters
            # of the stringified priority, e.g. '6.0' or '10.'
            str(priority_scorer[str(row.priority)[:3]]),
            row.categories,
            "marksrun2",
        )
        out_file.write("\t".join(map(str, content)) + "\n")