# myrun

This notebook loads the feature vector, merges it with the labelled data, and then makes predictions for the target columns (priority, categories, and number of categories).


In [1]:
import pandas as pd
import re
import numpy as np
from functools import reduce
import pandas as pd
import seaborn as sns
from igel import Igel

In [2]:
def clean_dataset_int(df):
    """Return a float64 version of ``df`` with unusable columns and rows removed.

    Processing steps, in order:

    1. Drop every column whose name starts with ``Unnamed``.
    2. Mark rows that contain NaN, +inf or -inf in any column for removal.
    3. In string cells, delete every run of non-digit characters.
    4. Keep only the unmarked rows and cast all columns to ``np.float64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels preserved), all float64.

    Raises
    ------
    ValueError
        If a cell still cannot be converted to float after step 3
        (for example a string that contained no digits at all).
    """
    # Drop columns named "Unnamed..." (the name pandas gives unlabeled CSV columns).
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # The original also called `df.replace(np.nan, 0)` here but discarded the
    # result, so it never had any effect; rows with NaN are dropped below.

    # `axis` must be passed by keyword: pandas 2.x no longer accepts `.any(1)`.
    rows_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)

    # NOTE(review): r'\D+' also strips '.' and '-', so a string such as
    # "-1.5" becomes "15" — confirm this is intended for the input data.
    df = df.replace(r'\D+', '', regex=True)

    # NOTE(review): every column is cast to float64, including ID-like columns;
    # integers above 2**53 cannot all be represented exactly as float64.
    return df.loc[rows_to_keep].astype(np.float64)



def merge_labels(df, labels_df=None):
    """Inner-join the labelled data with a feature frame on the tweet identifier.

    The labels' ``postID`` column is matched against the feature frame's
    ``tweet_id`` column; only rows present in both frames are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing a ``tweet_id`` column.
    labels_df : pandas.DataFrame, optional
        Labelled data containing a ``postID`` column. When omitted, the
        notebook-level ``labels`` frame is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Label columns followed by feature columns for every matching row.
    """
    if labels_df is None:
        # Fall back to the global frame loaded in the "Load the labelled data" cell.
        labels_df = labels

    # The previous implementation passed the column names as the `left`/`right`
    # frames of pd.merge, which can never work; join on the key columns instead.
    return pd.merge(labels_df, df, left_on='postID', right_on='tweet_id', how='inner')



In [3]:
# Column names for the header-less feature vector file, in file order
feature_vector_keys = [
    "timestamp",
    "tweet_id",
    "positive_sentiment",
    "negative_sentiment",
    "numb_of_mentions",
    "numb_of_urls",
    "numb_of_media",
    "numb_of_hashtags",
    "numb_of_personal_pronouns",
    "numb_of_present_tenses",
    "numb_of_past_tenses",
    "sent_from_web",
    "numb_of_weird_chars",
    "numb_of_questions",
    "numb_of_emoticons",
    "numb_of_swearing_words",
    "numb_of_slang_words",
    "numb_of_intensifiers",
    "tweet_length",
    "userFollowersCount",
    "userFriendsCount",
    "user_numb_of_tweets",
    "user_list_count",
    "tfidf_fire",
    "dict_precision",
    "dict_recall",
    "dict_f_measure",
]

# Priority label lookup. Keys are the first three characters of a priority
# rendered as a string (which is why 10.0 appears as '10.').
priority_scorer = {
    key: label
    for label, keys in (
        ('Critical', ('10.', '9.0')),
        ('High', ('8.0', '7.0', '6.0')),
        ('Medium', ('5.0', '4.0')),
        ('Low', ('3.0', '2.0', '1.0', '0.0')),
    )
    for key in keys
}

# Numeric category code (0.0 - 25.0) -> information-type label.
# The position in the tuple is the code, so codes 0.0 and 1.0 both map to
# 'Other-Advice'.
# NOTE(review): 0.0 also appears as the filler value in unused cat columns of
# the labels shown below — confirm it should translate to 'Other-Advice'.
highCategoriser = {
    float(code): label
    for code, label in enumerate((
        'Other-Advice',
        'Other-Advice',
        'Report-CleanUp',
        'ContextualInformation',
        'Other-ContextualInformation',
        'CallToAction-Donations',
        'Report-EmergingThreats',
        'Report-Factoid',
        'Report-FirstPartyObservation',
        'Request-GoodsServices',
        'Report-Hashtags',
        'Request-InformationWanted',
        'Other-Irrelevant',
        'Report-Location',
        'CallToAction-MovePeople',
        'Report-MultimediaShare',
        'Report-NewSubEvent',
        'Report-News',
        'Report-Official',
        'Report-OriginalEvent',
        'Request-SearchAndRescue',
        'Other-Sentiment',
        'Report-ServiceAvailable',
        'Report-ThirdPartyObservation',
        'CallToAction-Volunteer',
        'Report-Weather',
    ))
}

## Feature Vector

Load the feature vector exported from Play.

In [4]:
# Read the feature vector exported from Play (a comma-separated .txt file
# without a header row) and label its columns with feature_vector_keys.
feature_vector_input = pd.read_csv("../../../0-data/processed/all_new.txt", sep=",", header=None)
feature_vector_input.columns = feature_vector_keys

# Collapse rows sharing a tweet_id with the 'first' aggregation, then turn
# tweet_id back into a regular column.
feature_vector_input = (
    feature_vector_input
    .groupby(['tweet_id'])
    .agg('first')
    .reset_index(level=0)
)

# Numeric (float64) version for the model.
# NOTE(review): clean_dataset_int casts every column to float64, tweet_id
# included; integers above 2**53 are not all exactly representable as
# float64 — confirm the IDs survive this step unchanged.
feature_vector_input = clean_dataset_int(feature_vector_input)
feature_vector_input

Unnamed: 0,tweet_id,timestamp,positive_sentiment,negative_sentiment,numb_of_mentions,numb_of_urls,numb_of_media,numb_of_hashtags,numb_of_personal_pronouns,numb_of_present_tenses,...,numb_of_intensifiers,tweet_length,userFollowersCount,userFriendsCount,user_numb_of_tweets,user_list_count,tfidf_fire,dict_precision,dict_recall,dict_f_measure
0,1.128285e+18,1.557839e+11,3.0,60.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
1,1.128286e+18,1.557839e+11,2.0,66.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,139.0,2420.0,490.0,0.0,11.0,0.0,0.0,0.0,0.0
2,1.128286e+18,1.557839e+11,3.0,60.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,220.0,232.0,841.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.128286e+18,1.557839e+11,3.0,65.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
4,1.128286e+18,1.557839e+11,3.0,65.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,114.0,3179.0,3.0,0.0,64.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9702,1.162005e+18,1.565879e+09,5.0,57.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,113.0,5543.0,169.0,0.0,200.0,0.0,0.0,0.0,0.0
9703,1.162005e+18,1.565879e+11,8.0,51.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,98.0,83.0,328.0,0.0,0.0,0.0,0.0,0.0,0.0
9704,1.162006e+18,1.565879e+11,15.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,124.0,65.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0
9705,1.162006e+18,1.565879e+10,37.0,22.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,239.0,1129.0,1049.0,0.0,174.0,0.0,0.0,0.0,0.0


## Load the labelled data

These are generated in 0_Labels.ipynb

In [5]:
# Load the labelled data from CSV (the section text says it is generated by
# 0_Labels.ipynb). The displayed columns include postID, postPriority, num
# and cat1..cat10.
labels = pd.read_csv("../3-csv/labels.csv")
# Bare last expression: render the frame as the cell output.
labels

Unnamed: 0,eventID,postID,postPriority,num,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10
0,16,72676276212731904,2.5,3,7.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16,72678400833228800,10.0,4,22.0,18.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16,72682396750848000,2.5,2,21.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,72693931619528704,5.0,3,23.0,10.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16,72698562223407104,2.5,3,23.0,10.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42946,47,1161999740080291843,2.5,1,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42947,47,1162004768904163329,2.5,3,13.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42948,47,1162005174468132869,2.5,2,13.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42949,47,1162005861075750918,2.5,3,13.0,15.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Merge

Merges the input feature vector with the labels

In [6]:
# Inner-join labels and features: keep only tweets that have both a label row
# (postID) and a feature row (tweet_id).
# NOTE(review): tweet_id is float64 after clean_dataset_int while postID is
# displayed as an integer — verify the join matches IDs exactly.
train = labels.merge(
    feature_vector_input,
    how='inner',
    left_on='postID',
    right_on='tweet_id',
)

train

Unnamed: 0,eventID,postID,postPriority,num,cat1,cat2,cat3,cat4,cat5,cat6,...,numb_of_intensifiers,tweet_length,userFollowersCount,userFriendsCount,user_numb_of_tweets,user_list_count,tfidf_fire,dict_precision,dict_recall,dict_f_measure
0,45,1128285482784366592,7.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
1,45,1128285757624311808,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
2,45,1128286351760265216,2.5,3,13.0,7.0,10.0,0.0,0.0,0.0,...,0.0,235.0,13.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
3,45,1128286441790771200,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,75.0,175486.0,188245.0,0.0,1049.0,0.0,0.0,0.0,0.0
4,45,1128286900639473664,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,148.0,9588.0,1272.0,0.0,293.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3064,43,1161985840739786752,2.5,1,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,233.0,2794.0,646.0,0.0,9.0,0.0,0.0,0.0,0.0
3065,47,1161988642698670080,2.5,3,13.0,15.0,10.0,0.0,0.0,0.0,...,0.0,221.0,1129.0,1049.0,0.0,174.0,0.0,0.0,0.0,0.0
3066,47,1161992743268343808,5.0,2,25.0,13.0,0.0,0.0,0.0,0.0,...,0.0,79.0,3929.0,4.0,0.0,75.0,0.0,0.0,0.0,0.0
3067,47,1161998036991561728,2.5,1,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,100.0,729.0,305.0,0.0,35.0,0.0,0.0,0.0,0.0


### Export our Training data

In [7]:
# Persist the merged frame without the index column; the Fit and Evaluate
# cells below read this same path.
train.to_csv("../3-csv/train.csv", index=False)
# Bare last expression: render the frame as the cell output.
train

Unnamed: 0,eventID,postID,postPriority,num,cat1,cat2,cat3,cat4,cat5,cat6,...,numb_of_intensifiers,tweet_length,userFollowersCount,userFriendsCount,user_numb_of_tweets,user_list_count,tfidf_fire,dict_precision,dict_recall,dict_f_measure
0,45,1128285482784366592,7.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,219.0,4416.0,4860.0,0.0,29.0,0.0,0.0,0.0,0.0
1,45,1128285757624311808,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,114.0,4680.0,4.0,0.0,107.0,0.0,0.0,0.0,0.0
2,45,1128286351760265216,2.5,3,13.0,7.0,10.0,0.0,0.0,0.0,...,0.0,235.0,13.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
3,45,1128286441790771200,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,75.0,175486.0,188245.0,0.0,1049.0,0.0,0.0,0.0,0.0
4,45,1128286900639473664,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,0.0,148.0,9588.0,1272.0,0.0,293.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3064,43,1161985840739786752,2.5,1,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,233.0,2794.0,646.0,0.0,9.0,0.0,0.0,0.0,0.0
3065,47,1161988642698670080,2.5,3,13.0,15.0,10.0,0.0,0.0,0.0,...,0.0,221.0,1129.0,1049.0,0.0,174.0,0.0,0.0,0.0,0.0
3066,47,1161992743268343808,5.0,2,25.0,13.0,0.0,0.0,0.0,0.0,...,0.0,79.0,3929.0,4.0,0.0,75.0,0.0,0.0,0.0,0.0
3067,47,1161998036991561728,2.5,1,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,100.0,729.0,305.0,0.0,35.0,0.0,0.0,0.0,0.0


### Export our Test Data

Remove the columns you want to predict from the dataframe

In [8]:
# NOTE(review): this whole cell is disabled, yet the Predict cell below still
# reads "../3-csv/test.csv". As written the notebook never creates that file,
# so Predict uses whatever copy is already on disk.
# NOTE(review): before re-enabling, check the column names — the code drops
# 'priority', while the train frame displayed above has 'postPriority'.
# Also `test = train` binds a second name to the same frame, so the in-place
# drops would modify train as well.
#test = train

#test.drop(list(test.filter(regex = 'cat')), axis = 1, inplace = True)
#test.drop(['priority', 'num'], axis = 1, inplace = True)


#test.to_csv("../3-csv/test.csv", index=False)
#test

# Machine Learning

In the next cell, select the YAML configuration file to use for each stage (fit, evaluate, and predict).

In [9]:
# Select the options for ML: [cluster, hyper, iris, regres, forest, multi]
# Each variable holds the path of the YAML configuration passed as
# `yaml_path` to Igel in the corresponding cell below.
# NOTE(review): Fit uses multi.yaml while Evaluate and Predict use hyper.yaml —
# confirm the mismatch is intentional.
#Fit
algo_fit = 'yaml/multi.yaml'

# Evaluate
algo_eval = 'yaml/hyper.yaml'

# Predict
algo_predict = 'yaml/hyper.yaml'   

### Fit



In [10]:
# Train a model on the exported training CSV using the configuration
# selected in algo_fit.
params = dict(
    cmd='fit',  # igel command to run: fit, evaluate or predict
    data_path="../3-csv/train.csv",
    yaml_path=algo_fit,
)

Igel(**params)

INFO - Entered CLI args: {'cmd': 'fit', 'data_path': '../3-csv/train.csv', 'yaml_path': 'yaml/multi.yaml'}
INFO - Executing command: fit ...
INFO - reading data from ../3-csv/train.csv
INFO - You passed the configurations as a yaml file.
INFO - your chosen configuration: {'dataset': {'split': {'test_size': 0.2, 'shuffle': True, 'stratify': 'default', 'scale': {'method': 'standard', 'target': 'inputs'}}}, 'model': {'type': 'regression', 'algorithm': 'DecisionTree'}, 'target': ['postPriority', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'num']}
INFO - dataset_props: {'split': {'test_size': 0.2, 'shuffle': True, 'stratify': 'default', 'scale': {'method': 'standard', 'target': 'inputs'}}} 
model_props: {'type': 'regression', 'algorithm': 'DecisionTree'} 
 target: ['postPriority', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'num'] 

INFO - dataset shape: (3069, 41)
INFO - dataset attributes: ['eventID', 'postID', 'pos

<igel.igel.Igel at 0x7fd08b276d00>

### Evaluate



In [11]:
# Evaluate the pre-fitted model against the training CSV using the
# configuration selected in algo_eval.
params = dict(
    cmd='evaluate',
    data_path="../3-csv/train.csv",
    yaml_path=algo_eval,
)

Igel(**params)

INFO - Entered CLI args: {'cmd': 'evaluate', 'data_path': '../3-csv/train.csv', 'yaml_path': 'yaml/hyper.yaml'}
INFO - Executing command: evaluate ...
INFO - reading data from ../3-csv/train.csv
INFO - path of the pre-fitted model => /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results/model.sav
INFO - result path: /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results 
INFO - loading model form /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results/model.sav 
INFO - dataset shape: (3069, 41)
INFO - dataset attributes: ['eventID', 'postID', 'postPriority', 'num', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'tweet_id', 'timestamp', 'positive_sentiment', 'negative_sentiment', 'numb_of_mentions', 'numb_of_urls', 'numb_of_media', 'numb_of_hashtags', 'numb_of_personal_pronouns', 'numb_of_present_tenses', 'numb_of_past_tenses', 'sent_from_web', 'numb_of_weird_chars', 'numb_of_questions', 

<igel.igel.Igel at 0x7fd098b15100>

### Predict



In [12]:
# Run the pre-fitted model on the test CSV using the configuration selected
# in algo_predict.
# NOTE(review): the cell that would write "../3-csv/test.csv" is commented
# out in this notebook, so this reads a file produced elsewhere — confirm it
# is up to date with train.csv.
params = dict(
    cmd='predict',
    data_path="../3-csv/test.csv",
    yaml_path=algo_predict,
)

Igel(**params)

INFO - Entered CLI args: {'cmd': 'predict', 'data_path': '../3-csv/test.csv', 'yaml_path': 'yaml/hyper.yaml'}
INFO - Executing command: predict ...
INFO - reading data from ../3-csv/test.csv
INFO - path of the pre-fitted model => /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results/model.sav
INFO - loading from /Users/mark/Documents/GitHub/HelpMe/1-src/2-python/2-notebooks/model_results/model.sav
INFO - dataset shape: (3069, 28)
INFO - dataset attributes: ['tweet_id', 'eventID', 'timestamp', 'positive_sentiment', 'negative_sentiment', 'numb_of_mentions', 'numb_of_urls', 'numb_of_media', 'numb_of_hashtags', 'numb_of_personal_pronouns', 'numb_of_present_tenses', 'numb_of_past_tenses', 'sent_from_web', 'numb_of_weird_chars', 'numb_of_questions', 'numb_of_emoticons', 'numb_of_swearing_words', 'numb_of_slang_words', 'numb_of_intensifiers', 'tweet_length', 'userFollowersCount', 'userFriendsCount', 'user_numb_of_tweets', 'user_list_count', 'tfidf_fire', 'dict_precision

<igel.igel.Igel at 0x7fd098b10520>

## View the predictions

In [21]:
# Load the predictions CSV from model_results/ (presumably written by the
# Predict cell above — confirm) and display it.
# NOTE(review): the displayed column is named 'priority', whereas the fit log
# above lists the target as 'postPriority' — check this file is not stale.
predictions = pd.read_csv("model_results/predictions.csv")
predictions
#predictions = predictions.sort_values(by=['priority'])
#predictions = predictions[(predictions > 0).all(1)]
#predictions.round()

Unnamed: 0,priority,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,num
0,7.50,13.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,2.50,13.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,2.50,13.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,2.50,13.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,2.50,13.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3064,2.50,12.0,0.0,10.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,4.0
3065,5.00,13.0,15.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
3066,3.75,25.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3067,3.75,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Merge the new predictions back onto dataframe with the missing columns

In [22]:
# Translate the predicted numeric category codes into their information-type
# labels. Codes are rounded first because highCategoriser is keyed by whole
# floats; a code that is missing from the dict raises KeyError.
# Series.map is applied column by column instead of DataFrame.applymap, which
# newer pandas releases deprecate.
cat_list = (
    predictions
    .filter(regex='cat', axis=1)
    .round()
    .apply(lambda column: column.map(lambda code: highCategoriser[code]))
)

# Labelled category columns plus the remaining (non-category) prediction
# columns. `predictions` itself is left untouched so this cell can be re-run:
# rebinding it to the labelled frame made a second run fail with KeyError.
predictions_labelled = cat_list.combine_first(predictions)

# Merge the predictions back into the training set, row by row on the index.
# NOTE(review): assumes predictions rows are in the same order as train —
# TODO confirm against the file used for prediction.
df = train.merge(predictions_labelled, left_index=True, right_index=True)

df



Unnamed: 0,eventID,postID,postPriority,num_x,cat1_x,cat2_x,cat3_x,cat4_x,cat5_x,cat6_x,...,cat2_y,cat3_y,cat4_y,cat5_y,cat6_y,cat7_y,cat8_y,cat9_y,num_y,priority
0,45,1128285482784366592,7.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,Report-Factoid,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,2.0,7.50
1,45,1128285757624311808,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,Report-Factoid,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,2.0,2.50
2,45,1128286351760265216,2.5,3,13.0,7.0,10.0,0.0,0.0,0.0,...,Report-Factoid,Report-Hashtags,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,3.0,2.50
3,45,1128286441790771200,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,Report-Factoid,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,2.0,2.50
4,45,1128286900639473664,2.5,2,13.0,7.0,0.0,0.0,0.0,0.0,...,Report-Factoid,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,2.0,2.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3064,43,1161985840739786752,2.5,1,12.0,0.0,0.0,0.0,0.0,0.0,...,Other-Advice,Report-Hashtags,Other-Advice,Report-Official,Other-Advice,Other-Advice,Other-Advice,Other-Advice,4.0,2.50
3065,47,1161988642698670080,2.5,3,13.0,15.0,10.0,0.0,0.0,0.0,...,Report-MultimediaShare,Report-Factoid,Report-Hashtags,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,4.0,5.00
3066,47,1161992743268343808,5.0,2,25.0,13.0,0.0,0.0,0.0,0.0,...,Report-Location,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,2.0,3.75
3067,47,1161998036991561728,2.5,1,12.0,0.0,0.0,0.0,0.0,0.0,...,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,Other-Advice,1.0,3.75


In [23]:
# Gather the ten predicted label columns (the merge suffixed them with _y)
# into one list per row.
predicted_cat_columns = [f'cat{i}_y' for i in range(1, 11)]
df['predicted_categories'] = df[predicted_cat_columns].values.tolist()

# Predicted category count as an int, so it can be used as a slice bound
df['num'] = df['num_y'].astype(float).astype(int)

# Keep only the first `num` labels of each row's list
df['categories'] = df.apply(
    lambda row: row['predicted_categories'][:row['num']],
    axis=1,
)

# Reduce to the export columns and round the numeric ones to whole numbers
# (e.g. the priority shown as 7.50 above is displayed as 8.0 below).
df = df.filter(items=['tweet_id', 'eventID', 'priority','categories'], axis=1).round()

df

Unnamed: 0,tweet_id,eventID,priority,categories
0,1.128285e+18,45,8.0,"[Report-Location, Report-Factoid]"
1,1.128286e+18,45,2.0,"[Report-Location, Report-Factoid]"
2,1.128286e+18,45,2.0,"[Report-Location, Report-Factoid, Report-Hasht..."
3,1.128286e+18,45,2.0,"[Report-Location, Report-Factoid]"
4,1.128287e+18,45,2.0,"[Report-Location, Report-Factoid]"
...,...,...,...,...
3064,1.161986e+18,43,2.0,"[Other-Irrelevant, Other-Advice, Report-Hashta..."
3065,1.161989e+18,47,5.0,"[Report-Location, Report-MultimediaShare, Repo..."
3066,1.161993e+18,47,4.0,"[Report-Weather, Report-Location]"
3067,1.161998e+18,47,4.0,[Other-Irrelevant]


## Export

Export in the TRECIS format

In [24]:
# Load and display testy.csv; its displayed columns are eventID, postID,
# postCategories and postPriority.
# NOTE(review): `testy` is not referenced by any later cell visible in this
# notebook — presumably kept for visual comparison; confirm.
testy = pd.read_csv("../3-csv/testy.csv")
testy

Unnamed: 0,eventID,postID,postCategories,postPriority
0,45,1128285482784366592,"['Location', 'Factoid']",High
1,45,1128285665186197504,"['Location', 'Factoid', 'Hashtags']",Low
2,45,1128285690779795459,"['Location', 'Factoid']",Low
3,45,1128285757624311808,"['Location', 'Factoid']",Low
4,45,1128285778306428934,"['Location', 'Factoid']",Low
...,...,...,...,...
6678,47,1161999740080291843,['Irrelevant'],Low
6679,47,1162004768904163329,"['Location', 'MultimediaShare', 'ContextualInf...",Low
6680,47,1162005174468132869,"['Location', 'MultimediaShare']",Low
6681,47,1162005861075750918,"['Location', 'MultimediaShare', 'Hashtags']",Low


In [25]:
# Write one tab-separated line per distinct tweet_id to the .run file.
# NOTE(review): tweet_id is float64 at this point (see the table above), so
# np.int64(...) may not reproduce the original ID exactly — confirm.
with open("marks2.run" , "w") as out_file:
    for row in df.drop_duplicates(subset="tweet_id").itertuples():
        # priority_scorer is keyed by the first three characters of the
        # priority's string form, e.g. 8.0 -> '8.0' and 10.0 -> '10.'.
        priority_label = str(priority_scorer[str(row.priority)[:3]])  #ToDo: Fix
        fields = (
            f"TRECIS-CTIT-H-Test-0{row.eventID}",
            "Q0",
            np.int64(row.tweet_id),
            row.Index,  #ToDo: Fix?
            priority_label,
            row.categories,
            "marksrun2",
        )
        out_file.write("\t".join(map(str, fields)) + "\n")

In [None]:
# `stop` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError — presumably a deliberate guard that halts "Run All"
# before the scratch cell below; confirm.
stop

In [None]:

# Compare Algorithms
# Cross-validates six scikit-learn classifiers with default settings on the
# same 10 folds and shows their accuracy distributions as a box plot.
import pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# load dataset
# NOTE(review): features are the first eight columns of `labels` and the
# target is the ninth. With the columns displayed earlier that would be
# eventID..cat4 as features and cat5 as target — confirm this is intended.
array = labels.values
X = array[:,0:8]
Y = array[:,8]

# prepare configuration for cross validation test harness
seed = 7
scoring = 'accuracy'
# shuffle=True is required for random_state to take effect; current
# scikit-learn raises ValueError when random_state is set without it.
# The splitter is built once so every model is scored on the same folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# prepare models
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# evaluate each model in turn
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()