# Evaluate the Prediction versus the real results

In [60]:
import re
import sqlite3
import pandas as pd
import numpy as np
from textblob import TextBlob
from collections import Counter
from random import randint

## Aux functions

In [61]:
def get_tweet_sentiment(tweet):
    """
    Label one tweet as 'positive', 'neutral' or 'negative'.

    The text found under tweet['tweetText'] is cleaned with clean_tweet and
    scored with TextBlob; the sign of the resulting polarity picks the label.

    Parameters:
        tweet: row-like object that can be indexed with 'tweetText'
               (e.g. a DataFrame row passed by apply(..., axis=1)).

    Returns:
        'positive' when polarity > 0, 'neutral' when it equals 0,
        'negative' in every other case.
    """

    # score the cleaned text once and reuse the value for both comparisons
    polarity = TextBlob(clean_tweet(tweet['tweetText'])).sentiment.polarity

    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

In [62]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing @mentions, links and
    special characters using a simple regex, then collapsing whitespace.

    Parameters:
        tweet: the raw tweet text (str).

    Returns:
        The text with every match replaced by a space and all runs of
        whitespace squeezed to single spaces (no leading/trailing space).
    '''
    # Raw string: \w, \/ and \S are regex escapes, not string escapes, so a
    # plain string literal here triggers invalid-escape warnings.
    # Alternatives are tried in order: @mention | any char that is not
    # alphanumeric, space or tab | scheme://... link.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())

## Setup

In [63]:
# Setup sqlite: path of the database file that the cells below query
# (they read the TweetsRaw table from it)
sqlite_file = 'eurovision_final.db'

# Connect to the database sqlite file; `connection` is handed to pandas for
# DataFrame queries, `db` is a cursor used for plain SQL statements
connection = sqlite3.connect(sqlite_file)
db = connection.cursor()

In [64]:
# country hashtags - every participating country
all_hashtags = (
    'SWE GEO AUS ALB BEL MNE FIN AZE POR '
    'POL MDA ISL CZE CYP ARM SLO LAT GRE '
    'AUT BLR DEN EST MKD HUN IRL ISR LTU '
    'NOR ROM SMR SRB SUI NED CRO BUL MLT '
    'ITA FRA ESP GBR UKR GER'
).split()

# country hashtags - semifinal 1
hashtags_semi1 = (
    'SWE GEO AUS ALB BEL MNE FIN AZE POR GRE '
    'POL MDA ISL CZE CYP ARM SLO LAT'
).split()

# country hashtags - semifinal 2
hashtags_semi2 = (
    'AUT BLR DEN EST MKD HUN IRL ISR LTU MLT '
    'NOR ROM SMR SRB SUI NED CRO BUL'
).split()

# country hashtags - final
hashtags_final = (
    'ARM AZE ITA MDA POL POR UKR AUS BEL CYP FRA '
    'GER GRE ESP GBR SWE BUL BLR CRO HUN DEN '
    'ISR ROM NOR NED AUT'
).split()

# the list the rest of the notebook iterates over
hashtags = hashtags_final
print(len(hashtags))

26


## Count tweets and analyze sentiment

In [65]:
# Read ALL English tweets mentioning each country in `hashtags` from the db,
# evaluate their sentiment, and count the labels per country.
all_sentiments = []
for country in hashtags:

    # get tweets from DB; the LIKE pattern ('%#XXX%') is bound as a query
    # parameter instead of being formatted into the SQL text
    country_tweets = pd.read_sql_query(
        "SELECT * FROM TweetsRaw WHERE language='en' AND tweetText LIKE ?",
        connection,
        params=('%#{}%'.format(country),),
    )

    # count number of sentiments; when no row matched, skip classification
    # so the tally is simply empty (all three labels then read as 0)
    if country_tweets.empty:
        sentiments_count = Counter()
    else:
        sentiments_count = Counter(country_tweets.apply(get_tweet_sentiment, axis=1))

    # append country to list (a Counter returns 0 for labels it never saw)
    all_sentiments.append({
        'country': country,
        'positive': sentiments_count['positive'],
        'neutral': sentiments_count['neutral'],
        'negative': sentiments_count['negative'],
    })

In [66]:
# Count all tweets (any language) mentioning each country in `hashtags`.
all_tweet_counts = []
for country in hashtags:

    # get tweet count from DB; the LIKE pattern ('%#XXX%') is bound as a
    # query parameter instead of being formatted into the SQL text
    db.execute(
        "SELECT COUNT(*) AS count FROM TweetsRaw WHERE tweetText LIKE ?",
        ('%#{}%'.format(country),),
    )
    # COUNT(*) yields exactly one row with one column
    country_tweet_count = db.fetchone()[0]

    # append country to list
    all_tweet_counts.append({
        'country': country,
        'count': country_tweet_count,
    })

In [76]:
# Turn the per-country sentiment tallies into a table indexed by country code
results = pd.DataFrame(all_sentiments).set_index(['country'])

# Total tweet count per country. Assigned by position: all_tweet_counts and
# all_sentiments were both filled by iterating `hashtags` in the same order.
results['tweets'] = [entry['count'] for entry in all_tweet_counts]

# Each raw count as a fraction of that column's total over all countries
for _metric in ('positive', 'negative', 'neutral', 'tweets'):
    results[_metric + '_perc'] = results[_metric] / results[_metric].sum()

# Log features: log(1 + fraction)
for _metric in ('negative', 'neutral', 'positive', 'tweets'):
    results[_metric + '_log'] = np.log(1 + results[_metric + '_perc'])

In [77]:
# Score every country with the linear model: intercept + coefficients . features
features = ['negative_log', 'neutral_log', 'positive_log', 'tweets_log']

# Column vector of coefficients; the first entry multiplies the column of
# ones prepended below, the remaining four follow the order of `features`.
# NOTE(review): the original line was tagged "semis" -- presumably the model
# was fitted on semifinal data; confirm.
model_coefs = np.array([
    -8.93416811e-03,
    -3.18005604e+01,
    4.24127046e+00,
    2.21510494e+01,
    1.51818475e+01,
]).reshape(-1, 1)

# Design matrix: a leading column of ones followed by the feature columns
X = results[features].values
X = np.hstack((np.ones((X.shape[0], 1)), X))

results['predicted_score'] = np.dot(X, model_coefs)

In [78]:
# Predicted ranking: raw counts plus the model score, highest score first
ranking = (
    results[['negative', 'neutral', 'positive', 'tweets', 'predicted_score']]
    .sort_values(by='predicted_score', ascending=False)
)

In [79]:
# Real finishing orders, each listed from 1st place down to 26th.
# One entry per target column: jury vote, public vote, combined result.
_real_orders = [
    ('position_jury', [
        'POR', 'BUL', 'SWE', 'AUS', 'NED', 'NOR', 'ITA', 'MDA', 'BEL', 'GBR',
        'AUT', 'AZE', 'DEN', 'ROM', 'ARM', 'BLR', 'HUN', 'GRE', 'FRA', 'CYP',
        'ISR', 'CRO', 'POL', 'UKR', 'GER', 'ESP',
    ]),
    ('position_people', [
        'POR', 'BUL', 'MDA', 'BEL', 'ROM', 'ITA', 'HUN', 'SWE', 'CRO', 'FRA',
        'AZE', 'POL', 'BLR', 'CYP', 'NOR', 'GRE', 'UKR', 'ARM', 'NED', 'GBR',
        'DEN', 'ISR', 'ESP', 'GER', 'AUS', 'AUT',
    ]),
    ('position', [
        'POR', 'BUL', 'MDA', 'BEL', 'SWE', 'ITA', 'ROM', 'HUN', 'AUS', 'NOR',
        'NED', 'FRA', 'CRO', 'AZE', 'GBR', 'AUT', 'BLR', 'ARM', 'GRE', 'DEN',
        'CYP', 'POL', 'ISR', 'UKR', 'GER', 'ESP',
    ]),
]

# Write each place into the row of its country (ranking is still indexed by
# country code at this point); places are numbered from 1.
for _column, _order in _real_orders:
    for _place, _code in enumerate(_order, start=1):
        ranking.loc[_code, _column] = _place

# Rows are sorted by predicted score, so after resetting the index the new
# 0-based row number + 1 is the predicted position.
ranking = ranking.reset_index()
ranking['predicted_position'] = ranking.index + 1

In [80]:
# Root-mean-square error between the predicted position and each real ranking
for _template, _column in (
    ('RMSE predicted - final:  {}', 'position'),
    ('RMSE predicted - jury:   {}', 'position_jury'),
    ('RMSE predicted - people: {}', 'position_people'),
):
    _squared_diff = np.power(ranking['predicted_position'] - ranking[_column], 2)
    print(_template.format(np.sqrt((1/ranking.shape[0])*np.sum(_squared_diff))))

RMSE predicted - final:  7.716116998754072
RMSE predicted - jury:   8.348191881617701
RMSE predicted - people: 8.862192817720768


In [95]:
# Count top-N hits: of the N countries predicted best (the first N rows of
# `ranking`), how many really finished in the top N of each real ranking.
# Each group reads its own column: the People lines previously indexed a
# misspelled column name and then the combined 'position' column.
for _group, _column in (
    ('Final', 'position'),
    ('Jury', 'position_jury'),
    ('People', 'position_people'),
):
    for top_n in (3, 5, 10):
        _hits = int((ranking[_column].iloc[:top_n] <= top_n).sum())
        # two spaces after the colon for one-digit N, one for two-digit N,
        # so the counts line up within a group
        _pad = ' ' * (3 - len(str(top_n)))
        print('{} Top-{} correct:{}{}'.format(_group, top_n, _pad, _hits))

Final Top-3 correct:  2
Final Top-5 correct:  2
Final Top-10 correct: 6


In [87]:
# real final positions of the five countries predicted best
ranking['position'].head(5)

0     1.0
1     3.0
2    11.0
3    13.0
4    17.0
Name: position, dtype: float64

In [83]:
# display the full table: counts, predicted score, real and predicted positions
ranking

Unnamed: 0,country,negative,neutral,positive,tweets,predicted_score,position_jury,position_people,position,predicted_position
0,POR,138,786,833,3377,1.940406,1.0,1.0,1.0,1
1,MDA,95,438,875,2024,1.875176,8.0,3.0,3.0,2
2,NED,124,922,563,2341,1.145005,5.0,19.0,11.0,3
3,CRO,119,754,450,1866,0.697183,22.0,9.0,13.0,4
4,BLR,82,559,334,1636,0.692392,16.0,13.0,17.0,5
5,AUS,116,537,483,1570,0.610721,4.0,25.0,9.0,6
6,BUL,106,313,488,1364,0.571229,2.0,2.0,2.0,7
7,FRA,41,210,179,1179,0.494266,19.0,10.0,12.0,8
8,BEL,118,494,409,1642,0.430291,9.0,4.0,4.0,9
9,ITA,184,1122,414,2739,0.351466,7.0,6.0,6.0,10
