# Predicting Eurovision Finalists results based on tweets

Script that reads tweets from a database and processes the information to predict the results of the Eurovision Song Contest.

In [1]:
import re
import sqlite3
import pandas as pd
from textblob import TextBlob
from collections import Counter

## Aux functions

In [2]:
def get_tweet_sentiment(tweet):
    """
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    """

    # create TextBlob object of passed tweet text
    analysis = TextBlob(clean_tweet(tweet['tweetText']))
    
    # set sentiment
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

In [3]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, special characters
    using simple regex statements.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

## Setup

In [19]:
# Setup sqlite
sqlite_file = 'eurovision_semis.db'

# Connect to the database sqlite file
connection = sqlite3.connect(sqlite_file)
db = connection.cursor()

In [20]:
# set country hashtags - semifinal 1
all_hashtags = ['SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR',\
                'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT', 'GRE',\
                'AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU',\
                'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL', 'MLT',\
                'ITA', 'FRA', 'ESP', 'GBR', 'UKR', 'GER']

hashtags_semi1 = ['SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR', 'GRE',\
                    'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT']

# set country hashtags - semifinal 2
hashtags_semi2 = ['AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU', 'MLT', \
                    'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL']

# set country hashtags - final
hashtags_final = ['ARM', 'AZE', 'ITA', 'MDA', 'POL', 'POR', 'UKR', 'AUS', 'BEL', 'CYP', 'FRA',\
                  'GER', 'GRE', 'ESP', 'GBR', 'SWE', 'BUL', 'BLR', 'CRO', 'HUN', 'DEN',\
                  'ISR', 'ROM', 'NOR', 'NED', 'AUT']

hashtags = hashtags_semi1 + hashtags_semi2
print(len(hashtags))

36


## Count tweets and analyze sentiment

In [21]:
# read ALL tweets in english from db, evaluate sentiment, and count - SEMI 1
all_sentiments = []
for country in hashtags:

    # get tweets from DB
    country_tweets = pd.read_sql_query("SELECT * FROM TweetsRaw WHERE language='en' AND tweetText LIKE '%#{}%'".format(country), connection)

    # count number of sentiments
    sentiments_count = Counter(country_tweets.apply(get_tweet_sentiment, axis=1))
    
    # append country to list
    all_sentiments.append({'country': country, \
                           'positive': sentiments_count['positive'],\
                           'neutral': sentiments_count['neutral'],\
                           'negative': sentiments_count['negative']
                          })

In [22]:
# read all tweets (to just count)
all_tweet_counts = []
for country in hashtags:

    # get tweet count from DB
    db.execute("SELECT COUNT(*) AS count FROM TweetsRaw WHERE tweetText LIKE '%#{}%'".format(country))
    country_tweet_count = db.fetchone()[0]
    
    # append country to list
    all_tweet_counts.append({'country': country, \
                           'count': country_tweet_count
                          })

In [67]:
# transform to pandas dataframe from sentiments list
results = pd.DataFrame(all_sentiments)
results = results.set_index(['country'])

# add total tweet count
results['tweets'] = [tc['count'] for tc in all_tweet_counts]

# add percentages of features over the totals
results['positive_perc'] = results['positive'] / results['positive'].sum()
results['negative_perc'] = results['negative'] / results['negative'].sum()
results['neutral_perc'] = results['neutral'] / results['neutral'].sum()
results['tweets_perc'] = results['tweets'] / results['tweets'].sum()

In [68]:
# add finalist 'column'
results['finalist'] = 0

results.loc['MDA','finalist'] = 1
results.loc['AZE','finalist'] = 1
results.loc['GRE','finalist'] = 1
results.loc['SWE','finalist'] = 1
results.loc['POR','finalist'] = 1
results.loc['POL','finalist'] = 1
results.loc['ARM','finalist'] = 1
results.loc['AUS','finalist'] = 1
results.loc['CYP','finalist'] = 1
results.loc['BEL','finalist'] = 1
results.loc['BUL','finalist'] = 1
results.loc['BLR','finalist'] = 1
results.loc['CRO','finalist'] = 1
results.loc['HUN','finalist'] = 1
results.loc['DEN','finalist'] = 1
results.loc['ISR','finalist'] = 1
results.loc['ROM','finalist'] = 1
results.loc['NOR','finalist'] = 1
results.loc['NED','finalist'] = 1
results.loc['AUT','finalist'] = 1

In [69]:
results

Unnamed: 0_level_0,negative,neutral,positive,tweets,positive_perc,negative_perc,neutral_perc,tweets_perc,finalist
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SWE,16,109,45,268,0.005629,0.007124,0.01046,0.008191,1
GEO,9,35,13,87,0.001626,0.004007,0.003359,0.002659,0
AUS,24,168,78,375,0.009756,0.010686,0.016121,0.011461,1
ALB,4,22,12,72,0.001501,0.001781,0.002111,0.0022,0
BEL,17,206,162,582,0.020263,0.007569,0.019768,0.017787,1
MNE,8,38,39,155,0.004878,0.003562,0.003646,0.004737,0
FIN,9,90,58,259,0.007255,0.004007,0.008636,0.007916,0
AZE,6,75,31,209,0.003877,0.002671,0.007197,0.006388,1
POR,30,195,293,1343,0.036648,0.013357,0.018712,0.041045,1
GRE,10,69,50,223,0.006254,0.004452,0.006621,0.006815,1


## Rank countries by simple feature-based score

In [70]:
# compute final scores
results['harm_score'] = results['tweets'] * results['positive'] / (results['tweets'] + results['positive'])

In [72]:
# Ranking
print("Accuracy: {}%".format(5*results.sort_values(by = 'harm_score', ascending = False)[0:20]['finalist'].sum()))
results.sort_values(by = 'harm_score', ascending = False)

Accuracy: 60%


Unnamed: 0_level_0,negative,neutral,positive,tweets,positive_perc,negative_perc,neutral_perc,tweets_perc,finalist,harm_score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BUL,73,428,742,2016,0.092808,0.032502,0.041071,0.061614,1,542.375635
CRO,122,735,706,2229,0.088305,0.054319,0.070531,0.068123,1,536.175128
NED,103,967,552,2407,0.069043,0.045859,0.092793,0.073564,1,449.02467
ISR,71,866,469,2202,0.058662,0.031612,0.083101,0.067298,1,386.648446
IRL,189,710,456,1985,0.057036,0.08415,0.068132,0.060666,0,370.81524
AUT,199,400,409,1454,0.051157,0.088602,0.038384,0.044438,1,319.208803
MKD,114,359,403,1376,0.050407,0.050757,0.03445,0.042054,0,311.707701
BLR,67,333,383,1367,0.047905,0.029831,0.031955,0.041779,1,299.177714
SUI,105,500,365,1585,0.045654,0.04675,0.04798,0.048441,0,296.679487
SRB,87,285,372,1129,0.046529,0.038736,0.027349,0.034505,0,279.805463


## Logistic Regression Prediction model

In [73]:
import numpy as np
from patsy import dmatrices
import sklearn.linear_model as lm
import sklearn.metrics as metrics

### Feature engineering

In [75]:
# create normalized features
"""
results['negative_norm'] = (results['negative'] - results['negative'].mean() ) / results['negative'].std()
results['neutral_norm'] = (results['neutral'] - results['neutral'].mean() ) / results['neutral'].std()
results['positive_norm'] = (results['positive'] - results['positive'].mean() ) / results['positive'].std()
results['tweets_norm'] = (results['tweets'] - results['tweets'].mean() ) / results['tweets'].std()
results['negative_norm'] = results['negative'] / results['negative'].sum()
results['neutral_norm'] = results['neutral'] / results['neutral'].sum()
results['positive_norm'] = results['positive'] / results['positive'].sum()
results['tweets_norm'] = results['tweets'] / results['tweets'].sum()
"""

# create log features
results['negative_log'] = np.log(results['negative_perc'])
results['neutral_log'] = np.log(results['neutral_perc'])
results['positive_log'] = np.log(results['positive_perc'])
results['tweets_log'] = np.log(results['tweets_perc'])

### Train Model

In [76]:
# set the features to analyze in the model
features = ['negative_log', 'neutral_log', 'positive_log', 'tweets_log']
#features = ['negative_norm', 'neutral_norm', 'positive_norm', 'tweets_norm']
#features = ['negative_log', 'neutral_log', 'positive_log', 'tweets_log', \
#            'negative_norm', 'neutral_norm', 'positive_norm', 'tweets_norm']
features_string = ' + '.join(features)

In [77]:
# create input matrix and outut array
y, X = dmatrices('finalist ~ {}'.format(features_string), results, return_type = 'dataframe')

#split into train and test subsets
y_train = y[0:36]
X_train = X[0:36]
y_test = y[0:36]
X_test = X[0:36]

In [78]:
# Logistic Regression model with sklearn
regularization = 0.001
model = lm.LogisticRegression(fit_intercept = False, C = 1/regularization)
model.fit(X_train, y_train.values.ravel())
model.coef_

array([[ 2.22157127, -2.0677648 , -0.25456623,  0.0645284 ,  2.79439937]])

In [79]:
# print feature relationship
features_tmp = np.insert(features,0,'intercept')
pd.DataFrame(features_tmp, model.coef_.ravel())

Unnamed: 0,0
2.221571,intercept
-2.067765,negative_log
-0.254566,neutral_log
0.064528,positive_log
2.794399,tweets_log


### Predict (same subset)

In [80]:
# predict
predictions = model.predict(X_test)

# evaluation
finalists = y_test.values.ravel()
# metrics.r2_score(predictions, finalists)

print("Accuracy: {}%".format(100*(predictions == finalists).sum() / len(predictions)))

Accuracy: 61.111111111111114%


## Recompute ranking using modeled feature weights

In [81]:
# compute the new predicted score using the feature weights modeled in Logistic Regression
results['predicted_score'] = np.dot(X.values, model.coef_.T)

In [82]:
# Ranking
print("Accuracy: {}%".format(5*results.sort_values(by = 'predicted_score', ascending = False)[0:20]['finalist'].sum()))
results.sort_values(by = 'predicted_score', ascending = False)

Accuracy: 70%


Unnamed: 0_level_0,negative,neutral,positive,tweets,positive_perc,negative_perc,neutral_perc,tweets_perc,finalist,harm_score,negative_log,neutral_log,positive_log,tweets_log,predicted_score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
POR,30,195,293,1343,0.036648,0.013357,0.018712,0.041045,1,240.525061,-4.315709,-3.978579,-3.306399,-3.193081,3.022155
ISR,71,866,469,2202,0.058662,0.031612,0.083101,0.067298,1,386.648446,-3.454226,-2.487693,-2.835969,-2.69862,2.273358
BUL,73,428,742,2016,0.092808,0.032502,0.041071,0.061614,1,542.375635,-3.426447,-3.192455,-2.377222,-2.786871,2.178319
BEL,17,206,162,582,0.020263,0.007569,0.019768,0.017787,1,126.725806,-4.883693,-3.923702,-3.898975,-4.029271,1.807753
POL,7,80,54,281,0.006754,0.003117,0.007677,0.008588,1,45.295522,-5.770996,-4.869552,-4.997588,-4.757387,1.777731
NED,103,967,552,2407,0.069043,0.045859,0.092793,0.073564,1,449.02467,-3.082177,-2.37738,-2.673024,-2.609605,1.735224
BLR,67,333,383,1367,0.047905,0.029831,0.031955,0.041779,1,299.177714,-3.512214,-3.443436,-3.038537,-3.175368,1.291267
CRO,122,735,706,2229,0.088305,0.054319,0.070531,0.068123,1,536.175128,-2.912885,-2.651708,-2.426956,-2.686433,1.256192
AZE,6,75,31,209,0.003877,0.002671,0.007197,0.006388,1,26.995833,-5.925147,-4.93409,-5.552584,-5.053408,1.249895
ARM,7,76,55,220,0.006879,0.003117,0.007293,0.006724,1,44.0,-5.770996,-4.920845,-4.979238,-5.002114,1.108107
