# Predicting Eurovision finalists based on tweets

Script that reads tweets from a database and processes the information to predict the results of the Eurovision Song Contest.

In [1]:
import re
import sqlite3
import pandas as pd
from textblob import TextBlob
from collections import Counter

## Auxiliary functions

In [2]:
def get_tweet_sentiment(tweet):
    """
    Label a tweet as 'positive', 'neutral' or 'negative'.

    The text stored under tweet['tweetText'] is cleaned with clean_tweet
    and scored with TextBlob; the sign of the resulting polarity selects
    the label.
    """
    # polarity of the cleaned tweet text, as reported by TextBlob
    polarity = TextBlob(clean_tweet(tweet['tweetText'])).sentiment.polarity

    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

In [3]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing @mentions, links and
    special characters using simple regex statements.

    tweet: raw tweet text (str).
    Returns the text reduced to ASCII letters and digits, with every run of
    whitespace collapsed to a single space.
    '''
    # Raw string, so the regex escapes (\w, \/, \S) are not treated as
    # (invalid) Python string escapes.
    # Alternatives, tried in this order at each position:
    #   1. an @mention - underscores included, so a handle such as
    #      @foo_bar is dropped whole instead of leaving "bar" behind
    #   2. any character that is not an ASCII letter, digit, space or tab
    #   3. a URL (scheme://rest)
    pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"

    # every match becomes a space; split/join then collapses repeated spaces
    return ' '.join(re.sub(pattern, " ", tweet).split())

## Setup

In [311]:
# Setup sqlite: path of the database file the tweets are read from
sqlite_file = 'eurovision_semi1.db'

# Connect to the database sqlite file.
# `connection` is handed to pandas for the per-country SELECT queries and
# `db` is a cursor used for the plain COUNT(*) queries.
# NOTE(review): nothing visible in this notebook closes the connection.
connection = sqlite3.connect(sqlite_file)
db = connection.cursor()

In [312]:
# country hashtags - every participating country
all_hashtags = [
    'SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR',
    'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT', 'GRE',
    'AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU',
    'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL', 'MLT',
    'ITA', 'FRA', 'ESP', 'GBR', 'UKR', 'GER',
]

# country hashtags - semifinal 1
hashtags_semi1 = [
    'SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR', 'GRE',
    'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT',
]

# country hashtags - semifinal 2
hashtags_semi2 = [
    'AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU', 'MLT',
    'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL',
]

# country hashtags - final; the ten '???' entries are placeholders for
# slots that are not filled in yet
hashtags_final = [
    'ARM', 'AZE', 'ITA', 'MDA', 'POL', 'POR', 'UKR', 'AUS',
    'BEL', 'CYP', 'FRA', 'GER', 'GRE', 'ESP', 'GBR', 'SWE',
] + ['???'] * 10

# the list of countries analyzed by the rest of the notebook
hashtags = hashtags_semi1

## Count tweets and analyze sentiment

In [313]:
# read ALL tweets in english from db, evaluate sentiment, and count
# (one dict per country: its code plus the positive/neutral/negative tallies)
all_sentiments = []
for country in hashtags:

    # get tweets from DB; the LIKE pattern ('%#XXX%') is bound as a query
    # parameter instead of being formatted into the SQL text
    country_tweets = pd.read_sql_query(
        "SELECT * FROM TweetsRaw WHERE language='en' AND tweetText LIKE ?",
        connection,
        params=('%#{}%'.format(country),))

    # count number of sentiments, one label per tweet row; iterating the rows
    # directly gives an empty tally when no tweet matched, without relying on
    # how DataFrame.apply treats an empty frame
    sentiments_count = Counter(get_tweet_sentiment(tweet)
                               for _, tweet in country_tweets.iterrows())

    # append country to list (a Counter yields 0 for labels never seen)
    all_sentiments.append({'country': country,
                           'positive': sentiments_count['positive'],
                           'neutral': sentiments_count['neutral'],
                           'negative': sentiments_count['negative']})

In [314]:
# read all tweets (to just count); unlike the sentiment query, this one
# has no language filter
all_tweet_counts = []
for country in hashtags:

    # get tweet count from DB; the LIKE pattern ('%#XXX%') is bound through
    # a placeholder instead of being formatted into the SQL text
    db.execute("SELECT COUNT(*) AS count FROM TweetsRaw WHERE tweetText LIKE ?",
               ('%#{}%'.format(country),))
    country_tweet_count = db.fetchone()[0]

    # append country to list
    all_tweet_counts.append({'country': country,
                             'count': country_tweet_count})

In [315]:
# build the results table from the sentiments list: one row per country,
# indexed by the country code
results = pd.DataFrame(all_sentiments).set_index(['country'])

# add total tweet count; the values are assigned by position, and both lists
# were filled by looping over the same `hashtags` list
results['tweets'] = [entry['count'] for entry in all_tweet_counts]

In [316]:
# add finalist 'column': 1 for the countries listed below, 0 for the rest
finalist_countries = ['MDA', 'AZE', 'GRE', 'SWE', 'POR',
                      'POL', 'ARM', 'AUS', 'CYP', 'BEL']

# A membership test only touches rows that already exist in the table.
# Assigning through .loc[label, column] with a label that is not in the
# index would instead append a new, mostly-NaN row (e.g. when `hashtags`
# is switched to another list of countries).
results['finalist'] = results.index.isin(finalist_countries).astype('int64')

## Rank countries by simple feature-based score

In [318]:
# compute final scores: product of total and positive tweet counts divided
# by their sum, per country
tweet_totals = results['tweets']
positive_totals = results['positive']
results['score'] = tweet_totals * positive_totals / (tweet_totals + positive_totals)

In [497]:
# Ranking by the feature-based score, best first
score_ranking = results.sort_values(by='score', ascending=False)

# every flagged finalist among the first ten rows adds 10 percentage points
top_ten_finalists = score_ranking[0:10]['finalist'].sum()
print("Accuracy: {}%".format(10 * top_ten_finalists))
score_ranking

Accuracy: 70%


Unnamed: 0_level_0,negative,neutral,positive,tweets,finalist,score,negative_norm,neutral_norm,positive_norm,tweets_norm,negative_log,neutral_log,positive_log,tweets_log,predicted_score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
POR,30,177,275,1265,1,225.892857,1.610752,2.088001,3.674035,3.654657,3.401197,5.17615,5.616771,7.142827,5.265455
CYP,43,198,87,542,1,74.966614,2.892778,2.525995,0.480371,1.004684,3.7612,5.288267,4.465908,6.295266,4.656586
BEL,13,106,93,347,1,73.343182,-0.065745,0.607166,0.582296,0.289962,2.564949,4.663439,4.532599,5.849325,1.385009
CZE,17,65,82,236,0,60.855346,0.328725,-0.247965,0.395433,-0.116881,2.833213,4.174387,4.406719,5.463832,0.447398
FIN,8,79,57,232,0,45.757785,-0.558832,0.044031,-0.029256,-0.131542,2.079442,4.369448,4.043051,5.446737,0.417147
AUS,21,90,53,224,1,42.859206,0.723195,0.273456,-0.097207,-0.160864,3.044522,4.49981,3.970292,5.411646,1.04102
POL,5,77,50,263,1,42.01278,-0.854685,0.002317,-0.148169,-0.017919,1.609438,4.343805,3.912023,5.572154,0.360404
ARM,7,71,52,210,1,41.679389,-0.65745,-0.122824,-0.114194,-0.212177,1.94591,4.26268,3.951244,5.347108,0.17691
LAT,14,91,48,257,0,40.445902,0.032872,0.294313,-0.182145,-0.03991,2.639057,4.51086,3.871201,5.549076,0.863214
GRE,9,67,48,201,1,38.746988,-0.460215,-0.206251,-0.182145,-0.245164,2.197225,4.204693,3.871201,5.303305,0.111942


## Logistic Regression Prediction model

In [328]:
import numpy as np
from patsy import dmatrices
import sklearn.linear_model as lm
import sklearn.metrics as metrics

### Feature engineering

In [372]:
# raw count columns every derived feature is computed from
count_columns = ['negative', 'neutral', 'positive', 'tweets']

# create normalized features: subtract the column mean, divide by the
# column standard deviation
for column in count_columns:
    counts = results[column]
    results[column + '_norm'] = (counts - counts.mean()) / counts.std()

# create log features: natural logarithm of each count column
# NOTE(review): a count of 0 would give -inf here
for column in count_columns:
    results[column + '_log'] = np.log(results[column])

### Train Model

In [526]:
# set the features to analyze in the model: every count column in its log
# and its normalized variant (all log features first, then the normalized)
# alternative, log-only feature set:
#features = ['negative_log', 'neutral_log', 'positive_log', 'tweets_log']
features = ['{}_{}'.format(count, transform)
            for transform in ('log', 'norm')
            for count in ('negative', 'neutral', 'positive', 'tweets')]

# feature names joined with ' + ', ready to be used in a model formula
features_string = ' + '.join(features)

In [527]:
# create input matrix (X) and output array (y) from the formula
# 'finalist ~ <features>', both returned as dataframes
y, X = dmatrices('finalist ~ ' + features_string, results, return_type='dataframe')

# split into train and test subsets
# NOTE: both subsets take the same first 18 rows, so the model is evaluated
# on the very data it is trained on
train_rows = slice(0, 18)
test_rows = slice(0, 18)
y_train, X_train = y[train_rows], X[train_rows]
y_test, X_test = y[test_rows], X[test_rows]

In [528]:
# Logistic Regression model with sklearn
regularization = 0.001

# labels flattened to a 1-D array for fitting
train_labels = y_train.values.ravel()

# C is passed as the inverse of `regularization`.
# NOTE(review): fit_intercept is off, presumably because the design matrix
# already carries its own intercept column - confirm.
model = lm.LogisticRegression(C=1/regularization, fit_intercept=False).fit(X_train, train_labels)
model.coef_

array([[ -0.28177526, -15.09077224,   0.64080446,   3.93206855,
          3.85856528,  13.44687015,   0.76969512,  -6.81908351,
          2.01213479]])

In [529]:
# print feature relationship
# Prepend the 'intercept' label with concatenate, which sizes the resulting
# string array for the longest name. np.insert would cast 'intercept' to the
# fixed-width dtype of the existing names and truncate it whenever every
# feature name is shorter than nine characters.
features_tmp = np.concatenate((['intercept'], features))

# the fitted coefficients form the index, the names the single column 0
pd.DataFrame(features_tmp, model.coef_.ravel())

Unnamed: 0,0
-0.281775,intercept
-15.090772,negative_log
0.640804,neutral_log
3.932069,positive_log
3.858565,tweets_log
13.44687,negative_norm
0.769695,neutral_norm
-6.819084,positive_norm
2.012135,tweets_norm


### Predict (same subset)

In [530]:
# predict the class of every row in the test subset
predictions = model.predict(X_test)

# evaluation: compare against the true labels, flattened to a 1-D array
finalists = y_test.values.ravel()
# metrics.r2_score(predictions, finalists)
correct_predictions = (predictions == finalists).sum()

# percentage of rows whose predicted class matches the true label
print("Accuracy: {}%".format(100 * correct_predictions / len(predictions)))

Accuracy: 83.33333333333333%


## Recompute ranking using modeled feature weights

In [531]:
# compute the new predicted score using the feature weights modeled in
# Logistic Regression: dot product of every design-matrix row with the
# fitted coefficients
feature_weights = model.coef_.T
results['predicted_score'] = np.dot(X.values, feature_weights)

In [532]:
# Ranking by the model-based score, best first
predicted_ranking = results.sort_values(by='predicted_score', ascending=False)

# every flagged finalist among the first ten rows adds 10 percentage points
predicted_top_ten_finalists = predicted_ranking[0:10]['finalist'].sum()
print("Accuracy: {}%".format(10 * predicted_top_ten_finalists))
predicted_ranking

Accuracy: 90%


Unnamed: 0_level_0,negative,neutral,positive,tweets,finalist,score,negative_norm,neutral_norm,positive_norm,tweets_norm,negative_log,neutral_log,positive_log,tweets_log,predicted_score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
CYP,43,198,87,542,1,74.966614,2.892778,2.525995,0.480371,1.004684,3.7612,5.288267,4.465908,6.295266,27.78744
POR,30,177,275,1265,1,225.892857,1.610752,2.088001,3.674035,3.654657,3.401197,5.17615,5.616771,7.142827,6.921827
POL,5,77,50,263,1,42.01278,-0.854685,0.002317,-0.148169,-0.017919,1.609438,4.343805,3.912023,5.572154,4.580232
AUS,21,90,53,224,1,42.859206,0.723195,0.273456,-0.097207,-0.160864,3.044522,4.49981,3.970292,5.411646,3.424541
AZE,6,71,31,202,1,26.875536,-0.756067,-0.122824,-0.470933,-0.241499,1.791759,4.26268,3.433987,5.308268,1.859838
SWE,14,100,40,239,1,34.265233,0.032872,0.482025,-0.318045,-0.105885,2.639057,4.60517,3.688879,5.476464,1.24881
LAT,14,91,48,257,0,40.445902,0.032872,0.294313,-0.182145,-0.03991,2.639057,4.51086,3.871201,5.549076,1.247008
MDA,23,38,35,134,1,27.751479,0.920429,-0.811099,-0.402983,-0.490736,3.135494,3.637586,3.555348,4.89784,1.123829
ARM,7,71,52,210,1,41.679389,-0.65745,-0.122824,-0.114194,-0.212177,1.94591,4.26268,3.951244,5.347108,0.669802
BEL,13,106,93,347,1,73.343182,-0.065745,0.607166,0.582296,0.289962,2.564949,4.663439,4.532599,5.849325,0.587987
