# Predicting Eurovision finalists results based on tweets during semifinals

In [3]:
import re
import sqlite3
import pandas as pd
from textblob import TextBlob
from collections import Counter
from random import randint

## Aux functions

In [4]:
def get_tweet_sentiment(tweet):
    """
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    """

    # create TextBlob object of passed tweet text
    analysis = TextBlob(clean_tweet(tweet['tweetText']))
    
    # set sentiment
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

In [5]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, special characters
    using simple regex statements.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

## Setup

In [9]:
# Setup sqlite
sqlite_file = 'db_semis.db'

# Connect to the database sqlite file
connection = sqlite3.connect(sqlite_file)
db = connection.cursor()

In [10]:
# set country hashtags - semifinal 1
all_hashtags = ['SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR',\
                'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT', 'GRE',\
                'AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU',\
                'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL', 'MLT',\
                'ITA', 'FRA', 'ESP', 'GBR', 'UKR', 'GER']

hashtags_semi1 = ['SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR', 'GRE',\
                    'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT']

# set country hashtags - semifinal 2
hashtags_semi2 = ['AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU', 'MLT', \
                    'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL']

# set country hashtags - final
hashtags_final = ['ARM', 'AZE', 'ITA', 'MDA', 'POL', 'POR', 'UKR', 'AUS', 'BEL', 'CYP', 'FRA',\
                  'GER', 'GRE', 'ESP', 'GBR', 'SWE', 'BUL', 'BLR', 'CRO', 'HUN', 'DEN',\
                  'ISR', 'ROM', 'NOR', 'NED', 'AUT']

hashtags = hashtags_semi1 + hashtags_semi2
print(len(hashtags))

36


## Count tweets and analyze sentiment

In [11]:
# read ALL tweets in english from db, evaluate sentiment, and count - SEMI 1
all_sentiments = []
for country in hashtags:

    # get tweets from DB
    country_tweets = pd.read_sql_query("SELECT * FROM TweetsRaw WHERE language='en' AND tweetText LIKE '%#{}%'".format(country), connection)

    # count number of sentiments
    sentiments_count = Counter(country_tweets.apply(get_tweet_sentiment, axis=1))
    
    # append country to list
    all_sentiments.append({'country': country, \
                           'positive': sentiments_count['positive'],\
                           'neutral': sentiments_count['neutral'],\
                           'negative': sentiments_count['negative']
                          })

In [12]:
# read all tweets (to just count)
all_tweet_counts = []
for country in hashtags:

    # get tweet count from DB
    db.execute("SELECT COUNT(*) AS count FROM TweetsRaw WHERE tweetText LIKE '%#{}%'".format(country))
    country_tweet_count = db.fetchone()[0]
    
    # append country to list
    all_tweet_counts.append({'country': country, \
                           'count': country_tweet_count
                          })

In [13]:
# transform to pandas dataframe from sentiments list
results = pd.DataFrame(all_sentiments)
results = results.set_index(['country'])

# add total tweet count
results['tweets'] = [tc['count'] for tc in all_tweet_counts]

# add percentages of features over the totals
results['positive_perc'] = results['positive'] / results['positive'].sum()
results['negative_perc'] = results['negative'] / results['negative'].sum()
results['neutral_perc'] = results['neutral'] / results['neutral'].sum()
results['tweets_perc'] = results['tweets'] / results['tweets'].sum()

In [14]:
# add finalist 'column'
results['finalist'] = 0

results.loc['MDA','finalist'] = 1
results.loc['AZE','finalist'] = 1
results.loc['GRE','finalist'] = 1
results.loc['SWE','finalist'] = 1
results.loc['POR','finalist'] = 1
results.loc['POL','finalist'] = 1
results.loc['ARM','finalist'] = 1
results.loc['AUS','finalist'] = 1
results.loc['CYP','finalist'] = 1
results.loc['BEL','finalist'] = 1
results.loc['BUL','finalist'] = 1
results.loc['BLR','finalist'] = 1
results.loc['CRO','finalist'] = 1
results.loc['HUN','finalist'] = 1
results.loc['DEN','finalist'] = 1
results.loc['ISR','finalist'] = 1
results.loc['ROM','finalist'] = 1
results.loc['NOR','finalist'] = 1
results.loc['NED','finalist'] = 1
results.loc['AUT','finalist'] = 1

## Rank countries by simple feature-based score

In [15]:
# compute final scores (harmonic mean of 'tweets' and 'positive')
results['harm_score'] = results['tweets'] * results['positive'] / (results['tweets'] + results['positive'])

In [16]:
# Ranking
print("Accuracy: {}%".format(5*results.sort_values(by = 'harm_score', ascending = False)[0:20]['finalist'].sum()))
results.sort_values(by = 'harm_score', ascending = False)

Accuracy: 60%


Unnamed: 0_level_0,negative,neutral,positive,tweets,positive_perc,negative_perc,neutral_perc,tweets_perc,finalist,harm_score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BUL,73,428,742,2016,0.092808,0.032502,0.041071,0.061614,1,542.375635
CRO,122,735,706,2229,0.088305,0.054319,0.070531,0.068123,1,536.175128
NED,103,967,552,2407,0.069043,0.045859,0.092793,0.073564,1,449.02467
ISR,71,866,469,2202,0.058662,0.031612,0.083101,0.067298,1,386.648446
IRL,189,710,456,1985,0.057036,0.08415,0.068132,0.060666,0,370.81524
AUT,199,400,409,1454,0.051157,0.088602,0.038384,0.044438,1,319.208803
MKD,114,359,403,1376,0.050407,0.050757,0.03445,0.042054,0,311.707701
BLR,67,333,383,1367,0.047905,0.029831,0.031955,0.041779,1,299.177714
SUI,105,500,365,1585,0.045654,0.04675,0.04798,0.048441,0,296.679487
SRB,87,285,372,1129,0.046529,0.038736,0.027349,0.034505,0,279.805463


## Logistic Regression Prediction model

In [17]:
import numpy as np
from patsy import dmatrices
import sklearn.linear_model as lm
import sklearn.metrics as metrics
from sklearn.cross_validation import train_test_split, cross_val_score

### Feature engineering

In [18]:
# create normalized features
"""
results['negative_norm'] = (results['negative'] - results['negative'].mean() ) / results['negative'].std()
results['neutral_norm'] = (results['neutral'] - results['neutral'].mean() ) / results['neutral'].std()
results['positive_norm'] = (results['positive'] - results['positive'].mean() ) / results['positive'].std()
results['tweets_norm'] = (results['tweets'] - results['tweets'].mean() ) / results['tweets'].std()
results['negative_norm'] = results['negative'] / results['negative'].sum()
results['neutral_norm'] = results['neutral'] / results['neutral'].sum()
results['positive_norm'] = results['positive'] / results['positive'].sum()
results['tweets_norm'] = results['tweets'] / results['tweets'].sum()
"""

# create log features
results['negative_log'] = np.log(1 + results['negative_perc'])
results['neutral_log'] = np.log(1 + results['neutral_perc'])
results['positive_log'] = np.log(1 + results['positive_perc'])
results['tweets_log'] = np.log(1 + results['tweets_perc'])

### Train Model

In [19]:
# set the features to analyze in the model
features = ['negative_log', 'neutral_log', 'positive_log', 'tweets_log']
features_string = ' + '.join(features)

In [20]:
# create input matrix and outut array
y, X = dmatrices('finalist ~ {}'.format(features_string), results, return_type = 'dataframe')

In [21]:
# sklearn split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=randint(0,1000))

print("{}/{} finalists in train".format(
        sum([results.loc[country,'finalist'] for country in X_train.index.values]),
        X_train.shape[0]
    ))
print("{}/{} finalists in test: {}".format(
        sum([results.loc[country,'finalist'] for country in X_test.index.values]),
        X_test.shape[0],
        X_test.index.values
    ))

14.0/28 finalists in train
6.0/8 finalists in test: ['MLT' 'ISR' 'NOR' 'AUT' 'GEO' 'NED' 'ARM' 'ROM']


In [22]:
# Logistic Regression model with sklearn
regularization = 0.001
model = lm.LogisticRegression(fit_intercept = False, C = 1/regularization)
classifier = model.fit(X_train, y_train.values.ravel())

# print results
print("Coefficients: ", classifier.coef_)
print("Score:        ", classifier.score(X_test, y_test))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
pd.DataFrame(features_tmp, model.coef_.ravel())

Coefficients:  [[  0.05069144 -36.53821845   0.54238905  18.33126843  15.27301152]]
Score:         0.75


Unnamed: 0,0
0.050691,intercept
-36.538218,negative_log
0.542389,neutral_log
18.331268,positive_log
15.273012,tweets_log


### Predict (same subset)

In [23]:
# predict
predictions = model.predict(X_test)

# evaluation
finalists = y_test.values.ravel()
# metrics.r2_score(predictions, finalists)

print("Accuracy: {}%".format(100*(predictions == finalists).sum() / len(predictions)))

Accuracy: 75.0%


## Recompute ranking using modeled feature weights

In [24]:
# compute the new predicted score using the feature weights modeled in Logistic Regression
results['predicted_score'] = np.dot(X.values, model.coef_.T)

In [25]:
# Ranking
print("Accuracy: {}%".format(5*results.sort_values(by = 'predicted_score', ascending = False)[0:20]['finalist'].sum()))
results.sort_values(by = 'predicted_score', ascending = False)

Accuracy: 65%


Unnamed: 0_level_0,negative,neutral,positive,tweets,positive_perc,negative_perc,neutral_perc,tweets_perc,finalist,harm_score,negative_log,neutral_log,positive_log,tweets_log,predicted_score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BUL,73,428,742,2016,0.092808,0.032502,0.041071,0.061614,1,542.375635,0.031985,0.04025,0.088751,0.05979,1.443925
ISR,71,866,469,2202,0.058662,0.031612,0.083101,0.067298,1,386.648446,0.031122,0.079829,0.057006,0.06513,0.996555
POR,30,195,293,1343,0.036648,0.013357,0.018712,0.041045,1,240.525061,0.013269,0.018539,0.035992,0.040225,0.85008
NED,103,967,552,2407,0.069043,0.045859,0.092793,0.073564,1,449.02467,0.044839,0.088737,0.066764,0.070984,0.768491
CRO,122,735,706,2229,0.088305,0.054319,0.070531,0.068123,1,536.175128,0.052895,0.068154,0.084622,0.065903,0.712738
BLR,67,333,383,1367,0.047905,0.029831,0.031955,0.041779,1,299.177714,0.029395,0.031455,0.046793,0.04093,0.476619
BEL,17,206,162,582,0.020263,0.007569,0.019768,0.017787,1,126.725806,0.007541,0.019575,0.02006,0.017631,0.422797
POL,7,80,54,281,0.006754,0.003117,0.007677,0.008588,1,45.295522,0.003112,0.007647,0.006732,0.008551,0.195142
ARM,7,76,55,220,0.006879,0.003117,0.007293,0.006724,1,44.0,0.003112,0.007267,0.006856,0.006701,0.168955
FIN,9,90,58,259,0.007255,0.004007,0.008636,0.007916,0,47.388013,0.003999,0.008599,0.007228,0.007884,0.16216
