# Predicting Eurovision Ranking (LinReg)

In [4]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:99% !important; }</style>"))

In [5]:
import re
import sqlite3
import pandas as pd
from textblob import TextBlob
from collections import Counter
from random import randint
import csv
import numpy as np

## Aux functions

In [6]:
def get_tweet_sentiment(tweet):
    """
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    """

    # create TextBlob object of passed tweet text
    analysis = TextBlob(clean_tweet(tweet['tweetText']))
    
    # set sentiment
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

In [7]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, special characters
    using simple regex statements.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

## Setup

In [8]:
hashtags = {}

# set country hashtags - semifinal 1
hashtags['2017_semi1'] = ['SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR', 'GRE',\
                            'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT']

# set country hashtags - semifinal 2
hashtags['2017_semi2'] = ['AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU', 'MLT', \
                            'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL']

# set country hashtags - final
hashtags['2017_final'] = ['ARM', 'AZE', 'ITA', 'MDA', 'POL', 'POR', 'UKR', 'AUS', 'BEL', 'CYP', 'FRA',\
                          'GER', 'GRE', 'ESP', 'GBR', 'SWE', 'BUL', 'BLR', 'CRO', 'HUN', 'DEN',\
                          'ISR', 'ROM', 'NOR', 'NED', 'AUT']

In [9]:
# read csv file with results
results_list = {'2017_semi1':[], '2017_semi2':[], '2017_final':[]}
with open('results.csv', 'r') as results_csv:
    csv_reader = csv.reader(results_csv, delimiter=',')
    for row in csv_reader:
        if len(row) > 0:
            if row[0] not in ('round'):
                results_list[row[0]].append({
                    'country':row[2],
                    'rank':row[3],
                    'televoting':row[4],
                    'jury':row[5]
                })

# create dataframe
results = pd.DataFrame()
for round in results_list:
    round_df = pd.DataFrame(results_list[round])
    round_df['round'] = round
    if len(results) == 0:
        results = round_df
    else:
        results = results.append(round_df, ignore_index=True)
    print('{}: {}'.format(round,len(results_list[round])))

# convert to numeric values
results['rank'] = pd.to_numeric(results['rank'], errors='coerce').fillna(0).astype(np.int64)
results['televoting'] = pd.to_numeric(results['televoting'], errors='coerce').fillna(0).astype(np.int64)
results['jury'] = pd.to_numeric(results['jury'], errors='coerce').fillna(0).astype(np.int64)

# add total points
results['total'] = results.apply(lambda r: int(r['jury']) + int(r['televoting']), axis=1)

# add % of total points
results['total_perc'] = results['total']/results['total'].sum()

# sort columns
results = results[['round', 'country', 'rank', 'jury', 'televoting', 'total', 'total_perc']]

2017_final: 26
2017_semi1: 18
2017_semi2: 18


## Compose the Data table (features and labels) from different rounds

In [10]:
all_rounds_data = pd.DataFrame()

for round_string in ['2017_final', '2017_semi1', '2017_semi2']:
    
    print('Processing round', round_string)

    #select db
    if round_string == '2017_final':
        sqlite_file = 'db_2017_friday_and_final.db'        
    else:
        sqlite_file = 'db_' + round_string + '.db'

    # Connect to the database sqlite file
    connection = sqlite3.connect(sqlite_file)
    db = connection.cursor()

    # read ALL tweets in english from db, evaluate sentiment, and count
    print('  Reading sentiments')
    round_sentiments = []
    for country in hashtags[round_string]:

        # get tweets from DB
        country_tweets = pd.read_sql_query("SELECT * FROM TweetsRaw WHERE language='en' AND tweetText LIKE '%#{}%'".format(country), connection)

        # count number of sentiments
        sentiments_count = Counter(country_tweets.apply(get_tweet_sentiment, axis=1))

        # append country to list
        round_sentiments.append({'country': country, \
                               'positive': sentiments_count['positive'],\
                               'neutral': sentiments_count['neutral'],\
                               'negative': sentiments_count['negative']
                              })

    # read all tweets (to just count)
    print('  Counting tweets')
    round_tweet_counts = []
    for country in hashtags[round_string]:

        # get tweet count from DB
        db.execute("SELECT COUNT(*) AS count FROM TweetsRaw WHERE tweetText LIKE '%#{}%'".format(country))
        country_tweet_count = db.fetchone()[0]

        # append country to list
        round_tweet_counts.append({'country': country, \
                               'count': country_tweet_count
                              })

    # transform to pandas dataframe from sentiments list
    round_data = pd.DataFrame(round_sentiments)
    round_data['round'] = round_string

    # add total tweet count
    round_data['tweets'] = [tc['count'] for tc in round_tweet_counts]
        
    # merge with results
    round_data = pd.merge(round_data, results, on=['country','round'], how='left')
    
    # re-order columns
    round_data = round_data[['round','country',\
                                'tweets','negative','neutral','positive',\
                                'rank','jury','televoting','total', 'total_perc'
                              ]]    

    # append to dataframe with all rounds data
    if len(all_rounds_data) == 0: 
        print('  Initializing dataframe of length',len(round_data),'into global dataframe of length',len(all_rounds_data))
        all_rounds_data = round_data
        print('  Initialized global dataframe with length',len(all_rounds_data))
    else:
        print('  Appending dataframe of length',len(round_data),'into global dataframe of length',len(all_rounds_data))
        all_rounds_data = all_rounds_data.append(round_data, ignore_index=True)
        print('  Appended global dataframe with length',len(all_rounds_data))

Processing round 2017_final
  Reading sentiments
  Counting tweets
  Initializing dataframe of length 26 into global dataframe of length 0
  Initialized global dataframe with length 26
Processing round 2017_semi1
  Reading sentiments
  Counting tweets
  Appending dataframe of length 18 into global dataframe of length 26
  Appended global dataframe with length 44
Processing round 2017_semi2
  Reading sentiments
  Counting tweets
  Appending dataframe of length 18 into global dataframe of length 44
  Appended global dataframe with length 62


## Feature engineering

In [11]:
# create normalized features
"""
results['negative_norm'] = (results['negative'] - results['negative'].mean() ) / results['negative'].std()
results['neutral_norm'] = (results['neutral'] - results['neutral'].mean() ) / results['neutral'].std()
results['positive_norm'] = (results['positive'] - results['positive'].mean() ) / results['positive'].std()
results['tweets_norm'] = (results['tweets'] - results['tweets'].mean() ) / results['tweets'].std()
results['negative_norm'] = results['negative'] / results['negative'].sum()
results['neutral_norm'] = results['neutral'] / results['neutral'].sum()
results['positive_norm'] = results['positive'] / results['positive'].sum()
results['tweets_norm'] = results['tweets'] / results['tweets'].sum()
"""

# add percentages of features over the totals
all_rounds_data['positive_perc'] = all_rounds_data['positive'] / all_rounds_data['positive'].sum()
all_rounds_data['negative_perc'] = all_rounds_data['negative'] / all_rounds_data['negative'].sum()
all_rounds_data['neutral_perc'] = all_rounds_data['neutral'] / all_rounds_data['neutral'].sum()
all_rounds_data['tweets_perc'] = all_rounds_data['tweets'] / all_rounds_data['tweets'].sum()

# create log features
all_rounds_data['negative_log'] = np.log(1 + all_rounds_data['negative'])
all_rounds_data['neutral_log'] = np.log(1 + all_rounds_data['neutral'])
all_rounds_data['positive_log'] = np.log(1 + all_rounds_data['positive'])
all_rounds_data['tweets_log'] = np.log(1 + all_rounds_data['tweets'])

# create label 'isTopN'
all_rounds_data['isTop5'] = all_rounds_data.apply(lambda r: 1 if r['rank']<=5 else 0, axis=1)
all_rounds_data['isTop10'] = all_rounds_data.apply(lambda r: 1 if r['rank']<=10 else 0, axis=1)

In [196]:
# Eplore features
import matplotlib as mpl
import seaborn as sns
columns = [#'tweets', 'negative', 'neutral', 'positive',\
           'rank', 'isTop5', 'isTop10',\
           #'jury', 'televoting', 'total', 'total_perc',\
           'positive_perc', 'negative_perc', 'neutral_perc', 'tweets_perc']
           #'negative_log', 'neutral_log','positive_log', 'tweets_log']
data_to_plot = all_rounds_data[columns]
#sns.pairplot(data_to_plot)

<seaborn.axisgrid.PairGrid at 0x7fabd276cb70>

## Linear Regression Prediction model

In [13]:
from patsy import dmatrices
from sklearn.linear_model import LinearRegression, LogisticRegression
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [198]:
# set the features to analyze in the model
features = ['negative', 'neutral', 'positive', 'tweets']
#features = ['negative_log', 'neutral_log', 'positive_log', 'tweets_log']
#features = ['negative_perc', 'neutral_perc', 'positive_perc', 'tweets_perc']
features_string = ' + '.join(features)

# create input matrix and outut array
y, X = dmatrices('rank ~ {}'.format(features_string), all_rounds_data, return_type = 'dataframe')

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {}\ny_test:  {}\n'.format(len(X_train),len(X_test),len(y_train),len(y_test)))

# Linear Regression model with sklearn
model = LinearRegression(fit_intercept = True, normalize = False, copy_X=True)
regressor = model.fit(X_train, y_train.values.ravel())

# print results
print("Score train: {}".format(regressor.score(X_train, y_train)))
print("Score test:  {}\n".format(regressor.score(X_test, y_test)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# predict test
pd.DataFrame(list(zip(model.predict(X_test), y_test.values.ravel())), columns=['predicted','real']).sort_values('predicted', ascending=False)

X_train: 49
X_test:  13
y_train: 49
y_test:  13

Score train: 0.25625903568336417
Score test:  0.04586627952493105

   features     coefs
0  intercep  0.000000
1  negative  0.078345
2   neutral -0.015520
3  positive -0.023051
4    tweets  0.004445


Unnamed: 0,predicted,real
10,18.007469,7.0
12,16.660751,17.0
9,12.261282,14.0
1,11.301531,19.0
5,11.110133,23.0
6,10.829666,5.0
2,10.707635,12.0
7,10.706985,12.0
0,10.476928,1.0
11,8.918433,9.0


## Logistic Regression with isTopN as label

In [55]:
isTopN = 'isTop5'

# set the features to analyze in the model
#features = ['negative', 'neutral', 'positive', 'tweets']
features = ['negative_log', 'neutral_log', 'positive_log', 'tweets_log']
#features = ['negative_perc', 'neutral_perc', 'positive_perc', 'tweets_perc']
features_string = ' + '.join(features)

# create input matrix and outut array
y, X = dmatrices('{} ~ {}'.format(isTopN, features_string), all_rounds_data, return_type = 'dataframe')

# normalize features
scaler = StandardScaler()
X_norm = pd.DataFrame(scaler.fit_transform(X))
X_norm[0] = 1 # set intercept back to 1 (scaler sets it to 0 because of 0 variance)

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.3, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {} {}%\ny_test:  {} {}%'.format(
    len(X_train),
    len(X_test),
    len(y_train), int(100*y_train.sum()/len(y_train)),
    len(y_test), int(100*y_test.sum()/len(y_test))
))

# Linear Regression model with sklearn
regularization = 0.1
model = LogisticRegression(fit_intercept = True, C = 1/regularization)
regressor = model.fit(X_train, y_train.values.ravel())

# predict test
y_test_pred = regressor.predict(X_test)

# print scores
print('\nClassification performance metrics')
print(' Accuracy: {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print(' F1 Score: {}'.format(metrics.f1_score(y_test, y_test_pred)))
print(' ROC AUC:  {}\n'.format(metrics.roc_auc_score(y_test, y_test_pred)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# compute the new predicted score using the feature weights modeled in Logistic Regression
all_rounds_data['predicted_score'] = np.dot(X_norm.values, model.coef_.T)

# compare predicted as topN
all_rounds_data[all_rounds_data['round']=='2017_final'][['round','country','rank',isTopN,'negative', 'neutral', 'positive', 'tweets','predicted_score']].sort_values(by=['predicted_score'], ascending=False).head(5)

X_train: 43
X_test:  19
y_train: 43 23%
y_test:  19 26%

Classification performance metrics
 Accuracy: 0.631578947368421
 F1 Score: 0.22222222222222224
 ROC AUC:  0.4928571428571428

       features     coefs
0     intercept -0.883645
1  negative_log -2.865041
2   neutral_log -0.523624
3  positive_log  3.009323
4    tweets_log  1.089205


Unnamed: 0,round,country,rank,isTop5,negative,neutral,positive,tweets,predicted_score
3,2017_final,MDA,3,1,95,438,875,2024,2.705261
5,2017_final,POR,1,1,138,786,833,3377,1.807106
16,2017_final,BUL,2,1,106,313,488,1364,0.5181
24,2017_final,NED,11,0,124,922,563,2341,0.508773
10,2017_final,FRA,12,0,41,210,179,1179,0.287857


In [None]:
# persist features, scaler and model to binary files
import pickle
with open("features.bin", "wb") as f:
    pickle.dump(features, f, pickle.HIGHEST_PROTOCOL)
with open("scaler.bin", "wb") as f:
    pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
with open("regressor.bin", "wb") as f:
    pickle.dump(regressor, f, pickle.HIGHEST_PROTOCOL)