In [1]:
import re
import sqlite3
import pandas as pd
from textblob import TextBlob
from collections import Counter

In [2]:
def get_tweet_sentiment(tweet):
    """
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    """

    # create TextBlob object of passed tweet text
    analysis = TextBlob(clean_tweet(tweet['tweetText']))
    
    # set sentiment
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

In [3]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, special characters
    using simple regex statements.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

In [4]:
# Setup sqlite
sqlite_file = 'eurovision_semi1.db'

# Connect to the database sqlite file
connection = sqlite3.connect(sqlite_file)
db = connection.cursor()

In [289]:
# set country hashtags - semifinal 1
all_hashtags = ['SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR',\
                'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT', 'GRE',\
                'AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU',\
                'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL', 'MLT',\
                'ITA', 'FRA', 'ESP', 'GBR', 'UKR', 'GER']

hashtags_semi1 = ['SWE', 'GEO', 'AUS', 'ALB', 'BEL', 'MNE', 'FIN', 'AZE', 'POR', 'GRE',\
                    'POL', 'MDA', 'ISL', 'CZE', 'CYP', 'ARM', 'SLO', 'LAT']

# set country hashtags - semifinal 2
hashtags_semi2 = ['AUT', 'BLR', 'DEN', 'EST', 'MKD', 'HUN', 'IRL', 'ISR', 'LTU', 'MLT', \
                    'NOR', 'ROM', 'SMR', 'SRB', 'SUI', 'NED', 'CRO', 'BUL']

# set country hashtags - final
hashtags_final = ['ARM', 'AZE', 'ITA', 'MDA', 'POL', 'POR', 'UKR', 'AUS', 'BEL', 'CYP', 'FRA',\
                  'GER', 'GRE', 'ESP', 'GBR', 'SWE', '???', '???', '???', '???', '???',\
                  '???', '???', '???', '???', '???']

hashtags = hashtags_semi1

In [290]:
# read ALL tweets in english from db, evaluate sentiment, and count
all_sentiments = []
for country in hashtags:

    # get tweets from DB
    country_tweets = pd.read_sql_query("SELECT * FROM TweetsRaw WHERE language='en' AND tweetText LIKE '%#{}%'".format(country), connection)

    # count number of sentiments
    sentiments_count = Counter(country_tweets.apply(get_tweet_sentiment, axis=1))
    
    # append country to list
    all_sentiments.append({'country': country, \
                           'positive': sentiments_count['positive'],\
                           'neutral': sentiments_count['neutral'],\
                           'negative': sentiments_count['negative']
                          })

In [291]:
# read all tweets (to just count)
all_tweet_counts = []
for country in hashtags:

    # get tweet count from DB
    db.execute("SELECT COUNT(*) AS count FROM TweetsRaw WHERE tweetText LIKE '%#{}%'".format(country))
    country_tweet_count = db.fetchone()[0]
    
    # append country to list
    all_tweet_counts.append({'country': country, \
                           'count': country_tweet_count
                          })

In [292]:
# transform to pandas dataframe and add total tweet count
results = pd.DataFrame(all_sentiments)
results = results.set_index(['country'])
results['tweets'] = [tc['count'] for tc in all_tweet_counts]
results['tweets_norm'] = results['tweets'] / results['tweets'].sum()

In [293]:
# add finalist 'column'
results['finalist'] = 0
results.loc['MDA','finalist'] = 1
results.loc['AZE','finalist'] = 1
results.loc['GRE','finalist'] = 1
results.loc['SWE','finalist'] = 1
results.loc['POR','finalist'] = 1
results.loc['POL','finalist'] = 1
results.loc['ARM','finalist'] = 1
results.loc['AUS','finalist'] = 1
results.loc['CYP','finalist'] = 1
results.loc['BEL','finalist'] = 1

In [294]:
# compute final scores
results['score'] = results['tweets_norm'] * results['positive'] / (results['tweets_norm'] + results['positive'])

In [295]:
# Top-10
print("Accuracy: {}%".format(10*results.sort_values(by = 'score', ascending = False)[0:10]['finalist'].sum()))
results.sort_values(by = 'tweets', ascending = False)

Accuracy: 70%


Unnamed: 0_level_0,negative,neutral,positive,tweets,tweets_norm,finalist,score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
POR,30,177,275,1265,0.262339,1,0.262089
CYP,43,198,87,542,0.112401,1,0.112256
BEL,13,106,93,347,0.071962,1,0.071906
POL,5,77,50,263,0.054542,1,0.054482
LAT,14,91,48,257,0.053297,0,0.053238
SWE,14,100,40,239,0.049564,1,0.049503
CZE,17,65,82,236,0.048942,0,0.048913
FIN,8,79,57,232,0.048113,0,0.048072
AUS,21,90,53,224,0.046454,1,0.046413
ARM,7,71,52,210,0.04355,1,0.043514


### Logit model

In [296]:
import numpy as np
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score

In [297]:
# create log features
results['negative_log'] = np.log(results['negative'])
results['neutral_log'] = np.log(results['neutral'])
results['positive_log'] = np.log(results['positive'])
results['tweets_log'] = np.log(results['tweets'])

In [304]:
y, X = dmatrices('finalist ~ negative_log + neutral_log + positive_log + tweets_log', results, return_type = 'dataframe')

In [305]:
# with sklearn
model = LogisticRegression(fit_intercept = False, C = 1e9)
model.fit(X, y.values.ravel())
model.coef_

array([[-22.84224465,   0.82679824,  -0.34317562,  -2.82925343,
          6.32468189]])

In [306]:
# predict
predictions = model.predict(X)

# evaluation
finalists = results['finalist'].values
r2_score(predictions, finalists)

-0.168831168831169