In [18]:
import pandas as pd
import numpy as np
import re
import time
from textblob import TextBlob
import sqlite3
import GetOldTweets
from polyglot.polyglot import detect
from polyglot.polyglot.downloader import downloader
from polyglot.polyglot.text import Text
import datetime

# Data Preprocessing

In [3]:
database = 'database.sqlite'
conn = sqlite3.connect(database)

In [4]:
#Fetching required data tables
player_data = pd.read_sql("SELECT * FROM Player;", conn)
player_stats_data = pd.read_sql("SELECT * FROM Player_Attributes;", conn)
team_data = pd.read_csv("Team_names.csv")
match_data = pd.read_sql("SELECT * FROM Match;", conn)
countries = pd.read_sql_query("SELECT * from Country", conn)
leagues = pd.read_sql_query("SELECT * from League", conn)
team_data.head()

Unnamed: 0.1,Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,search_term,team_twitter_handle,team_short_name
0,0,1,9987,673.0,KRC Genk,KRC Genk,KRCGenkofficial,GEN
1,1,2,9993,675.0,Beerschot AC,Beerschot AC,beerschotac,BAC
2,2,3,10000,15005.0,SV Zulte-Waregem,essevee,ESSEVEELIVE,ZUL
3,3,4,9994,2007.0,Sporting Lokeren,ksc lokeren,KSCLokeren,LOK
4,4,5,9984,1750.0,KSV Cercle Brugge,KSV Cercle Brugge,cercleofficial,CEB


In [7]:
# Initializing search parameter data
search_params = pd.DataFrame(columns=['match_api_id', 'date', 'start_date', 'end_date', 'home_team_api_id', 'away_team_api_id', 'home_team_search_term', 'away_team_search_term', 'home_team_twitter_handle', 'away_team_twitter_handle', 'home_team_positive_tweets', 'home_team_neutral_tweets','home_team_negative_tweets','away_team_positive_tweets','away_team_neutral_tweets','away_team_negative_tweets'])
search_params['match_api_id'] = match_data['match_api_id']
search_params['date'] = match_data['date']
search_params['home_team_api_id'] = match_data['home_team_api_id']
search_params['away_team_api_id'] = match_data['away_team_api_id']
search_params.head()

Unnamed: 0,match_api_id,date,start_date,end_date,home_team_api_id,away_team_api_id,home_team_search_term,away_team_search_term,home_team_twitter_handle,away_team_twitter_handle,home_team_positive_tweets,home_team_neutral_tweets,home_team_negative_tweets,away_team_positive_tweets,away_team_neutral_tweets,away_team_negative_tweets
0,492473,2008-08-17 00:00:00,,,9987,9993,,,,,,,,,,
1,492474,2008-08-16 00:00:00,,,10000,9994,,,,,,,,,,
2,492475,2008-08-16 00:00:00,,,9984,8635,,,,,,,,,,
3,492476,2008-08-17 00:00:00,,,9991,9998,,,,,,,,,,
4,492477,2008-08-16 00:00:00,,,7947,9985,,,,,,,,,,


In [5]:
# Getting start_date and end_date for search_params
for i in range(0, len(search_params)):
    val = search_params.iloc[i, search_params.columns.get_loc('date')]
    
    # getting first token of date
    date = val.split(' ', 1)[0]
    search_params.iloc[i, search_params.columns.get_loc('date')] = date
    
    # parsing date into year, month, day
    year = date.split('-', 2)[0]
    year = year[2:]
    month = date.split('-', 2)[1]
    day = date.split('-', 2)[2]
    
    # converting date to datetime type
    match_date = datetime.datetime.strptime(year + "-" + month + "-" + day, "%y-%m-%d")
    
    # subtracting 10 days from match_date to get start_date
    start_date = match_date + datetime.timedelta(days=-10)
    search_params.iloc[i, search_params.columns.get_loc('start_date')] = str(start_date).split(' ', 1)[0]
    
    # subtracting 1 day from match_date to get end_date
    end_date = match_date + datetime.timedelta(days=-1)
    search_params.iloc[i, search_params.columns.get_loc('end_date')] = str(end_date).split(' ', 1)[0]
    

In [9]:
search_params.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,match_api_id,date,start_date,end_date,home_team_api_id,away_team_api_id,home_team_name,away_team_name,home_team_positive_tweets,home_team_neutral_tweets,home_team_negative_tweets,away_team_positive_tweets,away_team_neutral_tweets,away_team_negative_tweets
0,0,0,0,0,492473,2008-08-17,2008-08-07,2008-08-16,9987,9993,KRC Genk,Beerschot AC,0.0,1.0,0.0,0.0,1.0,0.0
1,1,1,1,1,492474,2008-08-16,2008-08-06,2008-08-15,10000,9994,SV Zulte-Waregem,Sporting Lokeren,0.0,1.0,0.0,0.0,1.0,0.0
2,2,2,2,2,492475,2008-08-16,2008-08-06,2008-08-15,9984,8635,KSV Cercle Brugge,RSC Anderlecht,0.0,1.0,0.0,0.1,0.9,0.0
3,3,3,3,3,492476,2008-08-17,2008-08-07,2008-08-16,9991,9998,KAA Gent,RAEC Mons,0.1,0.9,0.0,0.0,1.0,0.0
4,4,4,4,4,492477,2008-08-16,2008-08-06,2008-08-15,7947,9985,FCV Dender EH,Standard de Liège,0.0,1.0,0.0,,,


In [47]:
# Getting home_team_name and away_team name for search_params
for i in range(0, len(search_params)):   
    # Getting team ids
    home_team_api_id = search_params.iloc[i, search_params.columns.get_loc('home_team_api_id')]
    away_team_api_id = search_params.iloc[i, search_params.columns.get_loc('away_team_api_id')]
    
    # Getting index of ids in team_data
    home_team_index = team_data.index[team_data['team_api_id'] == home_team_api_id]
    away_team_index = team_data.index[team_data['team_api_id'] == away_team_api_id]
    
    # Getting search terms
    home_team_search_term = team_data.iloc[home_team_index, team_data.columns.get_loc('search_term')].values[0]
    away_team_search_term = team_data.iloc[away_team_index, team_data.columns.get_loc('search_term')].values[0]
    
    search_params.iloc[i, search_params.columns.get_loc('home_team_search_term')] = home_team_search_term
    search_params.iloc[i, search_params.columns.get_loc('away_team_search_term')] = away_team_search_term
    
    # Getting twitter handles
    home_team_twitter_handle = team_data.iloc[home_team_index, team_data.columns.get_loc('team_twitter_handle')].values[0]
    away_team_twitter_handle = team_data.iloc[away_team_index, team_data.columns.get_loc('team_twitter_handle')].values[0]
    
    search_params.iloc[i, search_params.columns.get_loc('home_team_twitter_handle')] = home_team_twitter_handle
    search_params.iloc[i, search_params.columns.get_loc('away_team_twitter_handle')] = away_team_twitter_handle

In [2]:
search_params = pd.read_csv("temp_output.csv")

In [50]:
search_params.to_csv("Twitter_NLP.csv")

In [3]:
search_params.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,match_api_id,date,start_date,end_date,home_team_api_id,away_team_api_id,home_team_name,away_team_name,home_team_search_term,away_team_search_term,home_team_twitter_handle,away_team_twitter_handle,home_team_positive_tweets,home_team_neutral_tweets,home_team_negative_tweets,away_team_positive_tweets,away_team_neutral_tweets,away_team_negative_tweets
0,0,0,492473,2008-08-17,2008-08-07,2008-08-16,9987,9993,KRC Genk,Beerschot AC,KRC Genk,Beerschot AC,KRCGenkofficial,beerschotac,1.0,0.0,0.0,,,
1,1,1,492474,2008-08-16,2008-08-06,2008-08-15,10000,9994,SV Zulte-Waregem,Sporting Lokeren,essevee,ksc lokeren,ESSEVEELIVE,KSCLokeren,,,,,,
2,2,2,492475,2008-08-16,2008-08-06,2008-08-15,9984,8635,KSV Cercle Brugge,RSC Anderlecht,KSV Cercle Brugge,RSC Anderlecht,cercleofficial,rscanderlecht,,,,0.1,0.6,0.3
3,3,3,492476,2008-08-17,2008-08-07,2008-08-16,9991,9998,KAA Gent,RAEC Mons,KAA Gent,RAEC Mons,KAAGent,RAEC_Mons,,,,1.0,0.0,0.0
4,4,4,492477,2008-08-16,2008-08-06,2008-08-15,7947,9985,FCV Dender EH,Standard de Li��ge,FCV Dender EH,standard rscl,fcdender,Standard_RSCL,,,,,,


# Twitter Sentiment Analysis

In [28]:
search_params['home_team_negative_tweets'] = np.nan
search_params['home_team_positive_tweets'] = np.nan
search_params['home_team_neutral_tweets'] = np.nan
search_params['away_team_negative_tweets'] = np.nan
search_params['away_team_positive_tweets'] = np.nan
search_params['away_team_neutral_tweets'] = np.nan

In [3]:
search_params.iloc[3961, search_params.columns.get_loc('home_team_positive_tweets')]

0.28000000000000003

In [23]:
api = TwitterClient()
for i in range(3962, 3965):
    try:
        print i
        tweets = []
        if (search_params.iloc[i, search_params.columns.get_loc('home_team_twitter_handle')] != np.nan):
            tweets = api.get_tweets(
                user=search_params.iloc[i, search_params.columns.get_loc('home_team_twitter_handle')],
                since=search_params.iloc[i, search_params.columns.get_loc('start_date')],
                until=search_params.iloc[i, search_params.columns.get_loc('end_date')],
                count=20)
        if (len(tweets) < 20):
            tweets2 = api.get_tweets(
                query=search_params.iloc[i, search_params.columns.get_loc('home_team_search_term')],
                since=search_params.iloc[i, search_params.columns.get_loc('start_date')],
                until=search_params.iloc[i, search_params.columns.get_loc('end_date')],
                count=20-len(tweets))
            tweets.extend(tweets2)
        if (len(tweets) != 0):
            positive, negative, neutral = get_sentiment(tweets)
            search_params.iloc[i, search_params.columns.get_loc('home_team_positive_tweets')] = positive
            search_params.iloc[i, search_params.columns.get_loc('home_team_negative_tweets')] = negative
            search_params.iloc[i, search_params.columns.get_loc('home_team_neutral_tweets')] = neutral
        tweets = []
        if (search_params.iloc[i, search_params.columns.get_loc('away_team_twitter_handle')] != np.nan):
            tweets = api.get_tweets(
                user=search_params.iloc[i, search_params.columns.get_loc('away_team_twitter_handle')],
                since=search_params.iloc[i, search_params.columns.get_loc('start_date')],
                until=search_params.iloc[i, search_params.columns.get_loc('end_date')],
                count=20)
        if (len(tweets) < 20):
            tweets2 = api.get_tweets(
                query=search_params.iloc[i, search_params.columns.get_loc('away_team_search_term')],
                since=search_params.iloc[i, search_params.columns.get_loc('start_date')],
                until=search_params.iloc[i, search_params.columns.get_loc('end_date')],
                count=20-len(tweets))
            tweets.extend(tweets2)
        if (len(tweets) != 0):
            positive, negative, neutral = get_sentiment(tweets)
            search_params.iloc[i, search_params.columns.get_loc('away_team_positive_tweets')] = positive
            search_params.iloc[i, search_params.columns.get_loc('away_team_negative_tweets')] = negative
            search_params.iloc[i, search_params.columns.get_loc('away_team_neutral_tweets')] = neutral
        print ("Percentage complete: {0:.2f}".format(i / 25979. * 100.))
        print("-------------------------------------------------------------------------------------")
        if (i % 100 == 0):
            search_params.to_csv("temp_output.csv")
    except:
        print ("ERROR")
        print("-------------------------------------------------------------------------------------\n")

3962
query
hello
0
2013-09-11
2013-09-20
user: LFC
0
user
hello
0
2013-09-11
2013-09-20
user: 0
Liverpool
query
hello
0
2013-09-11
2013-09-20
user: SouthamptonFC
0
user
hello
0
2013-09-11
2013-09-20
user: 0
Southampton
Percentage complete: 15.25
-------------------------------------------------------------------------------------
3963
query
hello
0
2013-09-12
2013-09-21
user: ManUtd
0
user
hello
0
2013-09-12
2013-09-21
user: 0
Manchester City
query
hello
0
2013-09-12
2013-09-21
user: ManUtd
0
user
hello
0
2013-09-12
2013-09-21
user: 0
Manchester United
Percentage complete: 15.25
-------------------------------------------------------------------------------------
3964
query
hello
0
2013-09-11
2013-09-20
user: NUFC
0
user
hello
0
2013-09-11
2013-09-20
user: 0
Newcastle United
query
hello
0
2013-09-11
2013-09-20
user: HullCity
0
user
hello
0
2013-09-11
2013-09-20
user: 0
Hull City
Percentage complete: 15.26
---------------------------------------------------------------------------------

In [22]:
class TwitterClient(object):
    def clean_tweet(self, tweet):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
 
    def get_tweet_sentiment(self, tweet):
        '''
        Utility function to classify sentiment of passed tweet
        using textblob's sentiment method
        '''
        # create TextBlob object of passed tweet text
        analysis = TextBlob(self.clean_tweet(tweet))
        
        text = Text(str(analysis))
        
        sum = 0
        for word in text.words:
            sum += word.polarity
        # set sentiment
        if sum > 0:
            return 'positive'
        elif sum < 0:
            return 'negative'
        else:
            return 'neutral'
        
    def get_tweets(self, since, until, count, query=0, user=0):
        '''
        Main function to fetch tweets and parse them.
        '''
        # empty list to store parsed tweets
        tweets = []
        if (query == 0):
            print ("query")
            tweetCriteria = GetOldTweets.got.manager.TweetCriteria().setUsername(user).setSince(since).setUntil(until).setMaxTweets(count)
        elif (user == 0):
            print ("user")
            tweetCriteria = GetOldTweets.got.manager.TweetCriteria().setQuerySearch(query).setSince(since).setUntil(until).setMaxTweets(count)
        fetched_tweets = GetOldTweets.got.manager.TweetManager.getTweets(tweetCriteria)
        print ("hello")
        print len(fetched_tweets)
        print since
        print until
        print ("user: {0}".format(user))
        print query
        # parsing tweets one by one
        for tweet in fetched_tweets:
            # empty dictionary to store required params of a tweet
            parsed_tweet = {}
 
            # saving text of tweet
            parsed_tweet['text'] = tweet.text
            # saving sentiment of tweet
            parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
 
            # appending parsed tweet to tweets list
            if tweet.retweets > 0:
                # if tweet has retweets, ensure that it is appended only once
                if parsed_tweet not in tweets:
                    tweets.append(parsed_tweet)
            else:
                tweets.append(parsed_tweet)
        # return parsed tweets
        
        return tweets

In [25]:
def get_sentiment(tweets):
    # picking positive tweets from tweets
    ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive']
    # percentage of positive tweets
    pos_tweet_percentage = 100*len(ptweets)/len(tweets)
    print("Positive tweets percentage: {} %".format(100*len(ptweets)/len(tweets)))
    # picking negative tweets from tweets
    ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative']
    # percentage of negative tweets
    neg_tweet_percentage = 100*len(ntweets)/len(tweets)
    print("Negative tweets percentage: {} %".format(100*len(ntweets)/len(tweets)))
    # percentage of neutral tweets
    neutral_tweet_percentage = 100*(len(tweets) - len(ntweets) - len(ptweets))/len(tweets)
    print("Neutral tweets percentage: {} %".format(100*(len(tweets) - len(ntweets) - len(ptweets))/len(tweets)))
 

    # printing first 5 positive tweets
#     print("\n\nPositive tweets:")
#     for tweet in ptweets[:10]:
#         print(tweet['text'])
 
    # printing first 5 negative tweets
#     print("\n\nNegative tweets:")
#     for tweet in ntweets[:10]:
#         print(tweet['text'])
    return pos_tweet_percentage / 100., neg_tweet_percentage / 100., neutral_tweet_percentage / 100.
    
    

In [28]:
api = TwitterClient()
print "LFC"
tweets = api.get_tweets(
        user="LFC",
        since="2013-09-11",
        until="2013-09-20",
        count=10)
print type(tweets)
print tweets
pos, neg, neut = get_sentiment(tweets)

LFC
query
hello
0
2013-09-11
2013-09-20
user: LFC
0
<type 'list'>
[]


ZeroDivisionError: integer division or modulo by zero

In [55]:
print(downloader.supported_languages_table("sentiment2", 3))

  1. Turkmen                    2. Thai                       3. Latvian                  
  4. Zazaki                     5. Tagalog                    6. Tamil                    
  7. Tajik                      8. Telugu                     9. Luxembourgish, Letzeb... 
 10. Alemannic                 11. Latin                     12. Turkish                  
 13. Limburgish, Limburgan...  14. Egyptian Arabic           15. Tatar                    
 16. Lithuanian                17. Spanish; Castilian        18. Basque                   
 19. Estonian                  20. Asturian                  21. Greek, Modern            
 22. Esperanto                 23. English                   24. Ukrainian                
 25. Marathi (Marāṭhī)         26. Maltese                   27. Burmese                  
 28. Kapampangan               29. Uighur, Uyghur            30. Uzbek                    
 31. Malagasy                  32. Yiddish                   33. Macedonian               

In [196]:
downloader.download("sentiment2.ts")

[polyglot_data] Error loading sentiment2.ts: Package u'sentiment2.ts'
[polyglot_data]     not found in index


False

In [192]:
downloader.list(show_packages=False)

Using default data directory (/Users/christophergang/polyglot_data)
 Data server index for <http://polyglot.cs.stonybrook.edu/~polyglot/>
Collections:
  [P] LANG:af............. Afrikaans            packages and models
  [P] LANG:als............ Alemannic            packages and models
  [P] LANG:am............. Amharic              packages and models
  [P] LANG:an............. Aragonese            packages and models
  [P] LANG:ar............. Arabic               packages and models
  [P] LANG:arz............ Egyptian Arabic      packages and models
  [P] LANG:as............. Assamese             packages and models
  [P] LANG:ast............ Asturian             packages and models
  [P] LANG:az............. Azerbaijani          packages and models
  [P] LANG:ba............. Bashkir              packages and models
  [P] LANG:bar............ Bavarian             packages and models
  [P] LANG:be............. Belarusian           packages and models
  [P] LANG:bg............. Bulgar

In [197]:
search_params.to_csv("temp_output.csv")