# Part 1: Data Preparation

We firstly import the required libraries and read the given csv file.

In [1]:
import tweepy
import pandas as pd

file = pd.DataFrame(pd.read_csv('retweets.csv'))

Only the tweets written in English are kept.

In [2]:
data = file[file.lang == 'en']
data.head(100)

Unnamed: 0,origUserId,origUserScreenName,origMepId,origMepName,origMepGroupId,origMepGroupShort,origMepCountryId,origMepCountryShort,retweetUserId,retweetUserScreenName,...,retweetMepName,retweetMepGroupId,retweetMepGroupShort,retweetMepCountryId,retweetMepCountryShort,origCreatedAt,origTweetId,retweetCreatedAt,retweetTweetId,lang
3,19017675,Nigel_Farage,4525,Nigel FARAGE,6,EFDD,27,GBR,121171051,MargotLJParker,...,Margot PARKER,6,EFDD,27,GBR,Wed Jul 04 11:05:03 +0000 2012,220473289259233285,Sun Dec 07 21:46:00 +0000 2014,541710240014942209,en
5,17675072,MartinSchulz,1911,Martin SCHULZ,1,S&D,10,DEU,1668992125,NathanGillMEP,...,Nathan GILL,6,EFDD,27,GBR,Fri Jul 05 08:00:23 +0000 2013,353060776707235841,Thu Jun 18 08:52:51 +0000 2015,611456527409524736,en
11,17675072,MartinSchulz,1911,Martin SCHULZ,1,S&D,10,DEU,21648649,maritaulvskog,...,Marita ULVSKOG,1,S&D,26,SWE,Sat May 17 12:47:17 +0000 2014,467647542378123264,Wed Sep 16 09:15:06 +0000 2015,644077036022165504,en
12,744743599,Tim_Aker,99650,Tim AKER,6,EFDD,27,GBR,121171051,MargotLJParker,...,Margot PARKER,6,EFDD,27,GBR,Wed May 28 16:47:44 +0000 2014,471694322472353793,Thu Dec 04 22:21:10 +0000 2014,540631925145493504,en
13,744743599,Tim_Aker,99650,Tim AKER,6,EFDD,27,GBR,19017675,Nigel_Farage,...,Nigel FARAGE,6,EFDD,27,GBR,Wed May 28 16:47:44 +0000 2014,471694322472353793,Sun Mar 29 18:53:32 +0000 2015,582254277122473984,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,19541556,MarietjeSchaake,96945,Marietje SCHAAKE,3,ALDE,19,NLD,25980581,mortenhelveg,...,Morten Helveg PETERSEN,3,ALDE,6,DNK,Mon Oct 06 16:51:47 +0000 2014,519168150685114368,Mon Oct 06 17:05:26 +0000 2014,519171584259784704,en
162,19541556,MarietjeSchaake,96945,Marietje SCHAAKE,3,ALDE,19,NLD,2396056585,JezekCZ,...,Petr JEŽEK,3,ALDE,5,CZE,Mon Oct 06 17:03:03 +0000 2014,519170987276107776,Mon Oct 06 17:13:52 +0000 2014,519173708804796416,en
164,19541556,MarietjeSchaake,96945,Marietje SCHAAKE,3,ALDE,19,NLD,68964628,kajakallas,...,Kaja KALLAS,3,ALDE,7,EST,Mon Oct 06 17:06:10 +0000 2014,519171768565895168,Mon Oct 06 17:06:32 +0000 2014,519171860949663745,en
165,19541556,MarietjeSchaake,96945,Marietje SCHAAKE,3,ALDE,19,NLD,31099936,Andreas_Schwab,...,Andreas SCHWAB,4,EPP,10,DEU,Mon Oct 06 17:06:10 +0000 2014,519171768565895168,Mon Oct 06 17:14:27 +0000 2014,519173856154906625,en


In order to fetch the original tweet texts by hitting the Twitter API, we firstly gather the unique tweet IDs included in the dataset.

In [3]:
list_tweet_IDs = list(data['origTweetId'].unique())
list_tweet_IDs

[220473289259233285,
 353060776707235841,
 467647542378123264,
 471694322472353793,
 486509184708792320,
 500600838419935232,
 510380324636073984,
 512257283087429632,
 515077347645870080,
 516923314225553409,
 517285141371453440,
 517317510719614976,
 517374434278404096,
 517562642723274752,
 517645289407401984,
 517654781733928960,
 517661138319052801,
 517667224237400064,
 517673569724424192,
 517676555456499714,
 517677345428492288,
 517679694544904192,
 517685087941701633,
 517692033096773632,
 517709806271201282,
 517719797652807681,
 517723618999283713,
 517724307892764672,
 517735072754302976,
 517749884003561472,
 517768704424423426,
 517771316079132672,
 517774239282524160,
 517781199750979584,
 517936232597561344,
 517942048184627200,
 517947971586236416,
 517953272645451776,
 517953672731701249,
 517956961762283520,
 517965234724941824,
 517978265718628352,
 518002908651323392,
 518005586878025729,
 518024241024073728,
 518054291001860096,
 518061262241284096,
 518088733162

For security reasons, the API keys are included and imported from the *twitter_config.py* file. Due to the limitation that currently exists, we will send API requests in batches of 100 tweet IDs. Once the original texts are extracted, they are saved along with their respective IDs in the dataframe **tweet_text**.

In [4]:
# Create the dataframe
tweet_text = pd.DataFrame(columns=['origTweetId', 'origTweetText'])        

# Create the function that beautifies the result stored in dataframe
def insert(tweet_text, row):
    insert_loc = tweet_text.index.max()

    if pd.isna(insert_loc):
        tweet_text.loc[0] = row
    else:
        tweet_text.loc[insert_loc + 1] = row

# Create the function to retrieve tweets' texts        
def retrieve_tweets(list_tweet_IDs, api):
    all_tweets = []
    tweet_count = len(list_tweet_IDs)
    try:
        for i in range((tweet_count // 100) + 1):
            # Check if the last group has less than 100 tweets
            lastTweets = min((i + 1) * 100, tweet_count)
            all_tweets.extend(
                api.statuses_lookup(list_tweet_IDs[i * 100:lastTweets])
            )
        for tweet in all_tweets:
            if tweet:
                insert(tweet_text, [tweet.id, tweet.text])
    except tweepy.TweepError:
        print('Connection failed')

from twitter_config import config
        
# Connect to Twitter API
auth = tweepy.OAuthHandler(config['consumer_key'], config['consumer_secret'])
auth.set_access_token(config['access_token'], config['access_token_secret'])
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

# Call text function by providing the unique tweet IDs
retrieve_tweets(list_tweet_IDs, api)

tweet_text

Unnamed: 0,origTweetId,origTweetText
0,519050739642286080,Lord Hill's written answers to EP are now on m...
1,519139901359783936,Our @SebDance asks @ABratusek about global cli...
2,519192343069872128,.@Ansip_EU agrees with @JanAlbrecht that #open...
3,519172557900365824,New:Child abuse scandal Manchester area - wors...
4,518024241024073728,I write for @LabourList Tory threat to Court o...
...,...,...
10162,690542168285515776,#MichaelCaine said this am that bureaucrats in...
10163,691007348148703233,How long can the Calais scandal go on? - Jerem...
10164,690861330555887616,Memo to Tories for Reform in EU: Never mind th...
10165,690514426991120384,#EUreferendum is going to be about working peo...


The original tweet texts are subsequently merged to the original dataframe *data*.

In [5]:
final_tweet_text = data.merge(tweet_text, on = "origTweetId", how = 'inner')
final_tweet_text

Unnamed: 0,origUserId,origUserScreenName,origMepId,origMepName,origMepGroupId,origMepGroupShort,origMepCountryId,origMepCountryShort,retweetUserId,retweetUserScreenName,...,retweetMepGroupId,retweetMepGroupShort,retweetMepCountryId,retweetMepCountryShort,origCreatedAt,origTweetId,retweetCreatedAt,retweetTweetId,lang,origTweetText
0,19017675,Nigel_Farage,4525,Nigel FARAGE,6,EFDD,27,GBR,121171051,MargotLJParker,...,6,EFDD,27,GBR,Wed Jul 04 11:05:03 +0000 2012,220473289259233285,Sun Dec 07 21:46:00 +0000 2014,541710240014942209,en,Euro Parliament votes overwhelmingly to reject...
1,17675072,MartinSchulz,1911,Martin SCHULZ,1,S&D,10,DEU,1668992125,NathanGillMEP,...,6,EFDD,27,GBR,Fri Jul 05 08:00:23 +0000 2013,353060776707235841,Thu Jun 18 08:52:51 +0000 2015,611456527409524736,en,"US have one currency, one Central Bank and one..."
2,17675072,MartinSchulz,1911,Martin SCHULZ,1,S&D,10,DEU,21648649,maritaulvskog,...,1,S&D,26,SWE,Sat May 17 12:47:17 +0000 2014,467647542378123264,Wed Sep 16 09:15:06 +0000 2015,644077036022165504,en,More jobs for young people is my top priority ...
3,744743599,Tim_Aker,99650,Tim AKER,6,EFDD,27,GBR,121171051,MargotLJParker,...,6,EFDD,27,GBR,Wed May 28 16:47:44 +0000 2014,471694322472353793,Thu Dec 04 22:21:10 +0000 2014,540631925145493504,en,"Vote #Labour, get Tory. Vote Tory, get Labour...."
4,744743599,Tim_Aker,99650,Tim AKER,6,EFDD,27,GBR,19017675,Nigel_Farage,...,6,EFDD,27,GBR,Wed May 28 16:47:44 +0000 2014,471694322472353793,Sun Mar 29 18:53:32 +0000 2015,582254277122473984,en,"Vote #Labour, get Tory. Vote Tory, get Labour...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12974,1485429175,JSaryuszWolski,28297,Jacek SARYUSZ-WOLSKI,4,EPP,20,POL,539156512,ZdzKrasnodebski,...,5,ECR,20,POL,Sun Jan 24 13:17:25 +0000 2016,691248438504210432,Mon Jan 25 08:21:14 +0000 2016,691536290198745088,en,Drastic drop in positive EU perception in Pola...
12975,829060856,TheresaMEP,124961,Theresa GRIFFIN,1,S&D,27,GBR,1354570123,julie4nw,...,1,S&D,27,GBR,Sun Jan 24 14:04:13 +0000 2016,691260217288454144,Sun Jan 24 14:54:31 +0000 2016,691272874393485314,en,I'm with @UKYP North West now in Knowsley - gr...
12976,829060856,TheresaMEP,124961,Theresa GRIFFIN,1,S&D,27,GBR,202610289,akhanmep,...,1,S&D,27,GBR,Sun Jan 24 14:12:29 +0000 2016,691262296396599296,Sun Jan 24 14:21:13 +0000 2016,691264493725335552,en,Listening to some fab ideas from young people ...
12977,829060856,TheresaMEP,124961,Theresa GRIFFIN,1,S&D,27,GBR,1354570123,julie4nw,...,1,S&D,27,GBR,Sun Jan 24 14:12:29 +0000 2016,691262296396599296,Sun Jan 24 14:54:26 +0000 2016,691272856739659776,en,Listening to some fab ideas from young people ...


The rows are grouped by *origMepGroupShort* and the groups with less than 50 tweets are dropped.
The content of the final dataframe **grouped_final_tweet_text** is presented below.

In [6]:
# Group the records by the European group of the MEP that posted the original tweet
grouped_final_tweet_text = final_tweet_text.assign(freq=final_tweet_text.apply(lambda x: final_tweet_text.origMepGroupShort.value_counts()\
  .to_dict()[x.origMepGroupShort], axis=1))\
  .sort_values(by=['freq','origMepGroupShort'],ascending=[False,True])

# Drop the groups with few tweets
grouped_final_tweet_text.drop(grouped_final_tweet_text.loc[grouped_final_tweet_text['freq'] < 50].index, inplace=True)

grouped_final_tweet_text.head(1500)

Unnamed: 0,origUserId,origUserScreenName,origMepId,origMepName,origMepGroupId,origMepGroupShort,origMepCountryId,origMepCountryShort,retweetUserId,retweetUserScreenName,...,retweetMepGroupShort,retweetMepCountryId,retweetMepCountryShort,origCreatedAt,origTweetId,retweetCreatedAt,retweetTweetId,lang,origTweetText,freq
0,19017675,Nigel_Farage,4525,Nigel FARAGE,6,EFDD,27,GBR,121171051,MargotLJParker,...,EFDD,27,GBR,Wed Jul 04 11:05:03 +0000 2012,220473289259233285,Sun Dec 07 21:46:00 +0000 2014,541710240014942209,en,Euro Parliament votes overwhelmingly to reject...,3261
3,744743599,Tim_Aker,99650,Tim AKER,6,EFDD,27,GBR,121171051,MargotLJParker,...,EFDD,27,GBR,Wed May 28 16:47:44 +0000 2014,471694322472353793,Thu Dec 04 22:21:10 +0000 2014,540631925145493504,en,"Vote #Labour, get Tory. Vote Tory, get Labour....",3261
4,744743599,Tim_Aker,99650,Tim AKER,6,EFDD,27,GBR,19017675,Nigel_Farage,...,EFDD,27,GBR,Wed May 28 16:47:44 +0000 2014,471694322472353793,Sun Mar 29 18:53:32 +0000 2015,582254277122473984,en,"Vote #Labour, get Tory. Vote Tory, get Labour....",3261
6,158021529,oflynnmep,124940,Patrick O'FLYNN,6,EFDD,27,GBR,108882900,RogerHelmerMEP,...,EFDD,27,GBR,Sat Aug 16 11:11:55 +0000 2014,500600838419935232,Mon Nov 23 10:08:57 +0000 2015,668732964516294656,en,Robert Halfon raising issue of illegal travell...,3261
7,744743599,Tim_Aker,99650,Tim AKER,6,EFDD,27,GBR,952267464,JamesJimCarver,...,EFDD,27,GBR,Wed Sep 17 15:10:28 +0000 2014,512257283087429632,Wed Dec 17 23:05:07 +0000 2014,545354031577980928,en,"Ignore the Labour lies, this is where #UKIP st...",3261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5006,478679663,Steven_Woolfe,124966,Steven WOOLFE,6,EFDD,27,GBR,209020126,paulnuttallukip,...,EFDD,27,GBR,Tue Apr 21 19:25:03 +0000 2015,590597132937199616,Tue Apr 21 22:14:10 +0000 2015,590639692359544832,en,"Study hard, gain entry to eligible Maths insti...",3261
5007,478679663,Steven_Woolfe,124966,Steven WOOLFE,6,EFDD,27,GBR,209020126,paulnuttallukip,...,EFDD,27,GBR,Tue Apr 21 19:33:48 +0000 2015,590599334451875840,Tue Apr 21 22:14:17 +0000 2015,590639720713060352,en,I'm told @BBCNewsbeat debate will be broadcas...,3261
5009,478679663,Steven_Woolfe,124966,Steven WOOLFE,6,EFDD,27,GBR,1668992125,NathanGillMEP,...,EFDD,27,GBR,Tue Apr 21 21:12:10 +0000 2015,590624086033883142,Tue Apr 21 21:58:08 +0000 2015,590635655056773121,en,The post #Newsbeat debate debate is as freneti...,3261
5010,19017675,Nigel_Farage,4525,Nigel FARAGE,6,EFDD,27,GBR,121171051,MargotLJParker,...,EFDD,27,GBR,Wed Apr 22 06:17:44 +0000 2015,590761384666865664,Wed Apr 22 20:49:26 +0000 2015,590980753246289921,en,"""If you actually want a full, free and fair re...",3261


# Part 2: Classification

We firstly import some of the required libraries for classification. 

In [20]:
import numpy as np
from sklearn.dummy import DummyClassifier
import sklearn as sk
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
import string
import re

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eleftheriaapostolaki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eleftheriaapostolaki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

From the final dataframe *grouped_final_tweet_text* we extract only the columns that are needed for classification.
In our case these are the political party (target variable) and the original tweet text (training feautures).

In [8]:
corpus = grouped_final_tweet_text[['origTweetText', 'origMepGroupShort']]
corpus

Unnamed: 0,origTweetText,origMepGroupShort
0,Euro Parliament votes overwhelmingly to reject...,EFDD
3,"Vote #Labour, get Tory. Vote Tory, get Labour....",EFDD
4,"Vote #Labour, get Tory. Vote Tory, get Labour....",EFDD
6,Robert Halfon raising issue of illegal travell...,EFDD
7,"Ignore the Labour lies, this is where #UKIP st...",EFDD
...,...,...
12902,We call on the #EU to address an urgent call t...,GUE-NGL
12903,We call on the #EU to address an urgent call t...,GUE-NGL
12939,"The heart always beats ""left"". \nThis Sunday, ...",GUE-NGL
12951,Another packed hse in Limerick for the launch...,GUE-NGL


After having extracted the required information, we create a **bag of words** and proceed to a number of **text cleaning steps** so as for the dataset text to be smoothly used for the training of the classification algorithm. Such action involves the following:
- Removal of duplicate texts
- Conversion of characters to lower case
- Removal of punctuation (and thus emojiis)
- Filtering out stopwords (including stopwords specified for the current context)
- Removal of URLs
- Lemmatizion of words

In order for us to remove the additional stopwords (except the already known ones) that are applicable for the specific data context, we investigate, as indicated below, the top words that most frequently used in the original text of tweets.

We conclude that the majority of the top used words are included in the common stopwords that will be eliminated in the text cleaning process while some of the rest ones (e.g. European, Europe, @EPPGroup, etc.) could also be marked so as to be including in the extra stopwords which will be removed. 

In [9]:
pd.Series(' '.join(corpus.origTweetText).split()).value_counts()[:100]

to      6764
the     6623
in      4587
of      4443
on      3290
        ... 
or       205
Good     204
see      204
MEP      198
EU.      193
Length: 100, dtype: int64

In [10]:
# Remove duplicate tweet texts
corpus = corpus.drop_duplicates()
# Convert to lower case
corpus.origTweetText = [w.lower() for w in corpus.origTweetText]
# Remove punctuation from each word
corpus.origTweetText = corpus.origTweetText.str.replace('[^\w\s]','')
# Filter out stop words
stop_words = set(stopwords.words('english'))
extra_stop_words = ['@EPPGroup', 'EU', 'europe', 'Europe', 'european', 'parliament', 'today', 'tonight', 'morning', 
                    'tomorrow', 'amp', 'timmermans', 'timkirkhopemep', 'etuc_ces', 'eu', 'ep',
                    'see', 'want', 'amp' ]
stop_words.update(extra_stop_words)
corpus.origTweetText = [w for w in corpus.origTweetText if not w in stop_words]
# Lemmatizion
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])
corpus.origTweetText = corpus.origTweetText.apply(lemmatize_text)
# Remove URLS from tweet texts
corpus.origTweetText = corpus.origTweetText.str.replace('http\S+|www.\S+', '', case=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


After the above text cleaning procedure is completed, the final texts can be seen below.

In [11]:
corpus.origTweetText

0        euro parliament vote overwhelmingly to reject ...
3        vote labour get tory vote tory get labour vote...
6        robert halfon raising issue of illegal travell...
7        ignore the labour lie this is where ukip stand...
9        migration can be positive for the uk only if i...
                               ...                        
12901    we cant tolerate the violent oppression agains...
12902    we call on the eu to address an urgent call to...
12939    the heart always beat left this sunday our hea...
12951    another packed hse in limerick for the launch ...
12953    brianhayesmep eppgroup finegael will your coll...
Name: origTweetText, Length: 10143, dtype: object

In [52]:
political_parties = list(corpus.origMepGroupShort.unique())
political_parties

['EFDD', 'S&D', 'EPP', 'ALDE', 'Greens-EFA', 'ECR', 'GUE-NGL']

Before proceeding to the training of the classification algorithms, we split the dataset to train and test with a proportion of 80% and 20% respectively.

In [14]:
# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(corpus.origTweetText, corpus.origMepGroupShort, test_size=0.2, random_state = 40)
X_train.head()

10170    ecr blaming ecj for upholding the law the rule...
9852     is hollande like martinschulz confused about h...
635      ready to start spring forward for woman un_wom...
10615    i am on bbc radio suffolk at 1720 this evening...
3284                   copenhagen on my mind in amsterdam 
Name: origTweetText, dtype: object

Let's now create the DummyClassifier which will be used as our baseline classifier. This means that the accurancy of the classification algorithm that we will train should be at least greater than **23,95%**. 

In [15]:
# Create dummy classifier
x = corpus.origTweetText
y = corpus.origMepGroupShort
dummy_classifier = DummyClassifier(strategy="most_frequent")
dummy_classifier.fit(x, y)
dummy_classifier.predict(x)

array(['EFDD', 'EFDD', 'EFDD', ..., 'EFDD', 'EFDD', 'EFDD'], dtype='<U4')

In [16]:
# Calculate the dummy classifier accuracy
dummy_classifier.score(x, y)

0.238391008577344

# Training Algorithms

# 1. Stochastic Gradient Descent

As indicated below, with the Stochastic Gradient Descent classifier an accuracy of approximately **58%** is achieved. 
In the *check_df* dataframe we can also see the deviations of the model from the actual classification of each text row. More specifically, it seems that a text percentage which is bit higher than 50% is indeed incorrectly classified. 

In [40]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

stochastic_grad_desc = Pipeline(steps=[('v', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss = 'hinge', penalty = 'l2',alpha = 0.001, 
                                      random_state = 42, max_iter = 4, tol = None)),
               ])
stochastic_grad_desc.fit(X_train, y_train)
stochastic_grad_desc_y_pred = stochastic_grad_desc.predict(X_test)

stochastic_grad_desc_scores = cross_val_score(stochastic_grad_desc, corpus.origTweetText, corpus.origMepGroupShort, cv = 8)

print(classification_report(y_test, stochastic_grad_desc_y_pred,target_names = political_parties))
print('SGD Classifier Accuracy: %s' % accuracy_score(stochastic_grad_desc_y_pred, y_test))
print('SGD Classifier Accuracy (CV):', stochastic_grad_desc_scores.mean())

              precision    recall  f1-score   support

        EFDD       0.61      0.39      0.48       289
         S&D       0.73      0.27      0.40       180
         EPP       0.56      0.91      0.69       462
        ALDE       0.68      0.68      0.68       337
  Greens-EFA       0.65      0.35      0.45        63
         ECR       0.62      0.38      0.47       208
     GUE-NGL       0.61      0.67      0.64       490

    accuracy                           0.61      2029
   macro avg       0.64      0.52      0.54      2029
weighted avg       0.62      0.61      0.59      2029

SGD Classifier Accuracy: 0.6091670773780187
SGD Classifier Accuracy (CV): 0.584046945764729


In [41]:
pd.set_option('display.max_rows', 2020)

testing_predictions = []
for i in range(len(X_test)):
    if stochastic_grad_desc_y_pred[i] == 'EFDD':
        testing_predictions.append('EFDD')
    elif stochastic_grad_desc_y_pred[i] == 'S&D':
        testing_predictions.append('S&D')
    elif stochastic_grad_desc_y_pred[i] == 'EPP':
        testing_predictions.append('EPP')
    elif stochastic_grad_desc_y_pred[i] == 'ALDE':
        testing_predictions.append('ALDE')
    elif stochastic_grad_desc_y_pred[i] == 'Greens-EFA':
        testing_predictions.append('Greens-EFA')
    elif stochastic_grad_desc_y_pred[i] == 'ECR':
        testing_predictions.append('ECR')
    else:
        testing_predictions.append('GUE-NGL')
        
check_df = pd.DataFrame({'actual_label': list(y_test), 'prediction': testing_predictions, 'abstract':list(X_test)})
check_df

Unnamed: 0,actual_label,prediction,abstract
0,S&D,S&D,worth repeating eu cost u very little and bene...
1,EFDD,EFDD,i understand tony blair wa in the ne today and...
2,EFDD,EFDD,rt excellent piece on how people from all walk...
3,EFDD,EFDD,the riskier option is to stay in the eu
4,ALDE,ALDE,homophobic statement by latvian president riga...
...,...,...,...
2024,EPP,EPP,nepal amp people in the region can count on eu...
2025,Greens-EFA,S&D,terryreintke asking timmermans on woman right ...
2026,EFDD,EFDD,one of the stupidiest thing ukip have had to o...
2027,EFDD,EFDD,snp never wanted independence in 1st place snp...


In [42]:
conditions = [check_df['actual_label'] == check_df['prediction']]
choices = [check_df['actual_label'], check_df['prediction']]

pd_check_df = pd.concat([check_df['actual_label'], check_df['prediction']], axis=1)
pd_check_df['result'] = np.where(check_df['actual_label'] == check_df['prediction'], 'no change', 'changed')

pd_check_df.groupby('result').count().reset_index()

Unnamed: 0,result,actual_label,prediction
0,changed,793,793
1,no change,1236,1236


# 2. Naive Bayes

When trying the Naive Bayes method, we see that its accuracy is approximately **50%**, 8% worse than this of the Stochastic Gradient Descent classifier. On top of that, when checking the respective *check_df* dataframe we can also see the differences per row in the prediction failures of each algorithm. For instance, the row with index 3 is correctly classified by the Naive Bayes algorithm while it is not by the Stochastic Gradient Descent algorithm. 

In [43]:
from sklearn.naive_bayes import MultinomialNB

naiveBayes = Pipeline(steps=[('v', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

naiveBayes.fit(X_train, y_train)
naiveBayes_y_pred = naiveBayes.predict(X_test)

naiveBayes_scores = cross_val_score(naiveBayes, corpus.origTweetText, corpus.origMepGroupShort, cv = 9)

print(classification_report(y_test, naiveBayes_y_pred,target_names = political_parties))
print('Naive Bayes Classifier Accuracy: %s' % accuracy_score(naiveBayes_y_pred, y_test))
print('Naive Bayes Classifier Accuracy (CV):', naiveBayes_scores.mean())

              precision    recall  f1-score   support

        EFDD       0.69      0.15      0.24       289
         S&D       1.00      0.03      0.06       180
         EPP       0.49      0.94      0.64       462
        ALDE       0.76      0.49      0.60       337
  Greens-EFA       0.00      0.00      0.00        63
         ECR       0.92      0.06      0.11       208
     GUE-NGL       0.44      0.75      0.55       490

    accuracy                           0.51      2029
   macro avg       0.61      0.35      0.32      2029
weighted avg       0.62      0.51      0.43      2029

Naive Bayes Classifier Accuracy: 0.5066535239034007
Naive Bayes Classifier Accuracy (CV): 0.49916198363403336


  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
pd.set_option('display.max_rows', 2020)

testing_predictions = []
for i in range(len(X_test)):
    if naiveBayes_y_pred[i] == 'EFDD':
        testing_predictions.append('EFDD')
    elif naiveBayes_y_pred[i] == 'S&D':
        testing_predictions.append('S&D')
    elif naiveBayes_y_pred[i] == 'EPP':
        testing_predictions.append('EPP')
    elif naiveBayes_y_pred[i] == 'ALDE':
        testing_predictions.append('ALDE')
    elif naiveBayes_y_pred[i] == 'Greens-EFA':
        testing_predictions.append('Greens-EFA')
    elif naiveBayes_y_pred[i] == 'ECR':
        testing_predictions.append('ECR')
    else:
        testing_predictions.append('GUE-NGL')
        
check_df = pd.DataFrame({'actual_label': list(y_test), 'prediction': testing_predictions, 'abstract':list(X_test)})
check_df

Unnamed: 0,actual_label,prediction,abstract
0,S&D,EFDD,worth repeating eu cost u very little and bene...
1,EFDD,EFDD,i understand tony blair wa in the ne today and...
2,EFDD,EFDD,rt excellent piece on how people from all walk...
3,EFDD,EFDD,the riskier option is to stay in the eu
4,ALDE,S&D,homophobic statement by latvian president riga...
...,...,...,...
2024,EPP,EPP,nepal amp people in the region can count on eu...
2025,Greens-EFA,S&D,terryreintke asking timmermans on woman right ...
2026,EFDD,EFDD,one of the stupidiest thing ukip have had to o...
2027,EFDD,EFDD,snp never wanted independence in 1st place snp...


In [45]:
conditions = [check_df['actual_label'] == check_df['prediction']]
choices = [check_df['actual_label'], check_df['prediction']]

pd_check_df = pd.concat([check_df['actual_label'], check_df['prediction']], axis=1)
pd_check_df['result'] = np.where(check_df['actual_label'] == check_df['prediction'], 'no change', 'changed')

pd_check_df.groupby('result').count().reset_index()

Unnamed: 0,result,actual_label,prediction
0,changed,1001,1001
1,no change,1028,1028


Let as proceed to the investigation of another algorithm that will have an accuracy higher than this of the SGD algorithm which is for now the best algorithm we have.

# 3. Logistic Regression

In [46]:
from sklearn.linear_model import LogisticRegression

logistic_regr = Pipeline(steps=[('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=3, C=3)),
               ])

logistic_regr.fit(X_train, y_train)
logistic_regr_y_pred = logistic_regr.predict(X_test)

logistic_regr_scores = cross_val_score(logistic_regr, corpus.origTweetText, corpus.origMepGroupShort, cv=8)

print(classification_report(y_test, logistic_regr_y_pred, target_names = political_parties))
print('Logistic Regression Classifier Accuracy: %s' % accuracy_score(logistic_regr_y_pred, y_test))
print('Logistic Regression Classifier (CV): ', logistic_regr_scores.mean())

              precision    recall  f1-score   support

        EFDD       0.54      0.50      0.52       289
         S&D       0.63      0.31      0.42       180
         EPP       0.66      0.85      0.75       462
        ALDE       0.68      0.67      0.68       337
  Greens-EFA       0.83      0.32      0.46        63
         ECR       0.58      0.38      0.45       208
     GUE-NGL       0.59      0.71      0.65       490

    accuracy                           0.62      2029
   macro avg       0.65      0.53      0.56      2029
weighted avg       0.63      0.62      0.61      2029

Logistic Regression Classifier Accuracy: 0.6249383932971907
Logistic Regression Classifier (CV):  0.5880881992286606


Multiclass logistic regression has a slightly better accuracy than this of SGC algorithm that counts to **58,80%**.

# Stochastic Gradient Descent (tuned hyperparameters)

Since for now the most promissing classifiers are Logistic Regression and SGD Classifier, we will try to tune the  hyperparameters of SGD Classifier to check if we can get a higher accurancy.

In [38]:
from sklearn.model_selection import GridSearchCV
import pprint
from time import time

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80)
}

gridSearch = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Doing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])

t0 = time()
gridSearch.fit(corpus.origTweetText, corpus.origMepGroupShort)
print("Completed in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % gridSearch.best_score_)
print("Best parameters set:")
best_parameters = gridSearch.best_estimator_.get_params()
for parameter_name in sorted(parameters.keys()):
    print("\t%s: %r" % (parameter_name, best_parameters[parameter_name]))

Doing grid search...
pipeline: ['vect', 'tfidf', 'clf']
Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 23.3min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 33.6min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 40.9min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 54.6min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 69.9min finished


Completed in 4198.297s

Best score: 0.593
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 80
	clf__penalty: 'elasticnet'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 2)


In [48]:
stochastic_grad_desc_gs = Pipeline(steps=[('vect', CountVectorizer(max_df=0.5, max_features=None, ngram_range=(1,2))), 
                      ('tfidf', TfidfTransformer(use_idf=True)),
                ('clf', SGDClassifier(alpha=1e-05, penalty='elasticnet', random_state=42, max_iter=80)),
               ])

stochastic_grad_desc_gs.fit(X_train, y_train)
stochastic_grad_desc_gs_y_pred = stochastic_grad_desc_gs.predict(X_test)

stochastic_grad_desc_gs_scores = cross_val_score(stochastic_grad_desc_gs, corpus.origTweetText, corpus.origMepGroupShort, cv = 8)

print(classification_report(y_test, stochastic_grad_desc_gs_y_pred, target_names = political_parties))
print('SGD Classifier Accuracy: %s' % accuracy_score(stochastic_grad_desc_gs_y_pred, y_test))
print('SGD Classifier Accuracy (CV): ', stochastic_grad_desc_gs_scores.mean())

              precision    recall  f1-score   support

        EFDD       0.57      0.54      0.55       289
         S&D       0.55      0.36      0.43       180
         EPP       0.68      0.85      0.76       462
        ALDE       0.64      0.68      0.66       337
  Greens-EFA       0.70      0.37      0.48        63
         ECR       0.55      0.43      0.49       208
     GUE-NGL       0.65      0.67      0.66       490

    accuracy                           0.63      2029
   macro avg       0.62      0.56      0.58      2029
weighted avg       0.63      0.63      0.62      2029

SGD Classifier Accuracy: 0.6333169048792509
SGD Classifier Accuracy (CV):  0.5978499193305431


It seems that the tuned hyperparameters in SGC algorithm do not result in a better result than this of the simple SGC algorithm.

# Feature Hashing

In [49]:
from sklearn.feature_extraction.text import HashingVectorizer

text_clf = Pipeline([('v', HashingVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=0.0001,
                                           random_state=40))
                    ])

text_clf.fit(X_train, y_train)
text_clf_predicted = text_clf.predict(X_test)

text_clf_scores = cross_val_score(text_clf, corpus.origTweetText, corpus.origMepGroupShort, cv=8)

print(classification_report(y_test, text_clf_predicted, target_names=political_parties))
print('Feature Hashing Classifier Accuracy: %s' % accuracy_score(text_clf_predicted, y_test))
print('Feature Hashing Classifier Accuracy (CV): ', text_clf_scores.mean())

              precision    recall  f1-score   support

        EFDD       0.57      0.50      0.53       289
         S&D       0.67      0.36      0.46       180
         EPP       0.65      0.87      0.74       462
        ALDE       0.69      0.69      0.69       337
  Greens-EFA       0.68      0.41      0.51        63
         ECR       0.57      0.43      0.49       208
     GUE-NGL       0.66      0.70      0.68       490

    accuracy                           0.64      2029
   macro avg       0.64      0.57      0.59      2029
weighted avg       0.64      0.64      0.63      2029

Feature Hashing Classifier Accuracy: 0.6426811237062593
Feature Hashing Classifier Accuracy (CV):  0.6027796416682643


When using feature hashing algorithm, we managed to get almost 60.3% accurancy with cross validation. 
Let as now gather the results of all the above algorithms so as to compare them and decide which is the best one implemented.

So finally, as we can see in the *Classifier Accuracy_CV* column, <font color=red>**the feature hashing algorithm is the top performer algorithm**</font> in terms of accuracy with cross validation followed by the SGD algorithm with tuned parameters and then the multiclass logistic regression. 

In [50]:
accuracy_scores = pd.DataFrame(columns=['Classifier Type','Classifier Accuracy', 'Classifier Accuracy_CV'])  
accuracy_scores = accuracy_scores.append({'Classifier Type':'Stochastic Gradient Descent' ,'Classifier Accuracy' : accuracy_score(stochastic_grad_desc_y_pred, y_test) , 'Classifier Accuracy_CV' : stochastic_grad_desc_scores.mean()} , ignore_index=True)
accuracy_scores = accuracy_scores.append({'Classifier Type':'Naive Bayes','Classifier Accuracy' : accuracy_score(naiveBayes_y_pred, y_test) , 'Classifier Accuracy_CV' : naiveBayes_scores.mean()} , ignore_index=True)
accuracy_scores = accuracy_scores.append({'Classifier Type':'Logistic Regression', 'Classifier Accuracy' : accuracy_score(logistic_regr_y_pred, y_test) , 'Classifier Accuracy_CV' : logistic_regr_scores.mean()} , ignore_index=True)
accuracy_scores = accuracy_scores.append({'Classifier Type':'SGD (tuned hyperparameters)', 'Classifier Accuracy' : accuracy_score(stochastic_grad_desc_gs_y_pred, y_test) , 'Classifier Accuracy_CV' : stochastic_grad_desc_gs_scores.mean()} , ignore_index=True)
accuracy_scores = accuracy_scores.append({'Classifier Type':'Feature Hashing', 'Classifier Accuracy' : accuracy_score(text_clf_predicted, y_test) , 'Classifier Accuracy_CV' : text_clf_scores.mean()} , ignore_index=True)

accuracy_scores

Unnamed: 0,Classifier Type,Classifier Accuracy,Classifier Accuracy_CV
0,Stochastic Gradient Descent,0.609167,0.584047
1,Naive Bayes,0.506654,0.499162
2,Logistic Regression,0.624938,0.588088
3,SGD (tuned hyperparameters),0.633317,0.59785
4,Feature Hashing,0.642681,0.60278


In [51]:
accuracy_scores.loc[accuracy_scores['Classifier Accuracy'].idxmax()]

Classifier Type           Feature Hashing
Classifier Accuracy              0.642681
Classifier Accuracy_CV            0.60278
Name: 4, dtype: object