# Brexit Polarity Tweets

Twitter datasets: Brexit related tweets from pro and anti Brexit accounts January - March 2022 (Brexit leaning based on their Twitter bios)

This Twitter dataset covers the January - March 2022 period and comprises tweets relating to Brexit or Europe from Twitter accounts with publicly stated Brexit positions in their bio.


# Connecting to Google Drive

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


# NLTK settings and libraries import

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from sklearn.utils import shuffle
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, f1_score
import pandas as pd
import re

# Datasets loading

In [4]:
path_anti_brexit = "/content/drive/MyDrive/Progetti/personal/brexit/TweetDataset_AntiBrexit_Jan-Mar2022.csv"
path_pro_brexit = "/content/drive/MyDrive/Progetti/personal/brexit/TweetDataset_ProBrexit_Jan-Mar2022.csv"

dataset_anti = pd.read_csv(path_anti_brexit)
dataset_pro = pd.read_csv(path_pro_brexit)

# Dataset Anti-Brexit

In [5]:
dataset_anti.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210574 entries, 0 to 210573
Data columns (total 38 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             210574 non-null  int64  
 1   Date                   210574 non-null  object 
 2   Headline               0 non-null       float64
 3   URL                    210574 non-null  object 
 4   Opening Text           0 non-null       float64
 5   Hit Sentence           210574 non-null  object 
 6   Source                 210574 non-null  object 
 7   Influencer             210574 non-null  object 
 8   Country                210574 non-null  object 
 9   Subregion              0 non-null       float64
 10  Language               210574 non-null  object 
 11  Reach                  210574 non-null  int64  
 12  Desktop Reach          210574 non-null  int64  
 13  Mobile Reach           210574 non-null  int64  
 14  Twitter Social Echo    0 non-null   

In [6]:
dataset_anti.head()

Unnamed: 0.1,Unnamed: 0,Date,Headline,URL,Opening Text,Hit Sentence,Source,Influencer,Country,Subregion,...,Twitter Screen Name,User Profile Url,Twitter Bio,Twitter Followers,Twitter Following,Alternate Date Format,Time,State,City,Document Tags
0,0,09-Jan-2022 11:58PM,,https://twitter.com/nickynicky77551/statuses/1...,,"RT @cnapan: #JohnsonOut is now running at 218,...",Twitter,@nickynicky77551,Unknown,,...,nicky@nicky7755,https://twitter.com/nickynicky77551,"Ardent Remainer; Rejoiner now, Support NHS 💙 #...",1697.0,2576.0,"Jan 9, 2022",11:58 PM,,,
1,1,09-Jan-2022 11:58PM,,https://twitter.com/lines12345/statuses/148032...,,RT @Femi_Sorry: My point: Any Labour leader wh...,Twitter,@lines12345,Unknown,,...,Helen,https://twitter.com/lines12345,Tory hating Remainer,11.0,92.0,"Jan 9, 2022",11:58 PM,,,
2,2,09-Jan-2022 11:58PM,,https://twitter.com/millymoo97/statuses/148032...,,RT @andy_murray: QT @Nigel_Farage: Please reco...,Twitter,@millymoo97,Unknown,,...,seonaid mcgill,https://twitter.com/millymoo97,"Scottish, European, anti-Brexit, pro-EU, freel...",2129.0,3880.0,"Jan 9, 2022",11:58 PM,,,
3,3,09-Jan-2022 11:58PM,,https://twitter.com/ValueSurplus/statuses/1480...,,RT @mikegalsworthy: We’re the first country in...,Twitter,@valuesurplus,United Kingdom,,...,Rightwing politics is a social virus #BLM 🇵🇸🌈,https://twitter.com/ValueSurplus,Classical Cultural Marxist.\nAnti Brexit estab...,2420.0,4952.0,"Jan 9, 2022",11:58 PM,,,
4,4,09-Jan-2022 11:57PM,,https://twitter.com/Jackcdawes/statuses/148032...,,"RT @StevePeers: QT @andy_murray: Game, set and...",Twitter,@jackcdawes,United Kingdom,,...,Jack Dawes #FBPEGlobal #JohnsonOut #RejoinEU,https://twitter.com/Jackcdawes,"Lover of crime & horror fiction, history, & Me...",18304.0,19889.0,"Jan 9, 2022",11:57 PM,England,London,


In [7]:
dataset_anti['Sentiment'].value_counts()

Neutral      118081
Negative      73933
Positive      15149
Not Rated      3411
Name: Sentiment, dtype: int64

# Building a dataset with only two columns: Hit Sentence and Sentiment

In [8]:
anti = dataset_anti[['Hit Sentence', 'Sentiment']].copy()
anti.describe()

Unnamed: 0,Hit Sentence,Sentiment
count,210574,210574
unique,102442,4
top,RT @donaldtuskEPP: Boris Johnson likens Ukrain...,Neutral
freq,260,118081


# Dataset Pro-Brexit

In [9]:
dataset_pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147631 entries, 0 to 147630
Data columns (total 38 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             147631 non-null  int64  
 1   Date                   147631 non-null  object 
 2   Headline               0 non-null       float64
 3   URL                    147631 non-null  object 
 4   Opening Text           0 non-null       float64
 5   Hit Sentence           147631 non-null  object 
 6   Source                 147631 non-null  object 
 7   Influencer             147631 non-null  object 
 8   Country                147631 non-null  object 
 9   Subregion              0 non-null       float64
 10  Language               147631 non-null  object 
 11  Reach                  147631 non-null  int64  
 12  Desktop Reach          147631 non-null  int64  
 13  Mobile Reach           147631 non-null  int64  
 14  Twitter Social Echo    0 non-null   

In [10]:
dataset_pro.head()

Unnamed: 0.1,Unnamed: 0,Date,Headline,URL,Opening Text,Hit Sentence,Source,Influencer,Country,Subregion,...,Twitter Screen Name,User Profile Url,Twitter Bio,Twitter Followers,Twitter Following,Alternate Date Format,Time,State,City,Document Tags
0,0,15-Jan-2022 11:58PM,,https://twitter.com/dianemcgregor8/statuses/14...,,RT @re11ddy: QT @sandieshoes: Do this if you v...,Twitter,@dianemcgregor8,Cameroon,,...,Diane M McGregor,https://twitter.com/dianemcgregor8,Veggie burger! Lover of all Animals and Nature...,122.0,185.0,"Jan 15, 2022",11:58 PM,North,,
1,1,15-Jan-2022 11:57PM,,http://twitter.com/DeniseMembery/statuses/1482...,,RT @RickSacrop: Michael Fabricant is absolutel...,Twitter,@DeniseMembery,United Kingdom,,...,Denise M.,http://www.twitter.com/DeniseMembery,"🇬🇧✡️ Proud to be British. 🇬🇧 Animal lover, m...",5721.0,6291.0,"Jan 15, 2022",11:57 PM,England,,
2,2,15-Jan-2022 11:56PM,,https://twitter.com/007Stirling/statuses/14825...,,@StokieDrew2 I do find it rich; that Tory vote...,Twitter,@007stirling,United Kingdom,,...,Finlay Stirling 🇬🇧☘️❤️🏴󠁧󠁢󠁳󠁣󠁴󠁿,https://twitter.com/007Stirling,BRITISH/IRISH/ULSTER-SCOTS PUL\nRIGHT-WING. \n...,7219.0,7433.0,"Jan 15, 2022",11:56 PM,Northern Ireland,Belfast,
3,3,15-Jan-2022 11:54PM,,https://twitter.com/UKleadstheworld/statuses/1...,,RT @denistmurray: Brexit Britain win as London...,Twitter,@ukleadstheworld,United Kingdom,,...,BestofBritish,https://twitter.com/UKleadstheworld,"GB News, Conservatives, Brexiteer, Block all ads.",700.0,1014.0,"Jan 15, 2022",11:54 PM,,,
4,4,15-Jan-2022 11:53PM,,https://twitter.com/paul_beamish/statuses/1482...,,@afneil If Johnson is taken down by The Tories...,Twitter,@paul_beamish,United Kingdom,,...,Paul Beamish,https://twitter.com/paul_beamish,Brexiteer and Shipbuilder (Plater/Welder) livi...,200.0,822.0,"Jan 15, 2022",11:53 PM,England,,


In [11]:
dataset_pro['Sentiment'].value_counts()

Neutral      85812
Negative     44829
Positive     14584
Not Rated     2406
Name: Sentiment, dtype: int64

# Building a dataset with only two columns: Hit Sentence and Sentiment

In [12]:
pro = dataset_pro[['Hit Sentence', 'Sentiment']].copy()
pro.describe()

Unnamed: 0,Hit Sentence,Sentiment
count,147631,147631
unique,94652,4
top,RT @FrankMcCann2: It transpires that the BBC h...,Neutral
freq,110,85812


# Building a merge dataset

In [13]:
dataset = pd.concat([anti, pro], ignore_index=True)
dataset = shuffle(dataset)
dataset

Unnamed: 0,Hit Sentence,Sentiment
160651,RT @supertanskiii: The Bercow report was relea...,Neutral
47056,@BBCNews @BBCNews HOW CAN YOU EVEN THINK THIS ...,Neutral
39418,RT @ChrisBurn_Post: Today’s findings from the ...,Neutral
1003,RT @harpohap: Guardian’s Cadwalladr in court t...,Neutral
307588,Did they mention parties gate did they mention...,Neutral
...,...,...
218179,RT @CatharineHoey: QT @julianHjessop: Just ano...,Negative
341330,RT @ubique60: @grumpygit2 1. Show us the Evide...,Neutral
104784,Any thoughts Truss Dorries!! Thought not!! REV...,Neutral
25649,QT @DeborahMeaden: Quite. ; Those quoting “ he...,Neutral


# Data cleaning

In [14]:
wn = WordNetLemmatizer()

def text_preprocessing(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [word for word in text if word not in stopwords.words('english')]
    text = [wn.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

In [15]:
dataset['Hit Sentence'] = dataset['Hit Sentence'].apply(text_preprocessing)

In [16]:
dataset

Unnamed: 0,Hit Sentence,Sentiment
160651,rt supertanskiii bercow report released unreda...,Neutral
47056,bbcnews bbcnews even think honest journalism o...,Neutral
39418,rt chrisburn post today finding treasury commi...,Neutral
1003,rt harpohap guardian cadwalladr court fight de...,Neutral
307588,mention party gate mention brexiteer conservat...,Neutral
...,...,...
218179,rt catharinehoey qt julianhjessop another reas...,Negative
341330,rt ubique grumpygit show u evidence russian in...,Neutral
104784,thought truss dorries thought revealed share e...,Neutral
25649,qt deborahmeaden quite quoting got brexit done...,Neutral


# Train - validation - test split

In [17]:
X = dataset['Hit Sentence']
y = dataset['Sentiment']

In [18]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=123)
X_validation, X_test, y_validation, y_test = train_test_split(X_validation, y_validation, test_size=0.2, random_state=123)

print('Training Data : ', X_train.shape)
print('Validation Data : ', X_validation.shape)
print('Test Data : ', X_test.shape)

Training Data :  (286564,)
Validation Data :  (57312,)
Test Data :  (14329,)


# Feature Extraction - TFIDF - Training

In [19]:
from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='perceptron'
                                            )),
                      ])

text_clf = text_clf.fit(X_train, y_train)

## Validation-set performance

In [20]:
y_pred = text_clf.predict(X_validation)

accuracy = accuracy_score(y_validation, y_pred)
#precision = precision_score(y_validation, y_pred)
#recall = recall_score(y_validation, y_pred)
f1 = f1_score(y_validation, y_pred, average='weighted')
print('accuracy: ', accuracy)
print('f1:', f1)

accuracy:  0.8335252652149637
f1: 0.832971657155422


## Test-set performance

In [21]:
y_pred = text_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
#precision = precision_score(y_validation, y_pred)
#recall = recall_score(y_validation, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print('accuracy: ', accuracy)
print('f1:', f1)

accuracy:  0.8279014585805011
f1: 0.8273340641445237
