In [1]:
from utils import *

# Root folder for the quote archives and exported results, relative to this notebook.
DATA_PATH = '../Data/'
# Name of the sentiment backend; passed as `model=` to the scoring helper and
# embedded in the exported CSV file names.
MODEL = "BERT"

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/antoinecrettenand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/antoinecrettenand/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing for Sentiment Analysis
We run a baseline sentiment analysis using the TextBlob implementation on our dataset, i.e., aggregated quotes that mention candidates of the 2012, 2016 and 2020 U.S. elections. The goals of this preliminary analysis are:
* Explore the distribution of positive/negative sentiment among quotes mentioning political candidates
* Partially visualise the correctness of the baseline implementation through word clouds

We calculated the general sentiment of each quote using the VADER model of the Natural Language Toolkit (NLTK). This widely used open-source algorithm assigns a sentiment score in the range [−1, 1]. There are several approaches for identifying sentiment at the sentence level (such as LIWC). However, VADER is preferred for our needs because it is sensitive to social media sentiment and can easily be adjusted to a specific domain.

In [2]:
# load data
def _read_zipped_csv(archive_path):
    """Read one zip-compressed CSV archive into a DataFrame."""
    return pd.read_csv(archive_path, compression="zip")


# 2012 election
df_obama_2012 = _read_zipped_csv(f"{DATA_PATH}2012/obama_2012.zip")
df_romney_2012 = _read_zipped_csv(f"{DATA_PATH}2012/romney_2012.zip")

# 2016 election
df_trump_2016 = _read_zipped_csv(f"{DATA_PATH}2016/trump_2016.zip")
df_clinton_2016 = _read_zipped_csv(f"{DATA_PATH}2016/clinton_2016.zip")

# 2020 election
df_trump_2020 = _read_zipped_csv(f"{DATA_PATH}2020/trump_2020.zip")
df_biden_2020 = _read_zipped_csv(f"{DATA_PATH}2020/biden_2020.zip")

## Text preprocessing
Before applying sentiment analysis, our baseline text preprocessing consists of:
* Make text lowercase
* Remove punctuation
* Remove stopwords
* Lemmatization

See the implementation in [utils_preprocessing.py](utils_preprocessing.py) for more details.

In [3]:
# preprocess data for sentiment analysis
# Names of the derived text columns requested from the preprocessing helper.
tags = [
    'quotation_lemmatized',
    'quotation_stemmed',
    'quotation_tokenized',
    'quotation_conc_lemmatized',
]

# Run the helper over every candidate frame (same call order as listed) and
# rebind each name to the frame it returns.
(
    df_trump_2016,
    df_trump_2020,
    df_clinton_2016,
    df_biden_2020,
    df_obama_2012,
    df_romney_2012,
) = [
    preprocess_data_for_sentiment_analysis(candidate_frame, tags=tags)
    for candidate_frame in (
        df_trump_2016,
        df_trump_2020,
        df_clinton_2016,
        df_biden_2020,
        df_obama_2012,
        df_romney_2012,
    )
]

# Preview one preprocessed frame as the cell output.
df_trump_2016.head()

[process_sa] Prepared for sentiment analysis with tags: ['quotation_lemmatized', 'quotation_stemmed', 'quotation_tokenized', 'quotation_conc_lemmatized']
[process_sa] Prepared for sentiment analysis with tags: ['quotation_lemmatized', 'quotation_stemmed', 'quotation_tokenized', 'quotation_conc_lemmatized']
[process_sa] Prepared for sentiment analysis with tags: ['quotation_lemmatized', 'quotation_stemmed', 'quotation_tokenized', 'quotation_conc_lemmatized']
[process_sa] Prepared for sentiment analysis with tags: ['quotation_lemmatized', 'quotation_stemmed', 'quotation_tokenized', 'quotation_conc_lemmatized']
[process_sa] Prepared for sentiment analysis with tags: ['quotation_lemmatized', 'quotation_stemmed', 'quotation_tokenized', 'quotation_conc_lemmatized']
[process_sa] Prepared for sentiment analysis with tags: ['quotation_lemmatized', 'quotation_stemmed', 'quotation_tokenized', 'quotation_conc_lemmatized']


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,month,type,candidate,quotation_tokenized,quotation_stemmed,quotation_lemmatized,quotation_conc_lemmatized
0,3,1709,2016-07-07-033290,I didn't want to delete it -- I would have nev...,Donald J. Trump,['Q22686'],2016-07-07 01:49:02,4,"[['Donald J. Trump', '0.7791'], ['None', '0.21...",['http://mobile.nytimes.com/2016/07/07/us/poli...,E,Jul,NY Times,Donald Trump,"[i, did not, want, to, delete, it, i, would, h...","[did not, want, delet, would, never, delet, pe...","[did not, want, delete, would, never, deleted,...",did not want delete would never deleted people...
1,9,6715,2016-07-25-126140,"Wow, the Republican Convention went so smoothl...",Donald Trump,"['Q22686', 'Q27947481']",2016-07-25 00:00:00,171,"[['Donald Trump', '0.8662'], ['None', '0.1103'...",['http://dailyherald.com/article/20160725/news...,E,Jul,NY Times,Donald Trump,"[wow, the, , convention, went, so, smoothly, c...","[wow, , convent, went, smoothli, compar, dem, ...","[wow, , convention, went, smoothly, compared, ...",wow convention went smoothly compared dems to...
2,11,7112,2016-08-09-004561,"Although, the Second Amendment people -- maybe...",Donald Trump,"['Q22686', 'Q27947481']",2016-08-09 00:00:00,1183,"[['Donald Trump', '0.6569'], ['None', '0.2428'...",['http://onenewspage.com/video/20160809/525505...,E,Aug,NY Times,Donald Trump,"[although, the, second, amendment, people, , t...","[although, second, amend, peopl, , , do not, k...","[although, second, amendment, people, , , do n...",although second amendment people do not know
3,15,11168,2016-09-07-092168,She's totally unfit to be our commander in chief.,Donald J. Trump,['Q22686'],2016-09-07 16:38:49,2,"[['Donald J. Trump', '0.4223'], ['Donald Trump...",['http://www.nytimes.com/2016/09/08/us/politic...,E,Sep,NY Times,Donald Trump,"[she is, totally, unfit, to, , our, commander,...","[she i, total, unfit, , command, chief]","[she is, totally, unfit, , commander, chief]",she is totally unfit commander chief
4,16,13945,2016-09-07-003185,A Trump supporter is fighting against just abo...,Brad Pitt,"['Q35332', 'Q373912']",2016-09-07 10:34:00,16,"[['Brad Pitt', '0.3934'], ['None', '0.349'], [...",['http://eonline.com/news/792842/brad-pitt-wei...,E,Sep,NY Times,Donald Trump,"[a, trump, supporter, , fighting, against, jus...","[trump, support, , fight, , everyth]","[trump, supporter, , fighting, , everything]",trump supporter fighting everything


## Sentiment Analysis
Our baseline sentiment analysis uses TextBlob's sentiment analysis implementation, which makes use of NLTK and pattern. The sentiment property is a namedtuple of the form Sentiment(polarity, subjectivity). The polarity score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective. We separate the tuple and append two columns with the retrieved values.

In [7]:
# Folder prefix under DATA_PATH where the scored quote CSVs are written.
sentiment_analysis_datapath = DATA_PATH + "preprocessed/"

In [4]:
# Score the Obama 2012 quotes on their concatenated lemmatized text with the
# configured backend, then export the result as CSV.
df_obama_2012_sa = expand_quotations_with_polarity_subjectivity(
    df_obama_2012,
    column='quotation_conc_lemmatized',
    model=MODEL,
)
obama_2012_csv = f'{sentiment_analysis_datapath}2012_obama_quotes_{MODEL}_processed.csv'
df_obama_2012_sa.to_csv(obama_2012_csv)

Processing sentiment analysis with BERT
Processed dataset (13403, 19) with BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768

KeyboardInterrupt: 

In [6]:
# Score the Romney 2012 quotes on their concatenated lemmatized text with the
# configured backend, then export the result as CSV.
df_romney_2012_sa = expand_quotations_with_polarity_subjectivity(
    df_romney_2012,
    column='quotation_conc_lemmatized',
    model=MODEL,
)
romney_2012_csv = f'{sentiment_analysis_datapath}2012_romney_quotes_{MODEL}_processed.csv'
df_romney_2012_sa.to_csv(romney_2012_csv)

In [8]:
# Score the Trump 2016 quotes on their concatenated lemmatized text with the
# configured backend, then export the result as CSV.
df_trump_2016_sa = expand_quotations_with_polarity_subjectivity(
    df_trump_2016,
    column='quotation_conc_lemmatized',
    model=MODEL,
)
trump_2016_csv = f'{sentiment_analysis_datapath}2016_trump_quotes_{MODEL}_processed.csv'
df_trump_2016_sa.to_csv(trump_2016_csv)

Processing sentiment analysis with BERT
Processed dataset (10092, 19) with BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768

In [9]:
# Score the Clinton 2016 quotes on their concatenated lemmatized text with the
# configured backend, then export the result as CSV.
df_clinton_2016_sa = expand_quotations_with_polarity_subjectivity(
    df_clinton_2016,
    column='quotation_conc_lemmatized',
    model=MODEL,
)
clinton_2016_csv = f'{sentiment_analysis_datapath}2016_clinton_quotes_{MODEL}_processed.csv'
df_clinton_2016_sa.to_csv(clinton_2016_csv)

Processing sentiment analysis with BERT
Processed dataset (4632, 19) with BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [10]:
# Score the Trump 2020 quotes on their concatenated lemmatized text with the
# configured backend, then export the result as CSV.
df_trump_2020_sa = expand_quotations_with_polarity_subjectivity(
    df_trump_2020,
    column='quotation_conc_lemmatized',
    model=MODEL,
)
trump_2020_csv = f'{sentiment_analysis_datapath}2020_trump_quotes_{MODEL}_processed.csv'
df_trump_2020_sa.to_csv(trump_2020_csv)

Processing sentiment analysis with BERT
Processed dataset (6976, 19) with BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [11]:
# Score the Biden 2020 quotes on their concatenated lemmatized text with the
# configured backend, then export the result as CSV.
df_biden_2020_sa = expand_quotations_with_polarity_subjectivity(
    df_biden_2020,
    column='quotation_conc_lemmatized',
    model=MODEL,
)
biden_2020_csv = f'{sentiment_analysis_datapath}2020_biden_quotes_{MODEL}_processed.csv'
df_biden_2020_sa.to_csv(biden_2020_csv)

Processing sentiment analysis with BERT
Processed dataset (2733, 19) with BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768,