# Analyse tweets and articles on the 2021 Swiss CO2 law
First, we import the dependencies and variables used throughout the notebook. All following cells depend on the first two cells being run. The commented-out pandas `set_option` calls are optional, but recommended for easier reading of the data.

In [1]:
import pandas as pd
import os
import nltk.data
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# Fetch the Punkt sentence-tokenizer data that a later cell loads via `nltk.data.load`.
nltk.download('punkt')

# pd.set_option('display.min_rows', 400)
# pd.set_option('display.max_rows', 400)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', 100)
# pd.set_option('display.width', 1000)
# pd.set_option('display.colheader_justify', 'center')
# pd.set_option('display.precision', 3)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Import Twitter data and exclude retweets.
# `.copy()` detaches the filtered frame from the raw one, so that later cells can add
# columns to `tweets` without pandas emitting a SettingWithCopyWarning.
tweets = pd.read_csv('Twitter/23.-30.4/Twitter_Week2.csv')
tweets = tweets[tweets['isRetweet'] == False].copy()

# Use a sample of the dataset for tests. Set TWEET_SAMPLE_SIZE to None to use the whole dataset.
TWEET_SAMPLE_SIZE = 10
if TWEET_SAMPLE_SIZE is not None:
    # Cap the sample size: `DataFrame.sample` raises a ValueError when asked for
    # more rows than the frame contains.
    tweets = tweets.sample(min(TWEET_SAMPLE_SIZE, len(tweets)), random_state=42)

In [2]:
def load_articles(path):
    """Read every regular file located directly inside `path`.

    Returns a tuple `(files, articles)`: the file names in sorted order and the
    text content of each file, in the same order.
    """
    # `os.listdir` returns entries in arbitrary order; sorting keeps the order of
    # files (and of every table built from them) identical between runs.
    files = sorted(file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file)))
    articles = []
    for file in files:
        # NOTE(review): the files are decoded with the platform's default encoding and
        # undecodable bytes are silently dropped (errors='ignore'). Confirm the
        # articles' encoding so that umlauts and accents are not lost.
        with open(os.path.join(path, file), 'r', errors='ignore') as f:
            articles.append(f.read())
    return files, articles


# Import all German articles.
de_path = 'Articles/Zeitungsartikel_DE/'
de_files, de_articles = load_articles(de_path)

# Import all French articles.
fr_path = 'Articles/Zeitungsartikel_FR/'
fr_files, fr_articles = load_articles(fr_path)

## Translate tweets and articles from German and French to English.

In [4]:
# Load the tokenizers and the models.
# Each checkpoint name is used twice: once for the tokenizer, once for the seq2seq model.
de_model_name, fr_model_name = 'Helsinki-NLP/opus-mt-de-en', 'Helsinki-NLP/opus-mt-fr-en'

# German checkpoint.
de_tokenizer, de_model = (
    AutoTokenizer.from_pretrained(de_model_name),
    AutoModelForSeq2SeqLM.from_pretrained(de_model_name),
)

# French checkpoint.
fr_tokenizer, fr_model = (
    AutoTokenizer.from_pretrained(fr_model_name),
    AutoModelForSeq2SeqLM.from_pretrained(fr_model_name),
)

In [None]:
# Translate tweets.
# The German model and tokenizer are loaded under the names `de_model` and `de_tokenizer`.
de_translation = pipeline("translation_de_to_en", model=de_model, tokenizer=de_tokenizer)
processed_text = [ele['translation_text'] for ele in de_translation(tweets.text.to_list())]

# `DataFrame.insert` raises a ValueError if the column already exists, so overwrite
# the column instead when this cell is run a second time.
if 'processed_text' in tweets.columns:
    tweets['processed_text'] = processed_text
else:
    tweets.insert(loc=1, column='processed_text', value=processed_text)

os.makedirs('output', exist_ok=True)  # `to_csv` does not create missing directories.
tweets.to_csv('output/tweets_processed.csv')

In [5]:
# Translate articles and save English versions.
splitter = nltk.data.load('tokenizers/punkt/english.pickle')  # English splitter, kept under its original name.
# The source texts are German and French, so split them with the matching Punkt models.
de_splitter = nltk.data.load('tokenizers/punkt/german.pickle')
fr_splitter = nltk.data.load('tokenizers/punkt/french.pickle')

translated_path = 'Articles/Zeitungsartikel_uebersetzt/'
TRANSLATION_BATCH_SIZE = 16  # Sentences translated per forward pass, to bound memory use.


def translate_articles(articles, files, sentence_splitter, tokenizer, model):
    """Translate every article to English and write one text file per article.

    `articles` is modified in place: each entry is replaced by the list of its
    source-language sentences. Entries that already are lists (because the cell
    has been run before) are reused as they are.
    The translation of `files[i]` is written to `<translated_path><files[i]>_en.txt`.
    """
    os.makedirs(translated_path, exist_ok=True)
    for i, article in enumerate(articles):
        # Split articles into sentences, unless a previous run already did so.
        sentences = article if isinstance(article, list) else sentence_splitter.tokenize(article)
        articles[i] = sentences  # Later cells may read the articles as lists of sentences.

        translated = []
        # An empty article yields no chunks, so the tokenizer is never called with an empty list.
        for start in range(0, len(sentences), TRANSLATION_BATCH_SIZE):
            chunk = sentences[start:start + TRANSLATION_BATCH_SIZE]
            # Tokenize sentences; truncation guards against sentences longer than the model's input limit.
            batch = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True)
            output = model.generate(**batch)  # Generate English translations.
            translated.extend(tokenizer.batch_decode(output, skip_special_tokens=True))  # Decode translations.

        with open(f'{translated_path}{files[i]}_en.txt', 'w') as f:
            f.write(' '.join(translated))  # Save English translations.


# German articles.
translate_articles(de_articles, de_files, de_splitter, de_tokenizer, de_model)

# French articles.
translate_articles(fr_articles, fr_files, fr_splitter, fr_tokenizer, fr_model)


## Clean tweets for further processing
Tweets are messy because they contain a lot of non-alphabetical elements such as URLs, hashtags and mentions. Hashtags are especially valuable for analysing the topic of a tweet because they tend to contain keywords. However, an algorithm would not be able to automatically discern multiple keywords in a hashtag such as #supportCO2lawnow. Therefore, we need to mark and remove URLs and mentions, and split hashtags into separate words.

In [None]:
from ekphrasis.classes.segmenter import Segmenter
from preprocessor import tokenize
from re import findall, sub

# Clean text, and find, split and replace hashtags.
seg_tw = Segmenter(corpus="twitter")
def clean(tweet):
    """Return the tweet text without placeholder tokens and with segmented hashtags.

    The tweet is passed through `tokenize`, every resulting word that contains a
    '$' is dropped except the '$HASHTAG$' placeholders, and each placeholder is
    then replaced, in order of appearance, by the segmented text of the
    corresponding hashtag found in the original tweet.
    """
    hashtags = findall(r"#(\w+)", tweet) # Find hashtags.
    words = tokenize(tweet).split() # Split text into words.
    tweet_text = ' '.join(word for word in words if '$' not in word or '$HASHTAG$' in word) # Remove placeholders for URLs and mentions.
    for hashtag in hashtags:
        segmented = seg_tw.segment(hashtag) # Segment hashtag.
        # Re-join 'co' and '2' only. Stripping the whitespace around every 'o'
        # would also glue unrelated words together (e.g. 'no to co 2' -> 'notoco2').
        segmented = sub(r'(?i)\b(c)\s*(o)\s*(2)\b', r'\1\2\3', segmented)
        tweet_text = tweet_text.replace('$HASHTAG$', segmented, 1) # Replace hashtag with segmented hashtag.
    return tweet_text

tweets['processed_text'] = tweets['processed_text'].apply(clean)

## Determine the tweets' and the articles' sentiment
Ideally, the sentiment reflects whether a tweet is in support of or against the CO2 law. Manual inspection has shown that this is indeed mostly the case.

In [None]:
# Infer sentiment from pretrained model.
sentiment = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    tokenizer='distilbert-base-uncased',
)

In [None]:
# Execute for tweets and save results.
sentiment_output = sentiment(tweets.processed_text.to_list())

# Collect the label and the score of every prediction, in tweet order.
labels, scores = [], []
for result in sentiment_output:
    labels.append(result['label'])
    scores.append(result['score'])

tweets['sentiment_label'] = labels
tweets['sentiment_score'] = scores
tweets.to_csv('output/tweets_processed.csv')

In [None]:
# Execute for the German and French articles and save results.
ARTICLE_SAMPLE_SIZE = 10  # Sentences sampled per article to not exceed the model's max length.
translated_path = 'Articles/Zeitungsartikel_uebersetzt/'
en_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
sample_rng = random.Random(42)  # Dedicated, seeded generator so that the sentence samples are reproducible.


def articles_sentiment_table(files):
    """Build a sentiment table with one row per article file.

    For every name in `files`, the English translation saved as
    `<translated_path><file>_en.txt` is read and split into sentences. Up to
    ARTICLE_SAMPLE_SIZE sentences are sampled and passed to the `sentiment` pipeline.
    The sentiment model is English-only, so the translated text is scored and
    not the German or French original.

    Returns a DataFrame with the columns 'file', 'sentences_sample',
    'sentiment_label' and 'sentiment_score'. Articles without any sentence get
    an empty sample, a label of None and a score of NaN.
    """
    rows = []
    for file in files:
        with open(f'{translated_path}{file}_en.txt', 'r', errors='ignore') as f:
            sentences = en_splitter.tokenize(f.read())
        # `sample` raises a ValueError when asked for more items than available, so cap the size.
        picked = sample_rng.sample(sentences, min(ARTICLE_SAMPLE_SIZE, len(sentences)))
        sample = ' '.join(picked)
        if sample:
            # Truncate samples that still exceed the model's maximum input length.
            output = sentiment(sample, truncation=True)  # Infer sentiment.
            label, score = output[0]['label'], output[0]['score']
        else:
            label, score = None, float('nan')
        rows.append({
            'file': file,
            'sentences_sample': sample,
            'sentiment_label': label,
            'sentiment_score': score,
        })
    return pd.DataFrame(rows, columns=['file', 'sentences_sample', 'sentiment_label', 'sentiment_score'])


de_articles_sentiment = articles_sentiment_table(de_files)
fr_articles_sentiment = articles_sentiment_table(fr_files)

# Merge both tables and save results
# `ignore_index=True` gives the merged table a unique index instead of two overlapping ranges.
articles_sentiment = pd.concat([de_articles_sentiment, fr_articles_sentiment], axis=0, ignore_index=True)
os.makedirs('output', exist_ok=True)  # `to_csv` does not create missing directories.
articles_sentiment.to_csv('output/articles_sentiment.csv')

## Determine the tweets' topics and arguments in articles
An unsupervised approach would probably not find the desired fine-grained topics (or arguments) in the tweets. Therefore, we would have to manually label the tweets and subsequently train a supervised, fine-tuned model.
Concerning the arguments in the articles, they could be entirely identified manually or with a finetuned Named Entity Recognition (NER) algorithm preceded by some manual labelling. The latter Machine Learning approach is significantly more complex but highly scalable.
The open-source app [Label Studio](https://labelstud.io/) provides a powerful toolbox for human labelling. It is, however, necessary to develop a tailored interface, which would take a few days to complete. When completed, any person can use the app to label the tweets and articles.