# Analyse tweets and articles on the 2021 Swiss CO2 law
First, we import dependencies and variables used throughout the notebook. All following cells depend on the first two cells being run. The pandas `set_option` calls are commented out by default; enabling them is optional, but recommended for easier reading of the data.

In [22]:
import pandas as pd
import os
import nltk.data
import random
import json
import re
from textblob import TextBlob
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Fetch the Punkt sentence-tokenizer data; the article-import cell loads a Punkt model from it via nltk.data.load.
nltk.download('punkt')

# Optional pandas display settings. Uncomment to show more rows/columns and wider cells when inspecting frames.
# pd.set_option('display.min_rows', 400)
# pd.set_option('display.max_rows', 400)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', 100)
# pd.set_option('display.width', 1000)
# pd.set_option('display.colheader_justify', 'center')
# pd.set_option('display.precision', 3)

[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Import old Twitter data and exclude retweets.
# tweets = pd.read_csv('Twitter/23.-30.4/Twitter_Week2.csv')
# tweets = tweets[tweets['isRetweet'] == False]

# Import new Twitter data (as of 3 May 2022)
twitter_path = 'Downloaded_tweets/'
# Sort the listing: os.listdir returns entries in arbitrary order, which would make
# the seeded sample below pick different tweets on different machines.
twitter_files = sorted(file for file in os.listdir(twitter_path) if file.endswith('.txt'))

# Parse every weekly dump first and concatenate once, instead of re-copying a growing frame per file.
weekly_frames = []
for file in twitter_files:
    # JSON is read as UTF-8 explicitly so the result does not depend on the platform's locale encoding.
    with open(os.path.join(twitter_path, file), 'r', encoding='utf-8') as f:
        weekly_frames.append(pd.DataFrame(json.load(f)['data']))
tweets = pd.concat(weekly_frames, ignore_index=True) if weekly_frames else pd.DataFrame()

# Drop the 'withheld' column when present; do not fail on dumps that lack it.
tweets = tweets.drop(columns=['withheld'], errors='ignore')

# Use a sample of the dataset for tests. Comment the following line out to use the whole dataset.
tweets = tweets.sample(min(10, len(tweets)), random_state=42)
tweets

In [7]:
# English Punkt model, kept under its original name for any later cell that relies on it.
splitter = nltk.data.load('tokenizers/punkt/english.pickle')
random.seed(42)

ARTICLE_COLUMNS = ['medium', 'date', 'day_index', 'author', 'text', 'language', 'characters_in_text']
# Sentence boundaries are language dependent (abbreviations etc.), so each article
# language gets its own Punkt model instead of the English one.
PUNKT_MODELS = {'de': 'tokenizers/punkt/german.pickle', 'fr': 'tokenizers/punkt/french.pickle'}


def load_articles(path, files, language):
    """Parse newspaper article files into a DataFrame with one row per article.

    Each file is expected to start with a header line of the form
    '<medium>, <date>'; the remaining lines are the article itself.

    Args:
        path: Directory containing the article files.
        files: File names (relative to ``path``) to load.
        language: Language code stored in the 'language' column and used to
            pick the Punkt sentence model; must be a key of PUNKT_MODELS.

    Returns:
        DataFrame with the columns in ARTICLE_COLUMNS. 'text' holds the list of
        sentences, 'characters_in_text' the length of the re-joined article,
        and 'date' is converted to datetime.

    Raises:
        ValueError: If a file's first line does not contain a comma-separated date.
    """
    sentence_splitter = nltk.data.load(PUNKT_MODELS[language])
    records = []
    for file in files:
        with open(os.path.join(path, file), 'r', encoding='cp1252', errors='ignore') as f:
            lines = f.read().split('\n')

        header = lines.pop(0).split(',')  # First line contains the medium and the date.
        if len(header) < 2:
            raise ValueError(f"{file}: first line does not contain a '<medium>, <date>' header")

        # Files named '..._Art_<n>.<ext>' carry the index of the article within the day;
        # without 'Art' in the name there is just one article for the day.
        day_index = int(file.split('.')[-2].split('_')[-1]) if 'Art' in file else 1

        # Heuristic: the first line made of exactly two words is taken as the author's name.
        author = next((line for line in lines if len(line.split()) == 2), 'unknown')

        body = ' '.join(lines)  # Re-joined article without the header line.
        records.append({
            'medium': file.split('_')[0],
            'date': header[1].replace(' ', ''),
            'day_index': day_index,
            'author': author,
            'text': sentence_splitter.tokenize(body),
            'language': language,
            'characters_in_text': len(body),
        })

    # Build the frame once from all records instead of appending row by row.
    articles = pd.DataFrame(records, columns=ARTICLE_COLUMNS)
    # NOTE(review): pandas prefers month-first when a date is ambiguous; if the files use
    # day-first dates (e.g. 03.05.2021), pass dayfirst=True here. Confirm against the data.
    articles['date'] = pd.to_datetime(articles['date'])
    return articles


# Import all German articles.
de_path = 'Articles/Zeitungsartikel_DE/'
# Sorted so that the seeded sample below is reproducible regardless of os.listdir order.
de_files = sorted(file for file in os.listdir(de_path) if os.path.isfile(os.path.join(de_path, file)))
de_files = random.sample(de_files, min(3, len(de_files))) # Use a sample of the dataset for tests. Comment this line out to use the whole dataset.
de_articles = load_articles(de_path, de_files, 'de')

# Import all French articles.
fr_path = 'Articles/Zeitungsartikel_FR/'
fr_files = sorted(file for file in os.listdir(fr_path) if os.path.isfile(os.path.join(fr_path, file)))
fr_files = random.sample(fr_files, min(3, len(fr_files))) # Use a sample of the dataset for tests. Comment this line out to use the whole dataset.
fr_articles = load_articles(fr_path, fr_files, 'fr')

## Translate tweets and articles from German and French to English.

In [4]:
# Checkpoint names of the two translation models (German -> English, French -> English).
de_model_name, fr_model_name = 'Helsinki-NLP/opus-mt-de-en', 'Helsinki-NLP/opus-mt-fr-en'

# For each direction, load the tokenizer first and the seq2seq model second.
de_tokenizer, de_model = (loader.from_pretrained(de_model_name) for loader in (AutoTokenizer, AutoModelForSeq2SeqLM))
fr_tokenizer, fr_model = (loader.from_pretrained(fr_model_name) for loader in (AutoTokenizer, AutoModelForSeq2SeqLM))

In [None]:
# Translate tweets.
# NOTE(review): every tweet is sent through the German->English model regardless of its
# language; confirm that the dump only contains German tweets.
de_translation = pipeline("translation_de_to_en", model=de_model, tokenizer=de_tokenizer)
processed_text = [ele['translation_text'] for ele in de_translation(tweets.text.to_list())]

# DataFrame.insert raises if the column already exists, which made this cell fail when re-run.
if 'processed_text' in tweets.columns:
    tweets['processed_text'] = processed_text
else:
    tweets.insert(loc=min(2, len(tweets.columns)), column='processed_text', value=processed_text)

# Make sure the output directory exists before writing.
os.makedirs('output', exist_ok=True)
tweets.to_csv('output/tweets_processed.csv')
tweets

In [8]:
# Translate articles and save English versions.

def translate_articles(articles, tokenizer, model):
    """Translate every article to English, caching each result on disk.

    For each row, the English text is read from
    'Articles/Zeitungsartikel_EN/<medium>_<YYYY-MM-DD>_<day_index>_en.txt' if that
    file exists; otherwise the sentences in the 'text' column are translated,
    joined with spaces, written to that file and returned.

    Args:
        articles: DataFrame with 'medium', 'date' (datetime), 'day_index' and
            'text' (list of sentences) columns.
        tokenizer: Tokenizer matching ``model``.
        model: Seq2seq translation model exposing ``generate``.

    Returns:
        List with one English text per article, in row order.
    """
    os.makedirs('Articles/Zeitungsartikel_EN/', exist_ok=True)
    translations = []
    rows = zip(articles['medium'], articles['date'], articles['day_index'], articles['text'])
    for medium, date, day_index, sentences in rows:
        path = f"Articles/Zeitungsartikel_EN/{medium}_{date.strftime('%Y-%m-%d')}_{day_index}_en.txt"
        if os.path.exists(path):
            # Cached translation from an earlier run.
            with open(path, 'r', encoding='utf-8') as f:
                translations.append(f.read())
            continue

        if sentences:
            # truncation=True keeps over-long sentences within the model's maximum input length.
            batch = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True) # Tokenize sentences.
            output = model.generate(**batch, num_beams=2) # Generate English translations.
            translation = ' '.join(tokenizer.batch_decode(output, skip_special_tokens=True)) # Decode translations.
        else:
            translation = ''  # Nothing to translate; the tokenizer cannot handle an empty batch.

        # Explicit UTF-8 so cache files are portable across platforms.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(translation)
        translations.append(translation)
    return translations


def store_processed_text(articles, texts):
    """Put ``texts`` into the 'processed_text' column (position 5), overwriting it on re-runs."""
    if 'processed_text' in articles.columns:
        articles['processed_text'] = texts
    else:
        articles.insert(loc=5, column='processed_text', value=texts)


# German articles.
processed_texts = translate_articles(de_articles, de_tokenizer, de_model)
store_processed_text(de_articles, processed_texts)

# French articles.
processed_texts = translate_articles(fr_articles, fr_tokenizer, fr_model)
store_processed_text(fr_articles, processed_texts)

## Clean tweets for further processing
Tweets are messy because they contain a lot of non-alphabetical symbols like URLs, hashtags and mentions. Hashtags are especially valuable for the analysis of the topic of the tweet because they tend to contain keywords. However, an algorithm wouldn't be able to automatically discern multiple keywords in a hashtag such as #supportCO2lawnow. Therefore, we need to exclude URLs and mentions, and separate hashtags.

In [44]:
from ekphrasis.classes.segmenter import Segmenter
from preprocessor import tokenize
from re import findall, sub

# Clean text, and find, split and replace hashtags.
seg_tw = Segmenter(corpus="twitter")
def clean(tweet):
    """Strip URL/mention placeholders from a tweet and expand its hashtags.

    The tweet is tokenized with ``preprocessor.tokenize``; every word containing
    '$' is dropped except the '$HASHTAG$' placeholder. Each hashtag found in the
    original tweet is segmented into words, passed through ``de_translation``
    and substituted, in order, for one '$HASHTAG$' placeholder.

    Args:
        tweet: Tweet text.

    Returns:
        The cleaned tweet text with hashtags replaced by their segmented,
        translated words.
    """
    hashtags = findall(r"#(\w+)", tweet) # Find hashtags.
    words = tokenize(tweet).split() # Split text into words.
    tweet_text = ' '.join(word for word in words if '$' not in word or '$HASHTAG$' in word) # Remove placeholders for URLs and mentions.
    if hashtags:
        hashtags = [seg_tw.segment(hashtag) for hashtag in hashtags] # Segment hashtags.
        # Re-join a 'co2' that segmentation split apart (e.g. 'co 2' or 'c o 2').
        # Only this token is matched: stripping spaces around every 'o' would also
        # glue unrelated words together (e.g. 'go green' -> 'gogreen').
        hashtags = [sub(r'(?i)\b(c)\s*(o)\s*(2)', r'\1\2\3', hashtag) for hashtag in hashtags]
        hashtags = [ele['translation_text'] for ele in de_translation(hashtags)] # Translate hashtags.
        for hashtag in hashtags:
            tweet_text = tweet_text.replace('$HASHTAG$', hashtag, 1) # Replace the next placeholder with the segmented hashtag.
    return tweet_text

# Run the cleaning step on every translated tweet and store the result back in place.
tweets['processed_text'] = tweets['processed_text'].map(clean)

Reading twitter - 1grams ...
Reading twitter - 2grams ...


## Determine the tweets' and the articles' sentiment
Ideally, the sentiment reflects whether a tweet is in support of or against the CO2 law. Manual inspection has shown that this is indeed mostly the case.

In [12]:
# Build the sentiment classifier from a pretrained checkpoint and its tokenizer.
sentiment_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
sentiment_tokenizer = 'distilbert-base-uncased'
sentiment = pipeline('sentiment-analysis', model=sentiment_checkpoint, tokenizer=sentiment_tokenizer)

In [None]:
# Classify every processed tweet, then store label and score next to it and write the CSV.
sentiment_output = sentiment(tweets['processed_text'].tolist())

labels, scores = [], []
for prediction in sentiment_output:
    labels.append(prediction['label'])
    scores.append(prediction['score'])

tweets['sentiment_label'] = labels
tweets['sentiment_score'] = scores
tweets.to_csv('output/tweets_processed.csv')

In [24]:
# Combine German and French articles.
# ignore_index gives every article a unique row label; without it both halves keep
# their own 0..n-1 labels and label-based lookups would hit two rows.
articles = pd.concat([de_articles, fr_articles], ignore_index=True)


def score_article(text):
    """Return the sentiment label and mean signed sentence score of an article.

    The text is split on '.', blank fragments are discarded, and every remaining
    fragment is classified. POSITIVE scores count positively, all others
    negatively, and the signed scores are averaged so that the result does not
    grow with the article's length.

    Args:
        text: English article text.

    Returns:
        Tuple ``(label, mean_score)``; the label is 'POSITIVE' when the mean is
        above zero and 'NEGATIVE' otherwise (including a text with no sentences,
        whose score is 0.0).
    """
    sentences = [part.strip() for part in text.split('.') if part.strip()] # Split article text into sentences.
    if not sentences:
        return 'NEGATIVE', 0.0
    sentiment_per_sentence = [sentiment(sentence)[0] for sentence in sentences] # Get sentiments for each sentence.
    signed_scores = [ele['score'] if ele['label'] == 'POSITIVE' else -ele['score'] for ele in sentiment_per_sentence] # Get signed sentiment scores.
    mean_score = sum(signed_scores) / len(signed_scores) # Get mean sentiment score.
    return ('POSITIVE' if mean_score > 0 else 'NEGATIVE'), mean_score


# Execute sentiment analysis for articles.
sentiment_label = []
sentiment_score = []
for text in articles.processed_text.to_list():
    label, mean_score = score_article(text)
    sentiment_label.append(label)
    sentiment_score.append(mean_score)

articles['sentiment_label'] = sentiment_label
articles['sentiment_score'] = sentiment_score

# Save the dataset; make sure the output directory exists first.
os.makedirs('output', exist_ok=True)
articles.to_csv('output/articles_processed.csv')

## Determine the tweets' topics and arguments in articles
An unsupervised approach probably would not find the desired fine-grained topics (or arguments) in the tweets. Therefore, we would have to manually label the tweets and subsequently train a supervised, fine-tuned model.
Concerning the arguments in the articles, they could either be identified entirely manually or with a fine-tuned Named Entity Recognition (NER) algorithm preceded by some manual labelling. The latter machine-learning approach is significantly more complex but highly scalable.
The open-source app [Label Studio](https://labelstud.io/) provides a powerful toolbox for human labelling. It is, however, necessary to develop a tailored interface, which would take a few days to complete. When completed, any person can use the app to label the tweets and articles.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=afd85eb4-e181-4004-a2b4-65d914f16510' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>