# Analyse tweets and articles on the 2021 Swiss CO2 law
First, we import the dependencies and variables used throughout the notebook. All following cells depend on the first two cells being run. The commented-out `pd.set_option` calls are optional, but recommended for easier reading of the data.

In [None]:
import pandas as pd
import os
import nltk.data
import random
import ast
from textblob import TextBlob
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Fetch NLTK's 'punkt' data package; the article-import cell loads its sentence tokenizer from it.
nltk.download('punkt')

# pd.set_option('display.min_rows', 400)
# pd.set_option('display.max_rows', 400)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', 100)
# pd.set_option('display.width', 1000)
# pd.set_option('display.colheader_justify', 'center')
# pd.set_option('display.precision', 3)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Import old Twitter data and exclude retweets.
# tweets = pd.read_csv('Twitter/23.-30.4/Twitter_Week2.csv')
# tweets = tweets[tweets['isRetweet'] == False]

# Import new Twitter data (as of 3 May 2022).
# Every file in `twitter_path` whose name contains '.txt' is parsed as a Python literal;
# its 'data' entry is turned into one DataFrame per file.
twitter_path = 'Downloaded_tweets/'
twitter_files = [file for file in os.listdir(twitter_path) if '.txt' in file]
# NOTE(review): os.listdir returns entries in arbitrary order, so index 5 is not guaranteed
# to be the same file on every machine or run. Confirm which file is meant and exclude it by name.
del twitter_files[5]

weekly_frames = []
for file in twitter_files:
    with open(twitter_path + file, 'r') as f:
        weekly_tweets = f.read()
    # literal_eval only evaluates literals, so the file content is never executed as code.
    weekly_tweets = pd.DataFrame(ast.literal_eval(weekly_tweets)['data'])
    weekly_frames.append(weekly_tweets)

# Concatenate once instead of growing the frame inside the loop (which copies all previous
# rows on every iteration). Fall back to an empty frame when no file was found.
tweets = pd.concat(weekly_frames, ignore_index=True) if weekly_frames else pd.DataFrame()

# Use a sample of the dataset for tests. Comment the following line out to use the whole dataset.
# tweets = tweets.sample(10, random_state=42)

In [None]:
def load_articles(path, sentence_splitter):
    """Read every regular file in `path` and split its text into sentences.

    Args:
        path: Directory that contains one article per file.
        sentence_splitter: Object with a `tokenize(text)` method that returns a list of sentences.

    Returns:
        A `(files, articles)` tuple: the file names as returned by `os.listdir`, and, in the
        same order, one list of sentences per file.
    """
    files = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]
    articles = []
    for file in files:
        # errors='ignore' silently drops bytes that cannot be decoded as cp1252.
        with open(os.path.join(path, file), 'r', encoding='cp1252', errors='ignore') as f:
            articles.append(sentence_splitter.tokenize(f.read()))
    return files, articles


# Punkt sentence splitters. Abbreviation handling is language-specific, so German and French
# articles get their own model instead of the English one.
splitter = nltk.data.load('tokenizers/punkt/english.pickle')
de_splitter = nltk.data.load('tokenizers/punkt/german.pickle')
fr_splitter = nltk.data.load('tokenizers/punkt/french.pickle')

# Import all German articles and split them into sentences.
de_path = 'Articles/Zeitungsartikel_DE/'
de_files, de_articles = load_articles(de_path, de_splitter)

# Import all French articles.
fr_path = 'Articles/Zeitungsartikel_FR/'
fr_files, fr_articles = load_articles(fr_path, fr_splitter)

# Import translated (English) articles.
# NOTE(review): the translation cell writes its output to 'Articles/Zeitungsartikel_uebersetzt/',
# not to this folder - confirm which directory is the intended one.
trans_path = 'Articles/Zeitungsartikel_translated/'
trans_files, trans_articles = load_articles(trans_path, splitter)


## Translate tweets and articles from German and French to English.

In [None]:
# Load one tokenizer/model pair per source language.
def _load_translator(checkpoint):
    """Return the (tokenizer, seq2seq model) pair for the given pretrained checkpoint name."""
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )


de_model_name = 'Helsinki-NLP/opus-mt-de-en'
de_tokenizer, de_model = _load_translator(de_model_name)

fr_model_name = 'Helsinki-NLP/opus-mt-fr-en'
fr_tokenizer, fr_model = _load_translator(fr_model_name)

In [None]:
# Translate tweets.
# NOTE(review): every tweet is sent through the German->English model; fr_model is not used
# for tweets here - confirm whether French tweets need separate routing.
de_translation = pipeline("translation_de_to_en", model=de_model, tokenizer=de_tokenizer)
processed_text = [ele['translation_text'] for ele in de_translation(tweets.text.to_list())]

# DataFrame.insert raises ValueError when the column already exists, which made this cell
# fail on a second run. Overwrite in that case so the cell can be re-executed.
if 'processed_text' in tweets.columns:
    tweets['processed_text'] = processed_text
else:
    tweets.insert(loc=1, column='processed_text', value=processed_text)

os.makedirs('output', exist_ok=True)  # to_csv does not create missing directories.
tweets.to_csv('output/tweets_processed.csv')
tweets.processed_text.to_list()

['As an energy consumer, Infomaniak supports the new Swiss CO2 Act https://t.co/Su4wT7O2vN',
 '@gerhardpfister @Mitte_Centre With the CO2 law I agree with them and I am very pleased that "The CO2... https://t.co/MccfPUVFCZ',
 'Signed one or the other. #Co2LawYes https://t.co/RHOkU75qug',
 'The organization @greenpeace @greenpeace_ch has been condemned for lying. Now they manipulate children around Grosis d... https://t.co/jzBT2pMHuc',
 '@just7c3 @extreme_no @JazumPMT @NoCo2Law https://t.co/9GVB3ocNa7. https://t.co/IhpWQzqh6B',
 '@Pat61st @rene_anthon @NordmannRoger @DGaleuchet @Co2LawNo @CO2LawYes @Damian_Mueller_ @Martin_Baeumle Was g... https://t.co/KSZ8ufzEPZ',
 'Guest speaker @bastiengirod on the Co2 Act: expertly versed and with many convincing arguments for @JaCO2Law https://t.co/kvLhyV7FWS',
 'Worth seeing: Feusi Fédéral with Prof. Reiner Eichenberger. Exciting insights and good arguments among others about CO2... https://t.co/sqS3ZH4CEe',
 'The Basel FDP says 2x no to the canto

In [None]:
# Translate articles and save English versions.
def translate_and_save(articles, files, tokenizer, model, out_dir='Articles/Zeitungsartikel_uebersetzt/'):
    """Translate each article sentence by sentence and write it to `<out_dir><file>_en.txt`.

    Args:
        articles: One list of sentences per article.
        files: Source file names, in the same order as `articles`.
        tokenizer: Tokenizer that belongs to `model`.
        model: Seq2seq model used to generate the English translation.
        out_dir: Target directory; created if it does not exist yet.
    """
    # NOTE(review): the article-import cell reads translated articles from
    # 'Articles/Zeitungsartikel_translated/' - confirm which directory is the intended one.
    os.makedirs(out_dir, exist_ok=True)
    for file, article in zip(files, articles):
        if article:
            # truncation=True caps over-long sentences at the tokenizer's maximum length
            # instead of passing them to the model unchanged.
            batch = tokenizer(article, return_tensors='pt', padding=True, truncation=True)
            output = model.generate(**batch)  # Generate English translations.
            translation = ' '.join(tokenizer.batch_decode(output, skip_special_tokens=True))
        else:
            # An article without sentences has nothing to tokenize; write an empty file.
            translation = ''
        with open(os.path.join(out_dir, f'{file}_en.txt'), 'w') as f:
            f.write(translation)  # Save English translation.


# German articles.
translate_and_save(de_articles, de_files, de_tokenizer, de_model)

# French articles.
translate_and_save(fr_articles, fr_files, fr_tokenizer, fr_model)


## Clean tweets for further processing
Tweets are messy because they contain a lot of non-alphabetical symbols like URLs, hashtags and mentions. Hashtags are especially valuable for the analysis of the topic of the tweet because they tend to contain keywords. However, an algorithm wouldn't be able to automatically discern multiple keywords in a hashtag such as #supportCO2lawnow. Therefore, we need to mark URLs and mentions, and split hashtags into separate words.

In [None]:
from ekphrasis.classes.segmenter import Segmenter
from preprocessor import tokenize
from re import findall, sub

# Clean text, and find, split and replace hashtags.
seg_tw = Segmenter(corpus="twitter")


def _rejoin_co2(segmented):
    """Glue a 'co 2' / 'c o 2' split inside a segmented hashtag back together as 'co2'.

    Only whitespace inside that token is removed. The previous pattern stripped whitespace
    around every 'o', which also merged unrelated words ('no co 2 law' -> 'noco2 law').
    """
    return sub(r'(?i)\b(c)\s*(o)\s*(2)', r'\1\2\3', segmented)


def clean(tweet):
    """Return the tweet text without placeholder tokens and with hashtags split into words.

    The tweet is passed through `tokenize`; every whitespace-separated token that contains
    '$' is dropped unless it contains '$HASHTAG$'. Each remaining '$HASHTAG$' is then
    replaced, in order of appearance, by the segmented form of the corresponding hashtag
    found in the original tweet.

    Args:
        tweet: Text of a single tweet.

    Returns:
        The cleaned text as a single space-joined string.
    """
    hashtags = findall(r"#(\w+)", tweet)  # Find hashtags.
    words = tokenize(tweet).split()  # Split text into words.
    # Remove placeholders for URLs and mentions, keep the hashtag placeholders.
    tweet_text = ' '.join(word for word in words if '$' not in word or '$HASHTAG$' in word)
    for hashtag in hashtags:
        segmented = _rejoin_co2(seg_tw.segment(hashtag))  # Segment hashtag, keep 'co2' intact.
        tweet_text = tweet_text.replace('$HASHTAG$', segmented, 1)  # Fill the next placeholder.
    return tweet_text


tweets['processed_text'] = tweets['processed_text'].apply(clean)

## Determine the tweets' and the articles' sentiment
Ideally, the sentiment reflects whether a tweet is in support of or against the CO2 law. Manual inspection has shown that this is indeed mostly the case.

In [None]:
# Build the sentiment classifier from a pretrained checkpoint and its tokenizer.
sentiment_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
sentiment_tokenizer_name = 'distilbert-base-uncased'
sentiment = pipeline(
    'sentiment-analysis',
    model=sentiment_checkpoint,
    tokenizer=sentiment_tokenizer_name,
)

Downloading: 100%|██████████| 629/629 [00:00<00:00, 507kB/s]
Downloading: 100%|██████████| 255M/255M [00:03<00:00, 67.2MB/s]
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 13.8kB/s]
Downloading: 100%|██████████| 483/483 [00:00<00:00, 255kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 39.3MB/s]
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 33.3MB/s]


In [None]:
# Classify every entry of processed_text, attach label and score to the frame, and save it.
sentiment_output = sentiment(tweets.processed_text.to_list())
tweet_labels, tweet_scores = [], []
for prediction in sentiment_output:
    tweet_labels.append(prediction['label'])
    tweet_scores.append(prediction['score'])
tweets['sentiment_label'] = tweet_labels
tweets['sentiment_score'] = tweet_scores
tweets.to_csv('output/tweets_processed.csv')
tweets

Unnamed: 0,text,processed_text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,longitude,latitude,sentiment_label,sentiment_score
1044,Als Energieverbraucher unterstützt Infomaniak ...,"As an energy consumer, Infomaniak supports the...",False,1,,2021-04-26T16:14:44Z,False,,1386715377128857610,,"<a href=""https://zapier.com/"" rel=""nofollow"">Z...",campaigning,0,False,False,,,POSITIVE,0.727026
1118,@gerhardpfister @Mitte_Centre Beim CO2 Gesetz ...,@gerhardpfister @Mitte_Centre With the CO2 law...,False,0,gerhardpfister,2021-04-25T17:42:45Z,True,1.386371e+18,1386375139693146117,3291242000.0,"<a href=""http://twitter.com/download/iphone"" r...",roger_reuss,0,False,False,,,POSITIVE,0.999467
327,Habe mir das eine oder andere eingetragen. #Co...,Signed one or the other. #Co2LawYes https://t....,False,6,,2021-04-27T14:58:42Z,False,,1387058630847918083,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",glptbollinger,0,False,False,,,NEGATIVE,0.992678
106,Die Organisation @greenpeace @greenpeace_ch wu...,The organization @greenpeace @greenpeace_ch ha...,False,2,,2021-04-27T11:58:37Z,True,,1387013310508376069,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",morvjn,1,False,False,,,NEGATIVE,0.995984
1056,@just7c3 @extreme_nein @JazumPMT @NeinCo2Geset...,@just7c3 @extreme_no @JazumPMT @NoCo2Law https...,False,0,just7c3,2021-04-26T13:40:23Z,False,1.386676e+18,1386676535206236160,1.289152e+18,"<a href=""https://mobile.twitter.com"" rel=""nofo...",MFrauchigerSVP,0,False,False,,,NEGATIVE,0.996913
428,@Pat61st @rene_anthon @NordmannRoger @DGaleuch...,@Pat61st @rene_anthon @NordmannRoger @DGaleuch...,False,0,Pat61st,2021-04-26T14:15:21Z,True,1.386684e+18,1386685333597827072,9.772495e+17,"<a href=""http://itunes.apple.com/us/app/twitte...",campaigning,0,False,False,,,NEGATIVE,0.99308
845,Gastreferat @bastiengirod zum Co2-Gesetz: fach...,Guest speaker @bastiengirod on the Co2 Act: ex...,False,18,,2021-04-27T17:38:23Z,False,,1387098815199653892,,"<a href=""http://twitter.com/download/iphone"" r...",gruenebern,4,False,False,,,POSITIVE,0.994745
1123,Sehenswert: Feusi Fédéral mit Prof. Reiner Eic...,Worth seeing: Feusi Fédéral with Prof. Reiner ...,False,28,,2021-04-25T16:44:53Z,True,,1386360578587955205,,"<a href=""http://twitter.com/download/android"" ...",JanssenAK,5,False,False,,,POSITIVE,0.997803
697,Die Basler FDP sagt 2x Nein zum kantonalen Min...,The Basel FDP says 2x no to the cantonal minim...,False,16,,2021-04-28T16:00:00Z,True,,1387436443925102600,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",FDP_BS,5,False,False,,,NEGATIVE,0.996366
755,@grunliberale sind auch prinzipienlose Soziali...,@grunliberals are also unprincipled socialists...,False,0,grunliberale,2021-04-28T11:03:00Z,True,,1387361700639297542,22799860.0,"<a href=""http://twitter.com/download/iphone"" r...",MartinCJanssen,0,False,False,,,NEGATIVE,0.997292


In [None]:
def _excerpt(article, rng, n_sentences=10, max_chars=512):
    """Return the text of `article` that is passed to the sentiment model.

    Articles of at most `max_chars` characters are used in full. Longer ones are reduced to
    `n_sentences` randomly drawn sentences (or kept as they are when they contain fewer
    sentences than that) and then cut to `max_chars` characters.
    """
    text = ' '.join(article)
    if len(text) <= max_chars:
        return text
    if len(article) >= n_sentences:
        text = ' '.join(rng.sample(article, n_sentences))
    return text[:max_chars]


def score_articles(files, articles, seed=42):
    """Run the sentiment model on one excerpt per article.

    Args:
        files: File names, in the same order as `articles`.
        articles: One list of sentences per article.
        seed: Seed for the sentence sampling, so repeated runs draw the same excerpts.

    Returns:
        DataFrame with the columns 'file', 'sentences_sample', 'sentiment_label'
        and 'sentiment_score'.
    """
    rng = random.Random(seed)
    rows = []
    for file, article in zip(files, articles):
        excerpt = _excerpt(article, rng)
        result = sentiment(excerpt)[0]  # Infer sentiment.
        rows.append({
            'file': file,
            'sentences_sample': excerpt,
            'sentiment_label': result['label'],
            'sentiment_score': result['score'],
        })
    return pd.DataFrame(rows, columns=['file', 'sentences_sample', 'sentiment_label', 'sentiment_score'])


# Execute for German articles.
# NOTE(review): de_articles / fr_articles hold the untranslated text, while the sentiment
# checkpoint name indicates an English model - confirm this is intended.
de_articles_sentiments = score_articles(de_files, de_articles)

# Execute for French articles.
fr_articles_sentiments = score_articles(fr_files, fr_articles)

# Merge both tables and save results. ignore_index avoids duplicate row labels in the merged frame.
articles_sentiments = pd.concat([de_articles_sentiments, fr_articles_sentiments], axis=0, ignore_index=True)
os.makedirs('output', exist_ok=True)  # to_csv does not create missing directories.
articles_sentiments.to_csv('output/articles_sentiments.csv')

In [None]:
# Execute for translated, English articles and display the results.
# NOTE(review): unlike the German/French table, this frame is not written to disk - add a
# to_csv call here if the results should be kept.
trans_rng = random.Random(42)  # Fixed seed so the sampled sentences are the same on every run.
trans_rows = []
for file, article in zip(trans_files, trans_articles):
    inputs = ' '.join(article)
    if len(inputs) > 512:
        # Long article: draw 10 random sentences when there are enough of them, then cut
        # the text to 512 characters either way.
        if len(article) >= 10:
            inputs = ' '.join(trans_rng.sample(article, 10))
        inputs = inputs[:512]
    outputs = sentiment(inputs)
    trans_rows.append({
        'file': file,
        'sentences_sample': inputs,
        'sentiment_label': outputs[0]['label'],
        'sentiment_score': outputs[0]['score'],
    })

trans_articles_sentiments = pd.DataFrame(
    trans_rows, columns=['file', 'sentences_sample', 'sentiment_label', 'sentiment_score']
)
trans_articles_sentiments

Unnamed: 0,file,sentences_sample,sentiment_label,sentiment_score
0,LT_01.06.2021.txt_en.txt,"Le Temps, 01.06.2021 Andr Schneider: Carbon ne...",NEGATIVE,0.849739
1,NZZ_03.05.2021_Art_1.txt_en.txt,"NZZ, 03.05.2021 Can I use the new CO2 law to r...",POSITIVE,0.804419
2,LT_03.05.2021_Art_2.txt_en.txt,"Time, 03.05.2021 On the revision of the CO2 la...",POSITIVE,0.638304
3,NZZ_01.06.2021_Art_2.txt_en.txt,"NZZ, 01.06.2021 Chinese tourists pay for Swiss...",NEGATIVE,0.993869
4,NZZ_02.06.2021_Art_2.txt_en.txt,"NZZ, 02.06.2021 Majorities in the agricultural...",NEGATIVE,0.994052
5,LT_01.05.2021.txt_en.txt,"Le Temps, 01.05.2021 In Switzerland, aviation ...",NEGATIVE,0.991487
6,LT_02.06.2021.txt_en.txt,"Le Temps, 01.06.2021 Cological projects are lo...",NEGATIVE,0.999756
7,LT_03.05.2021_Art_1.txt_en.txt,"Time, 03.05.2021 CO2, the weight of the wallet...",POSITIVE,0.984113
8,NZZ_01.06.2021_Art_1.txt_en.txt,"NZZ, 01.06.2021 Financial sector enters the cl...",NEGATIVE,0.987811
9,NZZ_02.06.2021_Art_1.txt_en.txt,"NZZ, 02.06.2021 CO2 law: Well meant is not wel...",NEGATIVE,0.997701


## Determine the tweets' topics and arguments in articles
An unsupervised approach probably would not find the desired fine-grained topics (or arguments) in the tweets. Therefore, we would have to manually label the tweets and subsequently train a supervised, fine-tuned model.
Concerning the arguments in the articles, they could be identified entirely manually or with a fine-tuned Named Entity Recognition (NER) algorithm preceded by some manual labelling. The latter Machine Learning approach is significantly more complex but highly scalable.
The open-source app [Label Studio](https://labelstud.io/) provides a powerful toolbox for human labelling. It is, however, necessary to develop a tailored interface, which would take a few days to complete. When completed, any person can use the app to label the tweets and articles.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=afd85eb4-e181-4004-a2b4-65d914f16510' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>