# Analyse tweets and articles on the 2021 Swiss CO2 law
First, we import the dependencies and define the variables used throughout the notebook. All following cells depend on the first two cells having been run. The pandas `set_option` calls are optional (they are commented out by default), but recommended for easier reading of the data.

In [None]:
import pandas as pd
import os
import nltk.data
import random
import json
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Download the Punkt sentence-tokenizer data; the article-loading cell reads it via nltk.data.load('tokenizers/punkt/...').
nltk.download('punkt')

# pd.set_option('display.min_rows', 400)
# pd.set_option('display.max_rows', 400)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', 100)
# pd.set_option('display.width', 1000)
# pd.set_option('display.colheader_justify', 'center')
# pd.set_option('display.precision', 3)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Import old Twitter data and exclude retweets.
# tweets = pd.read_csv('Twitter/23.-30.4/Twitter_Week2.csv')
# tweets = tweets[tweets['isRetweet'] == False]

# Import new Twitter data (as of 3 May 2022).
# Every '.txt' file in `twitter_path` is parsed as one JSON document whose
# 'data' key holds the list of tweets.
twitter_path = 'Tweets/'
# Sort the listing: os.listdir() returns names in arbitrary order, which would
# make the row order (and the seeded sample below) differ between runs.
twitter_files = sorted(file for file in os.listdir(twitter_path) if file.endswith('.txt'))

weekly_frames = []
for file in twitter_files:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(os.path.join(twitter_path, file), 'r', encoding='utf-8') as f:
        weekly_frames.append(pd.DataFrame(json.load(f)['data']))

# Concatenate once instead of re-copying the growing frame on every iteration.
tweets = pd.concat(weekly_frames, ignore_index=True) if weekly_frames else pd.DataFrame()

# The 'withheld' column is not needed; tolerate exports that do not contain it.
tweets = tweets.drop(columns=['withheld'], errors='ignore')

# Uncomment the following line to use a sample of the dataset for tests.
# tweets = tweets.sample(3, random_state=42)

In [None]:
ARTICLE_COLUMNS = ['medium', 'date', 'day_index', 'text', 'language', 'characters_in_text']

# Punkt ships one sentence model per language. Splitting German or French text
# with the English model mis-handles language-specific abbreviations.
splitters = {
    'de': nltk.data.load('tokenizers/punkt/german.pickle'),
    'fr': nltk.data.load('tokenizers/punkt/french.pickle'),
}


def create_article_row(file, lang_path, language='de'):
    """Read one article file and return its metadata and sentence-split text.

    Args:
        file: File name of the article. The part before the first '_' is used
            as the medium (e.g. 'NZZ'). If the name contains 'Art', the last
            '_'-separated token before the extension is parsed as the index of
            the article within its day.
        lang_path: Directory that contains `file`.
        language: Key into `splitters` ('de' or 'fr'); also stored in the row.

    Returns:
        list: [medium, date string, day_index, list of sentences, language,
        number of characters of the article body], matching ARTICLE_COLUMNS.

    Raises:
        ValueError: If the first line of the file has no second comma-separated
            field to take the date from.
        KeyError: If `language` has no entry in `splitters`.
    """
    medium = file.split('_')[0]
    with open(os.path.join(lang_path, file), 'r', encoding='cp1252', errors='ignore') as f:
        lines = f.read().split('\n')

    # The first line carries the date as its second comma-separated field.
    header = lines.pop(0).split(',')
    if len(header) < 2:
        raise ValueError(f"No date found in the first line of {file!r}")
    date = header[1].replace(' ', '')

    if 'Art' in file:
        day_index = int(file.split('.')[-2].split('_')[-1])
    else:
        day_index = 1  # No 'Art' in the name: only one article for that day.

    body = ' '.join(lines)  # Everything after the header line is article text.
    sentences = splitters[language].tokenize(body)
    return [medium, date, day_index, sentences, language, len(body)]


def list_article_files(path):
    """Return the regular files in `path`, sorted by name for a stable row order."""
    return sorted(file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file)))


def build_article_frame(files, path, language):
    """Build a DataFrame with ARTICLE_COLUMNS from `files` and parse its date column."""
    # Collect all rows first and build the frame once instead of appending row by row.
    rows = [create_article_row(file, path, language) for file in files]
    frame = pd.DataFrame(rows, columns=ARTICLE_COLUMNS)
    frame['date'] = pd.to_datetime(frame['date'], format='%d.%m.%Y')
    return frame


# Import German articles.
de_path = 'Articles/Zeitungsartikel_DE/'
de_files = list_article_files(de_path)
# de_files = random.sample(de_files, 5) # Use a sample of the dataset for tests. Comment this line out to use the whole dataset.
de_articles = build_article_frame(de_files, de_path, 'de')

# Import French articles.
fr_path = 'Articles/Zeitungsartikel_FR/'
fr_files = list_article_files(fr_path)
# fr_files = random.sample(fr_files, 5) # Use a sample of the dataset for tests. Comment this line out to use the whole dataset.
fr_articles = build_article_frame(fr_files, fr_path, 'fr')

## Translate tweets and articles from German and French to English.

In [None]:
# Fetch tokenizer and seq2seq model for each translation direction (de->en, fr->en).
# Within a pair the tokenizer is loaded first, then the model.
de_model_name = 'Helsinki-NLP/opus-mt-de-en'
de_tokenizer, de_model = (
    loader.from_pretrained(de_model_name)
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

fr_model_name = 'Helsinki-NLP/opus-mt-fr-en'
fr_tokenizer, fr_model = (
    loader.from_pretrained(fr_model_name)
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 42.0/42.0 [00:00<00:00, 59.5kB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.11k/1.11k [00:00<00:00, 1.61MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 778k/778k [00:00<00:00, 46.2MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 750k/750k [00:00<00:00, 73.6MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.21M/1.21M [00:00<00:00, 63.7MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 284M/284M [00:04<00:00, 71.8MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 42.0/42.0 [00:00<00:00, 42.7kB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.26k/1.26k [00:00<00:00, 1.63MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 784k/784k [00:00<00:00, 69.3MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 760k/760k [00:00<00:00, 69.2MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.28M/1.28M [00:00<00:00, 77.0MB/s]
Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 287M/287M [00:03<0

In [None]:
# Translate tweets.
# NOTE(review): every tweet is sent through the German->English model,
# whatever its actual language - confirm that the export is German-only.
de_translation = pipeline("translation_de_to_en", model=de_model, tokenizer=de_tokenizer)

# Hand the pipeline the whole list in one call; `truncation=True` keeps
# over-long inputs within the model's maximum length.
tweet_texts = tweets.text.to_list()
translations = de_translation(tweet_texts, batch_size=8, truncation=True) if tweet_texts else []
processed_text = [ele['translation_text'] for ele in translations]

# Make the cell re-runnable: insert() raises if the column already exists.
if 'processed_text' in tweets.columns:
    tweets = tweets.drop(columns=['processed_text'])
tweets.insert(loc=2, column='processed_text', value=processed_text)

os.makedirs('output', exist_ok=True)
# index=False: otherwise every save/read cycle adds another 'Unnamed: 0' column.
tweets.to_csv('output/tweets_processed.csv', index=False)
tweets.head()

KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
# Translate articles and save English versions.

def _translate_articles(articles, tokenizer, model):
    """Yield the English text of every article in `articles`, in row order.

    A translation is read from 'Articles/Zeitungsartikel_EN/' when a cached
    file for the article exists; otherwise each sentence in the row's `text`
    list is translated with `tokenizer`/`model`, the result is joined with
    spaces, written to the cache file and yielded.

    Args:
        articles: DataFrame with the columns 'medium', 'date' (datetime),
            'day_index' and 'text' (list of sentences).
        tokenizer: Tokenizer matching `model`.
        model: Seq2seq translation model.
    """
    os.makedirs('Articles/Zeitungsartikel_EN', exist_ok=True)
    for row in articles.itertuples(index=False):
        path = f"Articles/Zeitungsartikel_EN/{row.medium}_{row.date.strftime('%Y-%m-%d')}_{row.day_index}_en.txt"
        if os.path.exists(path):
            # NOTE(review): assumes existing cache files are UTF-8 - confirm if
            # they were produced on a platform with a different default encoding.
            with open(path, 'r', encoding='utf-8') as f:
                yield f.read()
            continue

        # Collect translations in a new list: writing back into `row.text` would
        # overwrite the original-language sentences stored in the DataFrame.
        translated = []
        for sentence in row.text:
            batch = tokenizer([sentence], return_tensors='pt', padding=True, truncation=True)  # Tokenize sentence.
            output = model.generate(**batch, num_beams=2)  # Generate English translation.
            translated.append(tokenizer.batch_decode(output, skip_special_tokens=True)[0])  # Decode translation.
        english = ' '.join(translated)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(english)
        yield english


# German articles.
def de_translate():
    """Yield the English translation of each row of `de_articles`."""
    yield from _translate_articles(de_articles, de_tokenizer, de_model)

de_translations = list(de_translate())
# Make the cell re-runnable: insert() raises if the column already exists.
if 'processed_text' in de_articles.columns:
    de_articles = de_articles.drop(columns=['processed_text'])
de_articles.insert(4, 'processed_text', de_translations)

# French articles.
def fr_translate():
    """Yield the English translation of each row of `fr_articles`."""
    yield from _translate_articles(fr_articles, fr_tokenizer, fr_model)

fr_translations = list(fr_translate())
if 'processed_text' in fr_articles.columns:
    fr_articles = fr_articles.drop(columns=['processed_text'])
fr_articles.insert(4, 'processed_text', fr_translations)

## Clean tweets for further processing
Tweets are messy because they contain many non-alphabetical elements such as URLs, hashtags and mentions. Hashtags are especially valuable for analysing the topic of a tweet because they tend to contain keywords. However, an algorithm cannot automatically discern the individual keywords in a hashtag such as #supportCO2lawnow. Therefore, we remove URLs and mentions, and split hashtags into their constituent words.

In [None]:
from ekphrasis.classes.segmenter import Segmenter
from preprocessor import tokenize
from re import findall, sub

# Clean text, and find, split and replace hashtags.
seg_tw = Segmenter(corpus="twitter")
def clean(tweet):
    """Strip placeholder tokens from a tweet and expand its hashtags into words.

    The tweet is passed through `tokenize`; every word containing '$' is then
    dropped unless it contains '$HASHTAG$'. Each hashtag found in the raw tweet
    is segmented into words, has a split-up 'co 2' re-joined, is translated
    with `de_translation`, and replaces one '$HASHTAG$' placeholder, in order
    of appearance.

    Args:
        tweet: Tweet text.

    Returns:
        str: The cleaned tweet text.
    """
    hashtags = findall(r"#(\w+)", tweet) # Find hashtags.
    words = tokenize(tweet).split() # Split text into words.
    tweet_text = ' '.join(word for word in words if '$' not in word or '$HASHTAG$' in word) # Remove placeholders for URLs and mentions.
    if hashtags:
        hashtags = [seg_tw.segment(hashtag) for hashtag in hashtags] # Segment hashtags.
        # Re-join only a split-up 'co 2' / 'c o 2'. Stripping the whitespace
        # around every 'o' would also glue unrelated words together
        # ('vote on' -> 'voteon'). The raw string avoids the invalid '\s' escape.
        hashtags = [
            sub(r'(?i)\bc\s*o\s*2(?!\d)', lambda match: ''.join(match.group().split()), hashtag)
            for hashtag in hashtags
        ]
        hashtags = [ele['translation_text'] for ele in de_translation(hashtags)] # Translate hashtags.
        for hashtag in hashtags:
            tweet_text = tweet_text.replace('$HASHTAG$', hashtag, 1) # Replace the next placeholder with the segmented hashtag.
    return tweet_text

tweets['processed_text'] = tweets['processed_text'].apply(clean)

## Determine the tweets' and the articles' sentiment
Ideally, the sentiment reflects whether a tweet is in support of or against the CO2 law. Manual inspection has shown that this is indeed mostly the case.

In [None]:
# Build the sentiment classifier from a pretrained SST-2 DistilBERT checkpoint.
sentiment = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    tokenizer='distilbert-base-uncased',
    device=-1,
    batch_size=1,
)

In [None]:
# Execute for tweets and save results.
# Missing texts become '' so the classifier always receives strings.
tweet_texts = tweets.processed_text.fillna('').astype(str).to_list()
# truncation=True cuts at the model's token limit; slicing characters ([:512])
# discards text that would still have fitted.
sentiment_output = sentiment(tweet_texts, truncation=True) if tweet_texts else []
tweets['sentiment_label'] = [ele['label'] for ele in sentiment_output]
tweets['sentiment_score'] = [ele['score'] for ele in sentiment_output]

os.makedirs('output', exist_ok=True)
# index=False: otherwise every save/read cycle adds another 'Unnamed: 0' column.
tweets.to_csv('output/tweets_processed.csv', index=False)
tweets.head()

In [None]:
# Combine German and French articles.
# ignore_index=True gives every article a unique row label; without it the
# French rows reuse the labels 0..n-1 of the German rows.
articles = pd.concat([de_articles, fr_articles], ignore_index=True)

# The translated text is English, so split it with the English Punkt model.
# A plain split('.') also breaks inside numbers and abbreviations and leaves
# empty fragments that would be scored like real sentences.
en_splitter = nltk.data.load('tokenizers/punkt/english.pickle')

# Execute sentiment analysis for articles.
sentiment_label = []
sentiment_score = []
for text in articles.processed_text.to_list():
    sentences = [sentence for sentence in en_splitter.tokenize(text) if sentence.strip()]
    if sentences:
        sentiment_per_sentence = sentiment(sentences, truncation=True) # One result per sentence.
        # Signed score: positive sentences count as +score, negative ones as -score.
        signed_scores = [ele['score'] if ele['label'] == 'POSITIVE' else -ele['score'] for ele in sentiment_per_sentence]
        mean_score = sum(signed_scores) / len(signed_scores)
    else:
        mean_score = 0.0 # Empty article: nothing to score, avoid dividing by zero.
    sentiment_label.append('POSITIVE' if mean_score > 0 else 'NEGATIVE')
    sentiment_score.append(mean_score)

articles['sentiment_label'] = sentiment_label
articles['sentiment_score'] = sentiment_score

os.makedirs('output', exist_ok=True)
# index=False: otherwise every save/read cycle adds another 'Unnamed: 0' column.
articles.to_csv('output/articles_processed.csv', index=False)

## Find the author name(s) of the articles
The author's or the authors' name(s) are hidden in the articles' texts. To find them we can use a Named Entity Recognition approach. It identifies names of people in text.

In [None]:
# del sentiment # Delete the model from the previous step to save memory.
articles = pd.read_csv('output/articles_processed.csv')
# Drop index columns left behind by earlier saves that wrote the index.
articles = articles.drop(columns=[col for col in articles.columns if str(col).startswith('Unnamed')])

# aggregation_strategy="simple" makes the pipeline merge word pieces into whole
# entities, so names no longer have to be glued together and re-split by hand.
ner_pipe = pipeline("ner", aggregation_strategy="simple")

authors = []
for text in articles['processed_text']:
    # Search for names in the first 256 characters; empty or missing text has none.
    snippet = text[:256] if isinstance(text, str) else ''
    entities = ner_pipe(snippet) if snippet.strip() else []
    names = [ele['word'] for ele in entities if ele['entity_group'] == 'PER']
    authors.append(' '.join(names) if names else 'unknown')
articles['author'] = authors

# index=False: otherwise every save/read cycle adds another 'Unnamed: 0' column.
articles.to_csv('output/articles_processed.csv', index=False)

## Review the results in the 'output' folder.

In [None]:
tweets = pd.read_csv('output/tweets_processed.csv')
# Hide index columns ('Unnamed: 0', 'Unnamed: 0.1', ...) left behind by saves that wrote the index.
tweets = tweets.drop(columns=[col for col in tweets.columns if str(col).startswith('Unnamed')])
tweets

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,text,processed_text,sentiment_label,sentiment_score
0,0,0,1387883734439956489,@rene_anthon @Pat61st @NordmannRoger @DGaleuch...,You have one at the gossip to call the Co2 law...,NEGATIVE,0.998060
1,1,1,1387880856962969601,RT @KlimaAllianzCH: ¬´F√ºr die Einhaltung des Pa...,"""For compliance with the Paris Agreement, it i...",POSITIVE,0.998982
2,2,2,1387876662574977031,@SandroBrotz @luzian_franzini @GrueneCH Wie de...,"Like climate change, the Corona crisis is a gl...",NEGATIVE,0.990527
3,3,3,1387870225874444289,RT @glpluzern: An unserer heutigen Mitgliederv...,At today's general meeting we have adopted the...,NEGATIVE,0.979833
4,4,4,1387867915416899586,An unserer heutigen Mitgliederversammlung habe...,"At today's general meeting, we made the follow...",NEGATIVE,0.994707
...,...,...,...,...,...,...,...
3699,3699,3699,1391065898174275585,@Pat61st @AlainS1991 @xHascox @morvjn @FelixSc...,"Here, the cable car association joins the Co2 ...",NEGATIVE,0.964805
3700,3700,3700,1391062441203752960,"RT @roliemmer: Zu galuben, sich an der frische...","To galubize, to infect yourself with Corona in...",NEGATIVE,0.993001
3701,3701,3701,1391059095113945089,RT @Europakonzept: @Politos_politik @thomas_ae...,concept: The climate youths will be...,NEGATIVE,0.642797
3702,3702,3702,1391055055856578563,RT @CO2GesetzJa: Gipfeltreffen auf dem Titlis:...,Summit on the Titlis: and Franoise Jaquet from...,POSITIVE,0.890876


In [None]:
# Chart cell: passes `tweets` and a Vega-Lite v4 bar-chart spec to `_deepnote_run_altair`.
# NOTE(review): `_deepnote_run_altair` is neither defined nor imported in this notebook -
# presumably it is injected by the Deepnote runtime; verify before running elsewhere.
# The x, y and color "field" entries in the spec are empty strings, so no column is mapped.
_deepnote_run_altair(tweets, """{"$schema":"https://vega.github.io/schema/vega-lite/v4.json","mark":{"type":"bar","tooltip":true},"height":220,"autosize":{"type":"fit"},"data":{"name":"placeholder"},"encoding":{"x":{"field":"","type":"nominal","sort":null,"scale":{"type":"linear","zero":false}},"y":{"field":"","type":"nominal","sort":null,"scale":{"type":"linear","zero":true}},"color":{"field":"","type":"nominal","sort":null,"scale":{"type":"linear","zero":false}}}}""")

In [None]:
articles = pd.read_csv('output/articles_processed.csv')
# Hide index columns ('Unnamed: 0', 'Unnamed: 0.1', ...) left behind by saves that wrote the index.
articles = articles.drop(columns=[col for col in articles.columns if str(col).startswith('Unnamed')])
articles

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,medium,date,day_index,text,processed_text,language,characters_in_text,sentiment_label,sentiment_score,author
0,0,0,0,TA,2021-05-07,1,['SRG-Trendumfrage zu Abstimmungen: Es wird kn...,SRG trend survey on votes: It's getting scarce...,de,4695,NEGATIVE,-0.406751,unknown
1,1,1,1,TA,2021-06-04,1,"['Was, wenn das CO2-Gesetz scheitert?', 'Absti...",What if the CO2 law fails? Vote on the climate...,de,7966,NEGATIVE,-0.529833,Stefan H√§ne Martin L√§ubl
2,2,2,2,NZZ,2021-05-21,1,"['Die Klimajugend ist zur√ºck.', 'Weniger nett,...","The climate youth is back. Less nice, but with...",de,32314,POSITIVE,0.406295,Michael Schilliger Flurin Clal√ºna Christoph Ru...
3,3,3,3,TA,2021-05-21,3,['Aktionstag ¬´Strike for Future¬ª: 30‚Äô000 Mensc...,"Action day ""Strike for Future"": 30,000 people ...",de,4353,POSITIVE,0.132977,unknown
4,4,4,4,TA,2021-05-18,1,['Meinungen Kolumne Konzerne als versteckte Ab...,Opinions Column corporations as hidden voters ...,de,5004,NEGATIVE,-0.212632,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,196,196,84,LT,2021-04-25,1,['Le WWF va f√™ter discr√®tement ses 60 ans Pand...,The WWF will discreetly celebrate its 60th ann...,de,4295,POSITIVE,0.534339,unknown
197,197,197,85,LT,2021-05-03,2,"['Sur la r√©vision de la loi CO2, des camps √©co...","On the revision of the CO2 law, economic camps...",de,2548,NEGATIVE,-0.214974,unknown
198,198,198,86,LT,2021-05-18,1,"['La loi sur le CO2, une chance pour notre √©co...","The CO2 law, a chance for our economy It is ce...",de,4915,NEGATIVE,-0.086209,unknown
199,199,199,87,LT,2021-04-16,1,['OPINION Loi sur le CO2: le co√ªt de l‚Äôinactio...,OPINION CO2 law: the cost of inaction ROGER NO...,de,3981,POSITIVE,0.112598,unknown


## Determine the tweets' topics and arguments in articles
An unsupervised approach would probably not find the desired fine-grained topics (or arguments) in the tweets. Therefore, we would have to label the tweets manually and subsequently train a supervised, fine-tuned model.
The arguments in the articles could either be identified entirely by hand or with a fine-tuned Named Entity Recognition (NER) algorithm preceded by some manual labelling. The latter machine-learning approach is significantly more complex but highly scalable.
The open-source app [Label Studio](https://labelstud.io/) provides a powerful toolbox for human labelling. It is, however, necessary to develop a tailored interface, which would take a few days to complete. When completed, any person can use the app to label the tweets and articles.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=afd85eb4-e181-4004-a2b4-65d914f16510' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>