# TAHLR Week 6: Gaining early insights

Code notebook for TAHLR course at ISAW (Fall 2023) based on Albrecht et al. 2022 (Blueprints) Ch. 1: Gaining Early Insights from Textual Data

In [None]:
# # Installs
# ! pip install -U seaborn
# ! pip install -U textacy
# ! pip install wordcloud

In [None]:
# # Get data from remote location

# !mkdir -p ../data/blueprints
# !curl -LJO https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/un-general-debates/un-general-debates-blueprint.csv.gz --output-dir ../data/blueprints

In [None]:
# Imports

import random
import pandas as pd
import numpy as np

import seaborn as sns
sns.set_style("darkgrid")

import nltk
nltk.download('stopwords')

In [None]:
# Load data

file = "../data/blueprints/un-general-debates-blueprint.csv.gz"
df = pd.read_csv(file)
df.sample(2)

## Getting an overview of the data with Pandas

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.describe(include='O')

In [None]:
# Get selection

df.sample(frac=0.1)

In [None]:
df['length'] = df['text'].str.len()
df.head(5)

In [None]:
df.describe().T

In [None]:
df[['country', 'speaker']].describe(include='O').T

In [None]:
## Check for missing data

df.isna().sum()

In [None]:
# Fill na for speaker
# Assign the filled column back instead of calling fillna(inplace=True) on
# the df['speaker'] selection: the in-place call operates on an intermediate
# object (chained assignment), which emits a FutureWarning in pandas 2.x and
# does not update df once Copy-on-Write is enabled.

df['speaker'] = df['speaker'].fillna('unknown')

In [None]:
# Plotting value distributions

df['length'].plot(kind='box', vert=False, figsize=(10, 1))

In [None]:
df['length'].plot(kind='hist', bins=30, figsize=(10, 2))

In [None]:
where = df['country'].isin(['USA', 'FRA', 'GBR', 'CHN', 'RUS'])
cp = sns.catplot(data=df[where], x="country", y="length", kind='box', hue='country')

In [None]:
vp = sns.catplot(data=df[where], x="country", y="length", kind='violin', hue='country')

In [None]:
## Visualizing Developments Over Time

df.groupby('year').size().plot(title="Number of Countries", figsize=(5, 3));

In [None]:
df.groupby('year').agg({'length': 'mean'}).plot(title="Avg. Speech Length", ylim=(0,30000), figsize=(5, 3));

## Blueprint: Building a simple text preprocessing pipeline

In [None]:
# Tokenization

import regex as re

def tokenize(text):
    """Return every token found in ``text``, in order of appearance.

    A token is a run of word characters and hyphens that contains at least
    one letter, so purely numeric runs are not matched. The letter class used
    in the pattern needs the ``regex`` package, which this cell binds to the
    name ``re``.
    """
    token_pattern = r'[\w-]*\p{L}[\w-]*'
    return re.findall(token_pattern, text)

In [None]:
text = "Let's defeat SARS-CoV-2 together in 2020!"
tokens = tokenize(text)
print("|".join(tokens))

In [None]:
# Stop words

import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
def remove_stop(tokens):
    """Return ``tokens`` without the entries whose lowercase form is a stop word.

    Membership is checked against the module-level ``stopwords`` collection;
    the surviving tokens keep their original casing and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
pipeline = [str.lower, tokenize, remove_stop]

def prepare(text, pipeline):
    """Feed ``text`` through each callable of ``pipeline`` in order.

    The output of one step becomes the input of the next; the value produced
    by the last step is returned. An empty pipeline returns ``text`` unchanged.
    """
    result = text
    for step in pipeline:
        result = step(result)
    return result

In [None]:
text = "Let's defeat SARS-CoV-2 together in 2020!"
print(prepare(text, pipeline))

In [None]:
df['text'].apply(str.upper)

In [None]:
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)

In [None]:
df['num_tokens'] = df['tokens'].map(len)

In [None]:
df.head(5)

## Blueprint: Counting words with a Counter

In [None]:
from collections import Counter

tokens = tokenize("She likes my cats and my cats like my sofa.")

counter = Counter(tokens)
print(counter)

In [None]:
more_tokens = tokenize("She likes dogs and cats.")
counter.update(more_tokens)
print(counter)

In [None]:
counter = Counter()

df['tokens'].map(counter.update);

In [None]:
counter.most_common(10)

In [None]:
def count_words(df, column='tokens', preprocess=None, min_freq=2):
    """Count how often each token occurs across all rows of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    column : str
        Column to read. Each value must be an iterable of tokens, or an input
        accepted by ``preprocess`` when that is given.
    preprocess : callable, optional
        Applied to every value of ``column`` to obtain its tokens.
    min_freq : int
        Tokens occurring fewer than ``min_freq`` times are dropped.

    Returns
    -------
    pandas.DataFrame
        Indexed by token (index name ``token``) with a single ``freq``
        column, sorted by descending frequency.
    """
    counter = Counter()

    # Walk the rows explicitly instead of calling Series.map for its side
    # effects: map builds a throwaway Series of None and, for categorical
    # columns, runs the function once per category rather than once per row,
    # which would under-count repeated documents.
    for doc in df[column]:
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(tokens)

    # transform counter into a DataFrame
    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    freq_df = freq_df[freq_df['freq'] >= min_freq]
    freq_df.index.name = 'token'

    return freq_df.sort_values('freq', ascending=False)

In [None]:
freq_df = count_words(df)
freq_df.head(5)

In [None]:
count_words(df, column='text',
                preprocess=lambda text: re.findall(r"\w{10,}", text)).head(5)


## Blueprint: Creating a frequency diagram

In [None]:
ax = freq_df.head(15).plot(kind='barh', width=0.95)
ax.invert_yaxis()
ax.set(xlabel='Frequency', ylabel='Token', title='Top Words');

## Blueprint: Creating word clouds

In [None]:
from wordcloud import WordCloud
from matplotlib import pyplot as plt

text = df.query("year==2015 and country=='USA'")['text'].values[0]

wc = WordCloud(max_words=100, stopwords=stopwords)
wc.generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off");

In [None]:
def wordcloud(word_freq, title=None, max_words=200, stopwords=None):
    """Draw a word cloud from token frequencies using pyplot.

    Parameters
    ----------
    word_freq : pandas.Series or mapping
        Token -> frequency. For a Series, missing values are treated as 0.
    title : str, optional
        Passed to ``plt.title``.
    max_words : int
        Upper bound on the number of words handed to ``WordCloud``.
    stopwords : iterable, optional
        Tokens to leave out of the cloud.
    """
    wc = WordCloud(width=800, height=400,
                   background_color="black", colormap="Paired",
                   max_font_size=150, max_words=max_words)

    # convert Series into a plain token -> frequency mapping;
    # isinstance (rather than an exact type comparison) also accepts
    # Series subclasses
    if isinstance(word_freq, pd.Series):
        counter = Counter(word_freq.fillna(0).to_dict())
    else:
        counter = word_freq

    # filter stop words in frequency counter; build the set once so that each
    # membership test does not rescan a list
    if stopwords is not None:
        excluded = set(stopwords)
        counter = {token: freq for (token, freq) in counter.items()
                   if token not in excluded}
    wc.generate_from_frequencies(counter)

    plt.title(title)

    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")

In [None]:
freq_2015_df = count_words(df[df['year']==2015])
plt.figure()
wordcloud(freq_2015_df['freq'], max_words=100)

In [None]:
wordcloud(freq_2015_df['freq'], max_words=100, stopwords=freq_df.head(50).index)

## Blueprint: Ranking with TF-IDF

In [None]:
def compute_idf(df, column='tokens', preprocess=None, min_df=2):
    """Compute document frequency and inverse document frequency per token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    column : str
        Column to read. Each value must be an iterable of tokens, or an input
        accepted by ``preprocess`` when that is given.
    preprocess : callable, optional
        Applied to every value of ``column`` to obtain its tokens.
    min_df : int
        Tokens appearing in fewer than ``min_df`` documents are dropped.

    Returns
    -------
    pandas.DataFrame
        Indexed by token (index name ``token``) with columns ``df`` (number
        of documents containing the token) and ``idf``, computed as
        ``log(len(df) / df) + 0.1``.
    """
    n_docs = len(df)

    # count each token at most once per document; an explicit loop avoids
    # using Series.map purely for its side effects (which, for categorical
    # columns, would visit each category only once)
    counter = Counter()
    for doc in df[column]:
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(set(tokens))

    # create DataFrame and compute idf
    idf_df = pd.DataFrame.from_dict(counter, orient='index', columns=['df'])
    # take an explicit copy of the filtered rows so that adding the 'idf'
    # column below writes to an independent frame and cannot raise
    # SettingWithCopyWarning
    idf_df = idf_df[idf_df['df'] >= min_df].copy()
    idf_df['idf'] = np.log(n_docs / idf_df['df']) + 0.1
    idf_df.index.name = 'token'
    return idf_df

In [None]:
idf_df = compute_idf(df)

In [None]:
idf_df

In [None]:
freq_df['tfidf'] = freq_df['freq'] * idf_df['idf']

In [None]:
freq_df['tfidf'].head()

In [None]:
freq_1970 = count_words(df[df['year'] == 1970])
freq_2015 = count_words(df[df['year'] == 2015])

freq_1970['tfidf'] = freq_1970['freq'] * idf_df['idf']
freq_2015['tfidf'] = freq_2015['freq'] * idf_df['idf']

In [None]:
wordcloud(freq_1970['freq'], title='1970 - TF',
          stopwords=['twenty-fifth', 'twenty-five'])

In [None]:
wordcloud(freq_2015['freq'], title='2015 - TF',
          stopwords=['seventieth'])

In [None]:
wordcloud(freq_1970['tfidf'], title='1970 - TF-IDF',
          stopwords=['twenty-fifth', 'twenty-five', 'twenty', 'fifth'])

In [None]:
wordcloud(freq_2015['tfidf'], title='2015 - TF-IDF',
          stopwords=['seventieth'])

## Blueprint: Finding a keyword-in-context

In [None]:
# from textacy.text_utils import KWIC

from textacy.extract import keyword_in_context

def kwic(doc_series, keyword, window=35, print_samples=5):
    """Collect keyword-in-context matches for ``keyword`` over ``doc_series``.

    Parameters
    ----------
    doc_series : iterable of str
        Documents to search (e.g. a pandas Series of texts).
    keyword : str
        Passed to textacy's ``keyword_in_context``; matched case-insensitively.
    window : int
        Passed as ``window_width``, the amount of context kept on each side
        of a match.
    print_samples : int or None
        Number of random matches to print. With ``None`` or ``0`` nothing is
        printed and the full list of matches is returned instead.

    Returns
    -------
    list or None
        The list of matches when ``print_samples`` is ``None`` or ``0``;
        otherwise ``None`` (the samples are only printed).
    """
    # Gather matches with an explicit loop rather than Series.map, which was
    # used only for its side effects and, for categorical Series, would visit
    # each distinct text once instead of every row.
    kwic_list = []
    for text in doc_series:
        kwic_list.extend(keyword_in_context(text, keyword, ignore_case=True,
                                            window_width=window))

    if print_samples is None or print_samples == 0:
        return kwic_list

    k = min(print_samples, len(kwic_list))
    print(f"{k} random samples out of {len(kwic_list)} "
          f"contexts for '{keyword}':")
    for left, match, right in random.sample(kwic_list, k):
        # flatten newlines/tabs in the context so each sample prints on one line
        print(re.sub(r'[\n\t]', ' ', left) + '  ' +
              match + '  ' +
              re.sub(r'[\n\t]', ' ', right))

In [None]:
kwic(df[df['year'] == 2015]['text'], 'sdgs', print_samples=5)

## Blueprint: Analyzing n-grams

In [None]:
def ngrams(tokens, n=2, sep=' '):
    """Return all runs of ``n`` consecutive tokens, each joined with ``sep``.

    The result is empty when ``tokens`` has fewer than ``n`` elements.
    """
    # n copies of the token list, the i-th one starting at offset i; zipping
    # them yields one tuple per window and stops at the shortest copy
    shifted = [tokens[offset:] for offset in range(n)]
    return [sep.join(window) for window in zip(*shifted)]

text = "the visible manifestation of the global climate change"
tokens = tokenize(text)
print("\n".join(ngrams(tokens, 2)))

In [None]:
def ngrams(tokens, n=2, sep=' ', stopwords=frozenset()):
    """Return runs of ``n`` consecutive tokens that contain no stop word.

    Each n-gram is joined with ``sep``. An n-gram is dropped as soon as any
    of its tokens is in ``stopwords``. The default is an immutable empty set,
    so the shared default object can never be modified by accident.
    """
    return [sep.join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)])
            if not any(t in stopwords for t in ngram)]

print("Bigrams:", "|".join(ngrams(tokens, 2, stopwords=stopwords)))
print("Trigrams:", "|".join(ngrams(tokens, 3, stopwords=stopwords)))

In [None]:
df['bigrams'] = df['text'].apply(prepare, pipeline=[str.lower, tokenize]) \
                          .apply(ngrams, n=2, stopwords=stopwords)

count_words(df, 'bigrams').head(5)

In [None]:
# concatenate existing IDF DataFrame with bigram IDFs
# Drop any rows for these bigrams that a previous run of this cell already
# appended, so re-running the cell does not leave duplicate index labels in
# idf_df (duplicates break the index-aligned TF-IDF computation below).
bigram_idf_df = compute_idf(df, 'bigrams', min_df=10)
idf_df = pd.concat([idf_df[~idf_df.index.isin(bigram_idf_df.index)], bigram_idf_df])

freq_df = count_words(df[df['year'] == 2015], 'bigrams')
freq_df['tfidf'] = freq_df['freq'] * idf_df['idf']
wordcloud(freq_df['tfidf'], title='all bigrams', max_words=50)

## Blueprint: Comparing frequencies across time intervals and categories

In [None]:
def count_keywords(tokens, keywords):
    """Count how often each keyword occurs in ``tokens``.

    Returns a list of counts in the same order as ``keywords``; a keyword
    that does not occur gets 0.
    """
    # test membership against a set so each token costs one hash lookup
    # instead of a scan over the keyword list
    wanted = set(keywords)
    counter = Counter(t for t in tokens if t in wanted)
    return [counter[k] for k in keywords]

In [None]:
keywords = ['nuclear', 'terrorism', 'climate', 'freedom']
tokens = ['nuclear', 'climate', 'climate', 'freedom', 'climate', 'freedom']

print(count_keywords(tokens, keywords))

In [None]:
def count_keywords_by(df, by, keywords, column='tokens'):
    """Sum keyword counts per group of ``by``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    by : str or list of str
        Column(s) of ``df`` to group by.
    keywords : list
        Keywords to count; they become the columns of the result.
    column : str
        Column of ``df`` holding the token list of each document.

    Returns
    -------
    pandas.DataFrame
        One row per group, one column per keyword, sorted by ``by``.
    """
    freq_matrix = df[column].apply(count_keywords, keywords=keywords)
    # Build the count frame on df's own index. With a fresh 0..n-1 index the
    # assignment below, which aligns on index labels, would attach missing or
    # wrong group keys whenever df does not have the default index
    # (e.g. a filtered frame).
    freq_df = pd.DataFrame(freq_matrix.tolist(), columns=keywords, index=df.index)
    freq_df[by] = df[by] # copy the grouping column(s)

    return freq_df.groupby(by=by).sum().sort_values(by)

In [None]:
freq_df = count_keywords_by(df, by='year', keywords=keywords)

In [None]:
freq_df.head()

In [None]:
freq_df.plot(kind='line', figsize=(10,3));

In [None]:
keywords = ['terrorism', 'terrorist', 'nuclear', 'war', 'oil',
            'syria', 'syrian', 'refugees', 'migration', 'peacekeeping',
            'humanitarian', 'climate', 'change', 'sustainable', 'sdgs']

freq_df = count_keywords_by(df, by='year', keywords=keywords)

# compute relative frequencies based on total number of tokens per year
freq_df = freq_df.div(df.groupby('year')['num_tokens'].sum(), axis=0)
# apply square root as sublinear filter for better contrast
freq_df = freq_df.apply(np.sqrt)

plt.figure(figsize = (10,5))
ax = sns.heatmap(data=freq_df.T,
            xticklabels=True, yticklabels=True, cbar=False, cmap="Reds");

In [None]:
## Albrecht et al. "things to consider" for keyword analysis
# - Prefer relative frequencies for any kind of comparison.
# - Be careful with the interpretation of frequency diagrams based on keyword lists.
# - Use sublinear scaling.