In [1]:
country = "Denmark"
path2SP = f"/Users/carlostoruno/OneDrive - World Justice Project/EU Subnational/EU-S Data/Automated Qualitative Checks/Data"

country_data = f"{path2SP}/data-summarization/{country}/{country}_master.parquet.gzip"

In [3]:
import pandas as pd
import gensim
from gensim import corpora

import spacy
import re
import pyLDAvis
import pyLDAvis.gensim



In [4]:
df = pd.read_parquet(country_data)

In [5]:
subset_pillar_1 = (
    df.copy()
    .loc[df["associated_pillar"].isin(["Pillar 1"])]
)

In [6]:
# Load spaCy model
nlp = spacy.load('en_core_web_lg') # Remember to download the model by: $sudo python -m spacy download en_core_web_lg

def process_text(text, nlp=nlp):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    
    # Process text with spaCy
    doc = nlp(text)
    tokens = [token for token in doc if not token.is_stop and token.is_alpha]
    lemmatized_tokens = [token.lemma_.lower() for token in tokens]

    return " ".join(lemmatized_tokens)

In [7]:
preproc_texts = [process_text(article) for article in subset_pillar_1["content_trans"].to_list()]

In [8]:
preproc_tokens = [text.split() if isinstance(text, str) else text for text in preproc_texts]

# Create a dictionary from the processed data
dictionary = corpora.Dictionary(preproc_tokens)

# Create a bag-of-words representation of the processed data
corpus = [dictionary.doc2bow(text) for text in preproc_tokens]

In [12]:
# Train LDA model
num_topics = 4
lda_model = gensim.models.ldamodel.LdaModel(corpus, 
                                            num_topics=num_topics, 
                                            id2word=dictionary, 
                                            passes=15)

In [13]:
# Print the topics
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.011*"say" + 0.010*"minister" + 0.010*"government" + 0.009*"danish" + 0.008*"denmark" + 0.006*"party" + 0.006*"people" + 0.004*"time" + 0.004*"state" + 0.004*"year"')
(1, '0.007*"child" + 0.007*"say" + 0.007*"year" + 0.006*"case" + 0.005*"read" + 0.005*"ago" + 0.005*"new" + 0.004*"time" + 0.004*"school" + 0.004*"danish"')
(2, '0.010*"tax" + 0.008*"say" + 0.008*"government" + 0.007*"case" + 0.007*"danish" + 0.007*"law" + 0.006*"billion" + 0.006*"year" + 0.006*"dkk" + 0.006*"company"')
(3, '0.030*"police" + 0.023*"case" + 0.019*"year" + 0.017*"court" + 0.014*"man" + 0.013*"old" + 0.010*"say" + 0.007*"copenhagen" + 0.007*"charge" + 0.006*"accord"')


In [14]:
# Visualize the LDA topics
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(lda_display, local=False)

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [28/Oct/2024 10:04:08] "GET / HTTP/1.1" 200 -



stopping Server...


In [46]:
doc_topics = [lda_model[doc] for doc in corpus]

# Identify the dominant topic for each document
dominant_topics = []
for i, doc_topic in enumerate(doc_topics):
    dominant_topic = sorted(doc_topic, key=lambda x: x[1], reverse=True)[0][0]
    dominant_topics.append((i, dominant_topic, preproc_texts[i]))

# Create a DataFrame with document index, dominant topic, and text
df_dominant_topics = pd.DataFrame(dominant_topics, columns=['Doc_ID', 'Dominant_Topic', 'Text'])

# View sample documents for each topic
num_samples_per_topic = 5  # You can adjust this number
sampled_docs = df_dominant_topics.groupby('Dominant_Topic').apply(lambda x: x.sample(num_samples_per_topic), include_groups=False)
sampled_docs = sampled_docs.reset_index(drop=False)

In [47]:
sampled_docs

Unnamed: 0,Dominant_Topic,level_1,Doc_ID,Text
0,0,447,447,woman unsafe nightlife man subject important u...
1,0,1416,1416,fact legislation visitation section chapter ad...
2,0,1117,1117,copenhagen district court sentence swedish man...
3,0,838,838,dina ali el nazzal shout loudly living room ha...
4,0,984,984,advertisement debate debate post debate post p...
5,1,943,943,year old man sentence murder young woman køge ...
6,1,145,145,robbery jewelery store grønnegade copenhagen t...
7,1,957,957,judge court aarhus decide monday ground remand...
8,1,202,202,courtroom court frederiksberg year old man bri...
9,1,1042,1042,large police action thursday morning result pe...
