In [86]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sklearn as sk
import tarfile
import io
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string

In [87]:
def read_tgz_to_dataframe(file_path):
    # Open the .tgz file
    with tarfile.open(file_path, 'r:gz') as tar:
        # Initialize an empty list to store the data
        data = []
        
        # Iterate through each file in the archive
        for member in tar.getmembers():
            if member.isfile() and member.name.lower().endswith('.txt'):
                # Extract the file content
                f = tar.extractfile(member)
                if f is not None:
                    # Read the content and decode it to string, ignoring problematic characters
                    content = f.read().decode('utf-8', errors='ignore')
                    
                    # Process the content
                    # For example, split into lines and create a DataFrame
                    # lines = content.split('\n')
                    df = pd.DataFrame({'text': content}, index=[0])
                    
                    # Add filename as a column
                    df['filename'] = member.name
                    
                    # Append the data to our list
                    data.append(df)
    
    # Combine all dataframes
    if data:
        final_df = pd.concat(data, ignore_index=True)
        return final_df
    else:
        return pd.DataFrame()  # Return an empty DataFrame if no text files were found
    

In [88]:
df = read_tgz_to_dataframe("./FDS_UNdata/UN General Debate Corpus/UNGDC_1946-2023.tgz")

In [89]:
# Extract country ISO, session number, and year using regex
df[['country ISO', 'session number', 'year']] = df['filename'].str.extract(r'.*/([A-Z]{3})_(\d{2})_(\d{4})')

# Convert session number and year to integer types
df['session number'] = pd.to_numeric(df['session number'])
df['year'] = pd.to_numeric(df['year'])

# Some files contain a header for MAC OS, we ingore these
df = df[df['country ISO'].notna() & df['session number'].notna() & df['year'].notna()].copy()

In [90]:
# Download necessary NLTK data
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download()

def find_related_words(word, n=100):
    related_words = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            related_words.add(lemma.name().replace('_', ' '))
        for hyp in syn.hypernyms():
            related_words.update([lemma.name().replace('_', ' ') for lemma in hyp.lemmas()])
        for hypo in syn.hyponyms():
            related_words.update([lemma.name().replace('_', ' ') for lemma in hypo.lemmas()])
    
    return list(related_words)[:n]

In [91]:
education_keywords = find_related_words("education")

In [92]:
# Define keywords for each theme
education_keywords = find_related_words("education")
economic_keywords = find_related_words("economy")
social_keywords = find_related_words("social policy")
foreign_affairs_keywords = find_related_words("foreign affairs")
environment_keywords = find_related_words("environment")
# Add more keyword lists for other themes

# Preprocess text
def preprocess_text(text):
    # Convert to lowercase and tokenize
    tokens = word_tokenize(text.lower())
    # Remove punctuation and stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in string.punctuation and word not in stop_words]
    return tokens

# Scoring function
def calculate_score(tokens, keywords):
    score = sum(1 for word in tokens if word in keywords)
    return score

# Apply preprocessing to the 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)

# Create new columns with scores
df['economic_score'] = df['processed_text'].apply(lambda x: calculate_score(x, economic_keywords))
df['economic_score_per_thousand'] = df['economic_score'] * 1000 / len(df['text'])

df['social_issues_score'] = df['processed_text'].apply(lambda x: calculate_score(x, social_keywords))
df['social_issues_score_per_thousand'] = df['social_issues_score'] * 1000 / len(df['text'])

df['eduation_score'] = df['processed_text'].apply(lambda x: calculate_score(x, education_keywords))
df['eduation_score_per_thousand'] = df['eduation_score'] * 1000 / len(df['text'])

df['foreign_affairs_score'] = df['processed_text'].apply(lambda x: calculate_score(x, foreign_affairs_keywords))
df['foreign_affairs_score_per_thousand'] = df['foreign_affairs_score'] * 1000 / len(df['text'])

df['environment_score'] = df['processed_text'].apply(lambda x: calculate_score(x, environment_keywords))
df['environment_score_per_thousand'] = df['environment_score'] * 1000 / len(df['text'])


In [93]:
df_grouped = df.groupby(by=["country ISO"]).mean().reset_index()

In [94]:
df_grouped.sort_values(by="economic_score_per_thousand", ascending=False) 

Unnamed: 0,country ISO,session number,year,economic_score,economic_score_per_thousand,social_issues_score,social_issues_score_per_thousand,eduation_score,eduation_score_per_thousand,foreign_affairs_score,foreign_affairs_score_per_thousand,environment_score,environment_score_per_thousand
86,JAM,47.868852,1992.868852,10.803279,1.005237,0.0,0.0,3.704918,0.344740,0.0,0.0,9.377049,0.872527
149,RUS,39.688312,1984.688312,8.558442,0.796356,0.0,0.0,9.285714,0.864028,0.0,0.0,17.051948,1.586671
24,BRB,51.314815,1996.314815,8.555556,0.796088,0.0,0.0,3.388889,0.315333,0.0,0.0,6.333333,0.589312
73,GUY,50.444444,1995.444444,8.481481,0.789195,0.0,0.0,4.444444,0.413552,0.0,0.0,9.000000,0.837443
31,CHL,39.688312,1984.688312,8.402597,0.781855,0.0,0.0,4.194805,0.390323,0.0,0.0,8.935065,0.831401
...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,QAT,52.000000,1997.000000,2.716981,0.252813,0.0,0.0,2.018868,0.187854,0.0,0.0,6.981132,0.649589
161,SSD,71.750000,2016.750000,2.416667,0.224869,0.0,0.0,4.166667,0.387705,0.0,0.0,4.333333,0.403213
54,ERI,63.000000,2008.000000,2.000000,0.186098,0.0,0.0,1.096774,0.102054,0.0,0.0,3.322581,0.309164
25,BRN,58.243243,2003.243243,1.675676,0.155920,0.0,0.0,1.486486,0.138316,0.0,0.0,3.054054,0.284177
