In [3]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sklearn as sk
import tarfile
import io
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [22]:
def read_tgz_to_dataframe(file_path):
    """Load every line of every ``.txt`` member of a gzip-compressed tar archive.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.tgz`` / ``.tar.gz`` archive.

    Returns
    -------
    pandas.DataFrame
        One row per line of text, with columns ``text`` (the line) and
        ``filename`` (the member's path inside the archive). If the archive
        holds no readable ``.txt`` member, an empty DataFrame without
        columns is returned.
    """
    frames = []

    with tarfile.open(file_path, 'r:gz') as archive:
        for entry in archive.getmembers():
            # Only regular files with a .txt extension (any letter case) are read
            is_text_file = entry.isfile() and entry.name.lower().endswith('.txt')
            if not is_text_file:
                continue

            handle = archive.extractfile(entry)
            if handle is None:
                continue

            # Bytes that are not valid UTF-8 are silently dropped while decoding
            raw_text = handle.read().decode('utf-8', errors='ignore')

            # One row per '\n'-separated line, tagged with the member path
            frame = pd.DataFrame({'text': raw_text.split('\n')})
            frame['filename'] = entry.name
            frames.append(frame)

    # Nothing matched: hand back an empty frame
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)
    

In [29]:
# Read the corpus archive into a line-per-row DataFrame (columns: text, filename)
df = read_tgz_to_dataframe(
    file_path="./FDS_UNdata/UN General Debate Corpus/UNGDC_1946-2023.tgz",
)

In [40]:
# Extract country ISO, session number, and year from the archive member path.
# The pattern expects ".../<3 uppercase letters>_<2 digits>_<4 digits>"; rows whose
# filename does not match it get NaN in all three new columns.
df[['country ISO', 'session number', 'year']] = df['filename'].str.extract(r'.*/([A-Z]{3})_(\d{2})_(\d{4})')

# Drop the rows that could not be parsed BEFORE casting: astype(int) raises
# ValueError on NaN, so filtering afterwards (as this cell previously did) never ran.
# Author's note: the dropped rows come from a few files of the very first years
# that could not be handled properly (< 2% of total).
df = df.dropna(subset=['country ISO', 'session number', 'year']).copy()

# Convert session number and year to integer types (safe now that no NaN remains)
df['session number'] = df['session number'].astype(int)
df['year'] = df['year'].astype(int)


In [4]:
# Fetch the NLTK data packages this notebook relies on
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Keyword vocabularies, one list per theme
economic_keywords = [
    'economy',
    'trade',
    'finance',
    'budget',
    'investment',
    'market',
    'gdp',
    'inflation',
]
social_keywords = [
    'education',
    'healthcare',
    'welfare',
    'equality',
    'poverty',
    'human rights',
    'social justice',
]
# Further themes get their own keyword list here

# Preprocess text
_PUNCTUATION_CHARS = frozenset(string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stopword set, reading it from NLTK only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Lowercase and tokenize ``text``, then drop punctuation and stopwords.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        The lowercase tokens that are neither English stopwords nor made up
        solely of punctuation characters.
    """
    # Convert to lowercase and tokenize
    tokens = word_tokenize(text.lower())

    # The stopword list does not change between calls, so it is loaded once
    # instead of once per row when this function is applied to a DataFrame.
    stop_words = _english_stop_words()

    # A token is punctuation when every character is a punctuation character.
    # The previous substring test against string.punctuation let multi-character
    # tokens such as "``", "''", "..." or "--" through.
    return [
        word
        for word in tokens
        if word not in stop_words and not _PUNCTUATION_CHARS.issuperset(word)
    ]

# Scoring function
def calculate_score(tokens, keywords, max_score=10):
    """Count keyword occurrences in ``tokens``, capped at ``max_score``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one text, in their original order.
    keywords : iterable of str
        Keywords to look for. A keyword containing whitespace (for example
        ``'human rights'``) is treated as a phrase and matches a run of
        consecutive tokens equal to its words; previously such keywords could
        never match because each token was compared to the whole phrase.
    max_score : int, optional
        Upper bound for the returned score (default 10).

    Returns
    -------
    int
        Number of keyword hits, or ``max_score`` if there are more hits.
    """
    tokens = list(tokens)

    # Set membership instead of scanning the keyword list for every token
    exact_keywords = set(keywords)

    # Multi-word keywords, split into the token sequence they should match
    phrases = {
        tuple(keyword.split())
        for keyword in exact_keywords
        if isinstance(keyword, str) and len(keyword.split()) > 1
    }

    # Single tokens equal to a keyword (unchanged behaviour)
    keyword_count = sum(1 for word in tokens if word in exact_keywords)

    # Runs of consecutive tokens equal to a multi-word keyword
    for phrase in phrases:
        width = len(phrase)
        keyword_count += sum(
            1
            for start in range(len(tokens) - width + 1)
            if tuple(tokens[start:start + width]) == phrase
        )

    # Cap the score at max_score
    return min(keyword_count, max_score)

# Tokenize and clean every row of the 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)

# One score column per theme; extend this mapping to score further themes
theme_keywords = {
    'economic_score': economic_keywords,
    'social_issues_score': social_keywords,
}
for score_column, theme_words in theme_keywords.items():
    # calculate_score receives each token list first, then the theme's keyword list
    df[score_column] = df['processed_text'].apply(calculate_score, args=(theme_words,))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\EduardCP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\EduardCP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


NameError: name 'df' is not defined