## Load functions

### File conversion

In [2]:
# File conversion functions
from countryinfo import CountryInfo
import pycountry 
import pandas as pd 
# import spacy
import re
import os 

def convert_to_text(directory_path, pdf_output_dir='upr-files/'):
    """
    Converts every PDF, DOCX and DOC file found in a directory to a UTF-8 text file.

    Parameters:
        directory_path (str): Directory containing the documents to convert
        pdf_output_dir (str): Directory where text extracted from PDF files is
            saved (created when missing). DOCX and DOC output is written next
            to the source file inside directory_path.

    Returns:
        None

    Notes:
        extract_text, docx and win32com are used here but are not imported in
        the visible part of this file -- presumably they are imported
        elsewhere (TODO confirm). The .doc branch drives Microsoft Word
        through COM, so it only works where Word and pywin32 are available.
    """
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        print(filename)
        print(file_path)

        # Compare extensions case-insensitively so 'REPORT.PDF' is not skipped.
        lowered = filename.lower()

        if lowered.endswith('.pdf'):
            # Read the PDF from the directory being scanned rather than from a
            # hard-coded folder, so the function honours directory_path.
            text = extract_text(file_path)
            if pdf_output_dir:
                os.makedirs(pdf_output_dir, exist_ok=True)
            text_path = os.path.join(pdf_output_dir, filename[:-4] + '.txt')
            with open(text_path, 'w', encoding='utf-8') as text_file:
                text_file.write(text)
            print(f'{filename} converted to {filename[:-4]}.txt')

        elif lowered.endswith('.docx'):
            doc = docx.Document(file_path)
            # Only paragraph text is exported, one paragraph per line.
            text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
            text_path = os.path.join(directory_path, filename[:-5] + '.txt')
            with open(text_path, 'w', encoding='utf-8') as text_file:
                text_file.write(text)
            print(f'{filename} converted to {filename[:-5]}.txt')

        elif lowered.endswith('.doc'):
            # Word resolves relative paths against its own working directory,
            # not this process's, so always hand it absolute paths.
            doc_path = os.path.abspath(file_path)
            txt_path = doc_path[:-4] + '.txt'
            word_app = win32com.client.Dispatch('Word.Application')
            try:
                word_app.Visible = False
                doc = word_app.Documents.Open(doc_path)
                try:
                    doc.SaveAs(txt_path, FileFormat=win32com.client.constants.wdFormatText)
                finally:
                    doc.Close()
            finally:
                # Always shut Word down, otherwise a failed conversion leaves
                # an invisible WINWORD process running.
                word_app.Quit()
            print(f'{filename} converted to {os.path.basename(txt_path)}')


def fix_document(path):
    """
    Normalizes the whitespace of a UTF-8 text file in place.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space
    and leading/trailing whitespace is dropped. The file at `path` is
    overwritten with the result.

    Parameters:
        path (str): Path to the text file to clean

    Returns:
        str: The cleaned text that was written back to the file
    """
    with open(path, 'r', encoding='utf-8') as source:
        raw = source.read()

    # str.split() without arguments splits on any whitespace run and ignores
    # leading/trailing whitespace, so re-joining yields single-spaced text.
    tokens = raw.split()
    cleaned = ' '.join(tokens)

    with open(path, 'w', encoding='utf-8') as target:
        target.write(cleaned)

    return cleaned
    
def fix_documents(paths):
    """
    Normalizes the whitespace of several text files in place.

    Parameters:
        paths (iterable of str): Paths of the text files to clean

    Returns:
        list of str: The cleaned text of each file, in the order of `paths`
    """
    # Return the collected texts; a bare `return` would hand callers None.
    return [fix_document(path) for path in paths]
# nlp = spacy.load("en_core_web_trf")
# doc = nlp(text)

def convert_excel_to_csv(excel_file_path, output_dir):
    """
    Converts an Excel file to CSV, writing one CSV file per sheet.

    Each sheet is saved as "<excel file name without extension>_<sheet name>.csv"
    inside output_dir, without the DataFrame index. Any error is reported with
    print() instead of being raised, so a batch run can carry on.

    Parameters:
        excel_file_path (str): Path to the Excel file
        output_dir (str): Directory where the CSV files will be saved
            (created when missing)

    Returns:
        None
    """
    try:
        base_name = os.path.splitext(os.path.basename(excel_file_path))[0]

        # The context manager closes the workbook handle even when a sheet
        # fails to convert.
        with pd.ExcelFile(excel_file_path) as xls:
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # Loop through each sheet in the Excel file
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name)

                # Create a CSV file name based on the Excel file name and sheet name
                csv_file_path = os.path.join(output_dir, f"{base_name}_{sheet_name}.csv")

                # Save the DataFrame to CSV
                df.to_csv(csv_file_path, index=False)

        print(f"Successfully converted {excel_file_path} to CSV.")
    except Exception as e:
        # Deliberate best-effort: report the failure and return.
        print(f"An error occurred while converting {excel_file_path}: {e}")

def mass_convert_excel_to_csv(input_dir, output_dir):
    """
    Converts all Excel files in the input directory to CSV files and saves them in the output directory.

    Files are selected by extension (.xls or .xlsx, compared
    case-insensitively). Only the top level of input_dir is scanned.

    Parameters:
        input_dir (str): Directory containing Excel files
        output_dir (str): Directory where the CSV files will be saved
            (created when missing)

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        # Names starting with '~$' are lock files Excel creates while a
        # workbook is open; they are not readable workbooks.
        if filename.startswith('~$'):
            continue
        if filename.lower().endswith(('.xls', '.xlsx')):
            excel_file_path = os.path.join(input_dir, filename)
            convert_excel_to_csv(excel_file_path, output_dir)



In [4]:
# Load the treaty dataset from CSV and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index saved into the CSV -- consider index_col=0 if so (confirm).
treaty_data = pd.read_csv('treaty_data/treaty_data.csv')
treaty_data.head()

Unnamed: 0,cow,country,year,title,agrmtno,core,type,issue1,issue2,createyear,entryforceyear,signyear,ratyear,involvedIO,statereport,indcomplaint,statecomplaint
0,2,United States,1947,Convention Relating to the Status of Stateless...,10,0,Convention,Stateless/Migrant,,1954,1960,-88,-88,,,,
1,2,United States,1948,Convention Relating to the Status of Stateless...,10,0,Convention,Stateless/Migrant,,1954,1960,-88,-88,,,,
2,2,United States,1949,Convention Relating to the Status of Stateless...,10,0,Convention,Stateless/Migrant,,1954,1960,-88,-88,,,,
3,2,United States,1950,Convention Relating to the Status of Stateless...,10,0,Convention,Stateless/Migrant,,1954,1960,-88,-88,,,,
4,2,United States,1951,Convention Relating to the Status of Stateless...,10,0,Convention,Stateless/Migrant,,1954,1960,-88,-88,,,,


### Text analysis

In [None]:
# Helper functions used to generate the datasets

def is_valid_country(country_name):
    """
    Checks whether CountryInfo has data for the given country name.

    Parameters:
        country_name (str): Candidate country name

    Returns:
        bool: True when CountryInfo(country_name).info() returns something
            other than None; False when it returns None or the lookup raises.
    """
    try:
        return CountryInfo(country_name).info() is not None
    except Exception:
        # Best-effort check: any lookup failure means "not a known country".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        return False

def extract_country_treaty(target, text, treaties):
    """
    Extracts (country, treaty) mentions from text and matches them to known treaties.

    The text is split on semicolons. Within each fragment, the text in front
    of the first parenthesised group is taken as the mention, and the content
    of every parenthesised group found by the pattern is kept as a source
    country when is_valid_country() accepts it. Each mention is then compared,
    case-insensitively, against every treaty query.

    Parameters:
        target (str): Value stored in the 'Target Country' column of every row
        text (str): Text to scan
        treaties (DataFrame): Must provide a 'Treaty' column (abbreviation) and
            a 'Query' column (substring searched for in each mention)

    Returns:
        DataFrame: Columns 'Target Country', 'Source Country', 'Treaty',
            'Full Name' (the matching query) and 'Mention'; one row per
            (query, mention) match, ordered by query and then by mention.
    """
    # Pattern that extracts the leading text and the parenthesised groups
    pattern = re.compile(r'(.*?)(\((.*?)\))+')

    # (country, mention, lower-cased mention) for every accepted country
    mentions = []
    for sentence in text.split(";"):
        matches = pattern.findall(sentence)
        if not matches:
            continue
        mention = matches[0][0].strip()
        mention_lower = mention.lower()
        for _, _, country in matches:
            if country and is_valid_country(country):
                mentions.append((country, mention, mention_lower))

    results = []
    # Pair each abbreviation with its query by position. This avoids rebuilding
    # and scanning a list for every hit, works for any DataFrame index, and
    # keeps duplicate queries attached to their own abbreviation.
    for abbreviation, query in zip(treaties['Treaty'], treaties['Query']):
        # Blank CSV cells load as NaN (a float); an empty query would match
        # every mention. Skip both.
        if not isinstance(query, str) or not query.strip():
            continue
        needle = query.lower()
        for country, mention, mention_lower in mentions:
            if needle in mention_lower:
                results.append((target, country, abbreviation, query, mention))

    # Creating a DataFrame with the extracted data
    return pd.DataFrame(
        results,
        columns=['Target Country', 'Source Country', 'Treaty', 'Full Name', 'Mention'],
    )

def count_countries(text):
    """
    Counts how often single-word country names occur in a text.

    The text is tokenised with the regular expression \\b\\w+\\b and each token
    is looked up with pycountry.countries.get(name=...). Because tokens are
    single words, only one-word names can be found.

    Parameters:
        text (str): Text to scan

    Returns:
        dict: Maps the pycountry country name to its number of occurrences
    """
    tally = {}

    # Tokens are runs of word characters, so punctuation never reaches the lookup.
    for token in re.findall(r'\b\w+\b', text):
        try:
            found = pycountry.countries.get(name=token)
            if not found:
                continue
            label = found.name
            tally[label] = tally.get(label, 0) + 1
        except LookupError:
            # Treat a failed lookup as "not a country".
            continue

    return tally


def count_countries_without_regex(text):
    """
    Counts single-word country names in a text using plain string splitting.

    The text is split on whitespace; each word is lower-cased, stripped of
    leading/trailing '.', ',', '!' and '?' characters, and looked up with
    pycountry.countries.get(name=...).

    Parameters:
        text (str): Text to scan

    Returns:
        dict: Maps the pycountry country name to its number of occurrences
    """
    occurrences = {}

    for raw_word in text.split():
        # Lower-case and trim common sentence punctuation before the lookup
        candidate = raw_word.lower().strip(".,!?")

        try:
            hit = pycountry.countries.get(name=candidate)
            if hit:
                key = hit.name
                previous = occurrences[key] if key in occurrences else 0
                occurrences[key] = previous + 1
        except LookupError:
            # Treat a failed lookup as "not a country".
            continue

    return occurrences




### Process Documents

In [None]:
# Clean one converted report, extract the treaty mentions addressed to the
# target country, and count the country names that appear in it.
# os.path.join replaces the 'upr-files\A...' literal: '\A' is an invalid
# escape sequence and a backslash separator only works on Windows.
path = os.path.join('upr-files', 'A_HRC_53_11-EN.txt')
# treaties.csv must provide the 'Treaty' and 'Query' columns read by
# extract_country_treaty.
treaties = pd.read_csv('treaties.csv', encoding='utf-8')
# fix_document rewrites the file in place and returns the cleaned text.
text = fix_document(path)
mentions = extract_country_treaty('Korea', text, treaties)
count_countries_without_regex(text)

In [None]:
# Display the cleaned document text
text

In [None]:
# Save the extracted treaty mentions to CSV, then preview the first rows.
# to_csv is called with its default index setting, so the row index is
# written as an extra leading column.
mentions.to_csv('sample_results.csv')
mentions.head()

In [None]:
# fix_documents requires the list of files to clean; calling it with no
# argument raises TypeError. Clean every .txt file in the upr-files folder
# (each file is rewritten in place); sorted() keeps the order deterministic.
text_paths = sorted(
    os.path.join('upr-files', name)
    for name in os.listdir('upr-files')
    if name.endswith('.txt')
)
texts = fix_documents(text_paths)
    

### GDELT extraction