In [1]:
import pandas as pd
import numpy as np
import os 

In [2]:
CORPORA_DIR_PATH = os.path.join(os.path.pardir, "corpora", "UN General Debate Corpus", "TXT")
CORPORA_DIR_PATH

'../corpora/UN General Debate Corpus/TXT'

In [3]:
def list_files(dir_path):
    files = []

    for root, dirs, filenames in os.walk(dir_path):
        for f in filenames:
            # add to list file names with their last directories (without dir_path)
            if f.endswith(".txt"):
                files.append(os.path.join(os.path.basename(root), f))
    return files


txt_files = list_files(CORPORA_DIR_PATH)

### Adding paths to text files 

It is handled differently for speeches pre-2023 and for speeches from 2023. 

- For older speeches, we use the original metadata with ISO codes corresponding to the file names. 
- For newer speeches, we use ISO codes saved during the scraping process. 

In [4]:
metadata = pd.read_csv("enhanced_metadata.csv")
metadata

Unnamed: 0,Year,Session,ISO Code,Country,Name of Person Speaking,Post,Population,TFR,HDI,GDP,Unemployment Rate,Gini,CO2,Democracy Index,Region Name,Sub-region Name
0,2022,77,BRA,Brazil,Jair Bolsonaro,President,215313498.0,1.65,,1.901461e+12,9.461,51.3,2.245458,6.78,Americas,Latin America and the Caribbean
1,2022,77,SEN,Senegal,Macky Sail,President,17316449.0,4.36,,2.537043e+10,3.434,40.3,0.673835,5.72,Africa,Sub-Saharan Africa
2,2022,77,CHL,Chile,Gabriel Boric Font,President,19603733.0,1.74,,2.814772e+11,7.784,47.7,4.304166,8.22,Americas,Latin America and the Caribbean
3,2022,77,JOR,Jordan,Abdullah Ii Ibn Al Hussein,King,11285869.0,3.02,,4.311994e+10,17.874,33.7,2.030201,3.17,Asia,Western Asia
4,2022,77,COL,Colombia,Gustavo Petro Urrego,President,51874024.0,1.75,,3.557413e+11,10.726,50.8,1.922308,6.72,Americas,Latin America and the Caribbean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10676,2023,78,KAZ,Kazakhstan,Kassym-Jomart Tokayev,President,19606633.0,2.45,,,4.991,26.9,,,Asia,Central Asia
10677,2023,78,IRN,Iran,Seyyed Ebrahim Raisi,President,89172767.0,1.52,,,11.098,38.8,,,Asia,Southern Asia
10678,2023,78,KGZ,Kyrgyz Republic,Sadyr Zhaparov,President,6735347.0,2.73,,,4.762,26.8,,,Asia,Central Asia
10679,2023,78,DEU,Germany,Olaf Scholz,Chancellor,83294633.0,1.51,,,3.258,31.7,,,Europe,Western Europe


In [5]:
original_base_metadata = pd.read_excel(os.path.join("metadata_sources", "base_metadata.xlsx"))
original_base_metadata

Unnamed: 0,Year,Session,ISO Code,Country,Name of Person Speaking,Post,Unnamed: 6
0,2022,77,BRA,Brazil,Jair Bolsonaro,President,
1,2022,77,SEN,Senegal,Macky Sail,President,
2,2022,77,CHL,Chile,Gabriel Boric Font,President,
3,2022,77,JOR,Jordan,Abdullah II ibn Al Hussein,King,
4,2022,77,COL,Colombia,Gustavo Petro Urrego,President,
...,...,...,...,...,...,...,...
10554,1946,1,USA,United States of America,Mr. Austin,,
10555,1946,1,IRN,Iran,Mr. Entezam,,
10556,1946,1,URY,Uruguay,Mr. Blanco,,
10557,1946,1,LUX,Luxembourg,Mr. Bech,,


In [6]:
new_speeches_metadata = pd.read_csv(os.path.join("metadata_sources", "attributes.csv"))
new_speeches_metadata

Unnamed: 0,pdf_name,speaker_name,speaker_position
0,ni_en.pdf,Denis Ronaldo Moncada Colindres,Minister for Foreign Affairs
1,in_en.pdf,Subrahmanyam Jaishankar,Minister for External Affairs
2,jm_en.pdf,Kamina Johnson Smith,Minister for Foreign Affairs and Foreign Trade
3,bt_en.pdf,Tandi Dorji,Minister for Foreign Affairs
4,zm_en.pdf,Stanley Kakubo,Minister for Foreign Affairs
...,...,...,...
117,kz_en.pdf,Kassym-Jomart Tokayev,President
118,ir_en.pdf,Seyyed Ebrahim Raisi,President
119,kg_en.pdf,Sadyr Zhaparov,President
120,de_en.pdf,Olaf Scholz,Chancellor


In [7]:
metadata_part1 = metadata.loc[metadata["Year"] < 2023]
metadata_part2 = metadata.loc[metadata["Year"] == 2023]

In [8]:
assert len(metadata_part1) == len(original_base_metadata)
assert np.all(metadata_part1["Year"] == original_base_metadata["Year"])
assert np.all(metadata_part1["Session"] == original_base_metadata["Session"])

In [9]:
assert len(metadata_part2) == len(new_speeches_metadata)
assert np.all(metadata_part2["Name of Person Speaking"].values == new_speeches_metadata["speaker_name"].str.title().values)

In [10]:
dir_path = (
    "Session "
    + original_base_metadata["Session"].astype(str).str.zfill(2)
    + " - "
    + original_base_metadata["Year"].astype(str)
)

file_path_original = (
    original_base_metadata["ISO Code"].astype(str)
    + "_"
    + original_base_metadata["Session"].astype(str).str.zfill(2)
    + "_"
    + original_base_metadata["Year"].astype(str)
    + ".txt"
)

file_path_improved = (
    metadata_part1["ISO Code"].astype(str)
    + "_"
    + metadata_part1["Session"].astype(str).str.zfill(2)
    + "_"
    + metadata_part1["Year"].astype(str)
    + ".txt"
)

metadata_part1["text_path"] = list(map(os.path.join, dir_path, file_path_original))
text_path_improved = pd.Series(map(os.path.join, dir_path, file_path_improved))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_part1["text_path"] = list(map(os.path.join, dir_path, file_path_original))


In [11]:
metadata_part1.loc[~metadata_part1["text_path"].isin(txt_files), "text_path"]

504       Session 75 - 2020/POR_75_2020.txt
1020      Session 72 - 2017/POR_72_2017.txt
1140      Session 72 - 2017/PKR_72_2017.txt
1186      Session 71 - 2016/ZFA_71_2016.txt
1191      Session 71 - 2016/POR_71_2016.txt
1395      Session 70 - 2015/CMR_70_2015.txt
1514      Session 70 - 2015/SGP_70_2015.txt
1545      Session 70 - 2015/UZB_70_2015.txt
1799       Session 68 - 2013/EC_68_2013.txt
2188       Session 66 - 2011/EC_66_2011.txt
4074      Session 56 - 2001/CHE_56_2001.txt
5510      Session 48 - 1993/CSK_48_1993.txt
6154      Session 44 - 1989/BTN_44_1989.txt
6249      Session 44 - 1989/PSE_44_1989.txt
6282     Session 44 - 1989/YDYE_44_1989.txt
6436     Session 43 - 1988/YDYE_43_1988.txt
6557      Session 42 - 1987/PSE_42_1987.txt
6589     Session 42 - 1987/YDYE_42_1987.txt
6594      Session 42 - 1987/nan_42_1987.txt
6739     Session 41 - 1986/YDYE_41_1986.txt
6833      Session 40 - 1985/NIC_40_1985.txt
6878     Session 40 - 1985/YDYE_40_1985.txt
7027     Session 39 - 1984/YDYE_

In [12]:
metadata_part1.loc[
    (~metadata_part1["text_path"].isin(txt_files)) & (~text_path_improved.str.contains("not found")), "text_path"
] = text_path_improved[(~metadata_part1["text_path"].isin(txt_files)) & (~text_path_improved.str.contains("not found"))]

In [13]:
metadata_part1

Unnamed: 0,Year,Session,ISO Code,Country,Name of Person Speaking,Post,Population,TFR,HDI,GDP,Unemployment Rate,Gini,CO2,Democracy Index,Region Name,Sub-region Name,text_path
0,2022,77,BRA,Brazil,Jair Bolsonaro,President,215313498.0,1.65,,1.901461e+12,9.461,51.3,2.245458,6.78,Americas,Latin America and the Caribbean,Session 77 - 2022/BRA_77_2022.txt
1,2022,77,SEN,Senegal,Macky Sail,President,17316449.0,4.36,,2.537043e+10,3.434,40.3,0.673835,5.72,Africa,Sub-Saharan Africa,Session 77 - 2022/SEN_77_2022.txt
2,2022,77,CHL,Chile,Gabriel Boric Font,President,19603733.0,1.74,,2.814772e+11,7.784,47.7,4.304166,8.22,Americas,Latin America and the Caribbean,Session 77 - 2022/CHL_77_2022.txt
3,2022,77,JOR,Jordan,Abdullah Ii Ibn Al Hussein,King,11285869.0,3.02,,4.311994e+10,17.874,33.7,2.030201,3.17,Asia,Western Asia,Session 77 - 2022/JOR_77_2022.txt
4,2022,77,COL,Colombia,Gustavo Petro Urrego,President,51874024.0,1.75,,3.557413e+11,10.726,50.8,1.922308,6.72,Americas,Latin America and the Caribbean,Session 77 - 2022/COL_77_2022.txt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10554,1946,1,USA,United States,Mr. Austin,,140031458.0,2.83,,,,38.9,15.904564,,Americas,Northern America,Session 01 - 1946/USA_01_1946.txt
10555,1946,1,IRN,Iran,Mr. Entezam,,16071828.0,6.94,,,,46.7,,,Asia,Southern Asia,Session 01 - 1946/IRN_01_1946.txt
10556,1946,1,URY,Uruguay,Mr. Blanco,,2133431.0,2.69,,,,54.7,,,Americas,Latin America and the Caribbean,Session 01 - 1946/URY_01_1946.txt
10557,1946,1,LUX,Luxembourg,Mr. Bech,,297340.0,2.33,,,,28.9,,,Europe,Western Europe,Session 01 - 1946/LUX_01_1946.txt


In [14]:
dir_path = "Session " + metadata_part2["Session"].astype(str).str.zfill(2) + " - " + metadata_part2["Year"].astype(str)

file_path = (
    metadata_part2["ISO Code"].astype(str)
    + "_"
    + metadata_part2["Session"].astype(str).str.zfill(2)
    + "_"
    + metadata_part2["Year"].astype(str)
    + ".txt"
)

metadata_part2["text_path"] = list(map(os.path.join, dir_path, file_path))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_part2["text_path"] = list(map(os.path.join, dir_path, file_path))


In [15]:
metadata = pd.concat([metadata_part1, metadata_part2], axis=0)

In [16]:
metadata = metadata.loc[metadata["text_path"].isin(txt_files)]

In [17]:
metadata = metadata.sort_values(
    by=["Year", "Session", "Country", "Name of Person Speaking"], ascending=[False, True, True, True]
)

In [18]:
metadata = metadata.reset_index(drop=True)

In [19]:
metadata.to_csv("enhanced_metadata.csv", index=False)