In [31]:
import json
import string
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from PIL import Image
from tqdm.notebook import tqdm
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from wordcloud import WordCloud, STOPWORDS
from youtube_transcript_api import YouTubeTranscriptApi

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option("display.max_columns", None)

# Data folder located two levels above the current working directory.
DATA_DIR = Path.cwd().parent.parent / "data"

In [32]:
# CSV export of the Google spreadsheet listing the speeches.
url_un_spreadsheet = 'https://docs.google.com/spreadsheets/d/1qtqfnRSW24j-XLN7SRKywDCuFatARCH8pUg1Rr6I2vI/export?format=csv'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url_un_spreadsheet, timeout=30)
# Stop here on an HTTP error instead of saving the error page as the CSV file.
response.raise_for_status()
with open(DATA_DIR / 'UN Speeches.csv', 'wb') as f:
    f.write(response.content)

# Reload from the saved copy so later cells work from the file on disk.
df_speech_url = pd.read_csv(DATA_DIR / "UN Speeches.csv")

In [33]:
# English stopword list from the NLTK corpus, held as a set for fast membership tests.
stop = set(stopwords.words('english'))
# ASCII punctuation characters that clean() removes from the text.
exclude = set(string.punctuation) 
# WordNet lemmatizer instance used by clean() on every remaining word.
lemma = WordNetLemmatizer()

def clean(doc: str) -> str:
    """Lower-case ``doc``, drop stopwords and punctuation, and lemmatize the rest.

    Stopwords are matched on the raw token, on the token with surrounding
    punctuation trimmed, and on the token with all punctuation removed, so a
    stopword glued to punctuation (``"the,"``, ``"and."``) is filtered too.

    :param doc: Raw text to normalize.
    :return: The remaining lemmatized words joined by single spaces.
    """
    edge_chars = "".join(exclude)
    kept = []
    for token in doc.lower().split():
        # Token with every punctuation character removed.
        bare = "".join(ch for ch in token if ch not in exclude)
        if not bare:
            # The token was made of punctuation only.
            continue
        if token in stop or token.strip(edge_chars) in stop or bare in stop:
            continue
        kept.append(lemma.lemmatize(bare))
    return " ".join(kept)

def get_corpus_from_file(country: str, start: int = 0, end: int = 3600, path: Path = DATA_DIR / "2023") -> str:
    """Load a saved transcript and return the text spoken inside a time window.

    The file ``<path>/<country>.json`` is expected to hold a list of segments,
    each with a ``'text'`` and a ``'start'`` entry.

    :param country: Country name, used as the file stem.
    :param start: Lower bound of the window, compared against each segment's
        ``'start'`` value (inclusive).
    :param end: Upper bound of the window (exclusive).
    :param path: Folder containing the transcript JSON files.
    :return: The text of the selected segments joined by single spaces.
    """
    with open(path / f"{country}.json", encoding="utf-8") as f:
        segments = json.load(f)
    # Half-open window [start, end): a segment starting exactly at `start`
    # (e.g. 0.0 with the default) belongs to the corpus.
    return ' '.join(seg['text'] for seg in segments if start <= seg['start'] < end)

def get_transcript(video_id: str, start: "str | None" = None, end: "str | None" = None) -> "tuple[list[dict] | None, str | None]":
    """Fetch the transcript of a YouTube video.

    :param video_id: Identifier of the video, passed to ``YouTubeTranscriptApi.get_transcript``.
    :param start: Currently unused (see TODO below).
    :param end: Currently unused (see TODO below).
    :return: ``(transcript, text)`` where ``transcript`` is the value returned
        by the API and ``text`` is every segment's ``'text'`` joined by spaces;
        ``(None, None)`` when the transcript could not be retrieved.
    """
    # TODO Add filter with start time and end time, to cut introductions.
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        large_corpus = ' '.join(segment['text'] for segment in transcript)
        return transcript, large_corpus
    except Exception as e:
        # Best effort: report the problem and let the caller skip this video.
        print(e)
        return None, None

def get_video_url(country: str) -> str:
    """Return the video URL recorded for ``country`` in ``df_speech_url``.

    When the row's ``start`` value is a string it is read as ``H:M:S`` and the
    URL is returned with ``&t=<seconds>`` appended; otherwise the URL is
    returned unchanged. The first matching row is used.
    """
    matches = df_speech_url[df_speech_url["country"] == country]
    start_time = matches["start"].values[0]

    # No textual start time recorded: plain URL.
    if not isinstance(start_time, str):
        return matches["url"].values[0]

    hours, minutes, secs = start_time.split(":")
    offset = int(hours) * 3600 + int(minutes) * 60 + int(secs)
    return f'{matches["url"].values[0]}&t={offset}'
    
def save_json(data: dict, country: str, output_path: Path = DATA_DIR / "2023"):
    """Write ``data`` as JSON to ``<output_path>/<country>.json``, replacing any existing file."""
    destination = output_path / f"{country}.json"
    with open(destination, "w") as handle:
        json.dump(data, handle)

def download_speech_transcriptions(df_speech_url: pd.DataFrame, overwrite: bool = False):
    """Download and save the transcript of every speech listed in ``df_speech_url``.

    Each row must provide a ``country`` and a ``url`` value. A country whose
    JSON file already exists under ``DATA_DIR / "2023"`` is skipped unless
    ``overwrite`` is true. Nothing is saved when no transcript is returned.
    """
    progress = tqdm(df_speech_url.iterrows(), total=len(df_speech_url))
    for _, row in progress:
        country = row['country']
        progress.set_description(country)

        already_saved = (DATA_DIR / "2023" / f"{country}.json").is_file()
        if already_saved and not overwrite:
            continue

        # The video id is whatever follows '?v=' in the URL.
        video_id = row['url'].split('?v=')[-1]
        transcript, _ = get_transcript(video_id)
        if transcript:
            save_json(transcript, country)

In [34]:
# Fetch the transcripts that are not on disk yet; files already saved are kept as they are.
download_speech_transcriptions(df_speech_url, overwrite=False)

  0%|          | 0/166 [00:00<?, ?it/s]


Could not retrieve a transcript for the video https://www.youtube.com/watch?v=lqsT_YK5Odo! This is most likely caused by:

Subtitles are disabled for this video

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!


In [None]:
# Generate masks for the countries

# countries shape https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/110m/cultural/ne_110m_admin_0_countries.zip
geo_data = gpd.read_file(DATA_DIR / 'ne_110m_admin_0_countries.zip')

# Make sure the output folder exists so savefig does not fail on a fresh checkout.
masks_dir = DATA_DIR / "masks"
masks_dir.mkdir(parents=True, exist_ok=True)

# Generate masks
pbar = tqdm(df_speech_url.iterrows(), total=len(df_speech_url))
for i, r in pbar:
    pbar.set_description(r['country'])
    # Filter once and reuse the result for both the check and the plot.
    country_shapes = geo_data[geo_data['SOVEREIGNT'] == r['country']]
    if len(country_shapes) > 0:
        ax = country_shapes.plot()
        ax.axis('off')
        ax.figure.savefig(masks_dir / f"{r['country']}.jpg")
        # Release the figure; otherwise one figure per country stays open in memory.
        plt.close(ax.figure)
    else:
        print(f"{r['country']} not found.")

"""
Palau not found.
Monaco not found.
Marshall Islands not found.
Sao Tome & Principe not found.
Comoros not found.
Dominica not found.
Kiribati not found.
Micronesia not found.
Palestine not found.
Nauru not found.
Liechtenstein not found.
Mauritius not found.
Malta not found.
Barbados not found.
Andorra not found.
Saint Lucia not found.
Seychelles not found.
Antigua and Barbuda not found.
Grenada not found.
Tuvalu not found.
Tonga not found.
Singapore not found.
Bahrain not found.
Bahamas not found.
Saint Vincent and Grenadines not found.
Samoa not found.
Saint Kitts and Nevis not found.
Cabo Verde not found.
"""

In [36]:
# geo_data[geo_data['SOV_A3'].isin(['COM'])]
# geo_data['SOVEREIGNT'].unique()
# geo_data['NAME_EN'].unique()
# geo_data[geo_data['SOVEREIGNT']=='United States of America']
# geo_data[geo_data['SOVEREIGNT']=='Libya'].plot()

In [37]:
# Wordcloud generation
# Kept under its own name so the nltk `stopwords` corpus imported at the top is not shadowed.
wordcloud_stopwords = set(STOPWORDS)

# Make sure the output folder exists before writing images into it.
wordclouds_dir = DATA_DIR / "wordclouds"
wordclouds_dir.mkdir(parents=True, exist_ok=True)


def _hms_to_seconds(timestamp: str) -> int:
    """Convert an ``H:M:S`` string into a number of seconds."""
    hours, minutes, seconds = timestamp.split(':')
    return int(hours) * 60 * 60 + int(minutes) * 60 + int(seconds)


pbar = tqdm(df_speech_url.iterrows(), total=len(df_speech_url))
for i, r in pbar:
    pbar.set_description(r['country'])
    try:
        country_mask = np.array(Image.open(DATA_DIR / "masks" / f"{r['country']}.jpg"))
        wc = WordCloud(background_color="white", max_words=2000, mask=country_mask, stopwords=wordcloud_stopwords, contour_width=3, contour_color='steelblue')

        # `r` is a row, so r['start'] / r['end'] are scalars: a string when a
        # time was entered, something else (e.g. NaN) when the cell is empty.
        time_window = {}
        if isinstance(r['start'], str):
            time_window['start'] = _hms_to_seconds(r['start'])
            # `end` is only honoured together with `start`.
            if isinstance(r['end'], str):
                time_window['end'] = _hms_to_seconds(r['end'])
        # NOTE(review): with a start but no end, get_corpus_from_file's default
        # end (3600) applies; a start at or after one hour then yields no text — confirm intent.
        corpus = clean(get_corpus_from_file(r['country'], **time_window))
        wc.generate(corpus)
        wc.to_file(wordclouds_dir / f"{r['country']}_words.png")
    except Exception as e:
        # Still best effort (missing mask or transcript, empty corpus, ...),
        # but say which country was skipped and why instead of hiding it.
        print(f"{r['country']}: wordcloud skipped ({type(e).__name__}: {e})")

  0%|          | 0/166 [00:00<?, ?it/s]

In [38]:
# This is to download the stopwords
# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
# # nltk.download('stopwords')
# nltk.download('wordnet')
# # nltk.download()

In [39]:
# from nltk.draw.dispersion import dispersion_plot
# import matplotlib.pyplot as plt
# plt.style.use('ggplot')

In [40]:
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Draw a lexical dispersion plot: one tick mark for every occurrence of a
    target word, positioned at the token's offset in the text.

    :param text: The tokens to scan, in order
    :type text: list(str) or iter(str)
    :param words: The words to look for; each one gets its own row
    :type words: list of str
    :param ignore_case: compare case-folded forms of tokens and target words
    :type ignore_case: bool
    :param title: title shown above the plot
    :type title: str
    :return: the matplotlib Axes holding the plot, so it can be customized further
    :rtype: Axes
    :raises ImportError: if matplotlib is not installed
    """

    try:
        import matplotlib.pyplot as plt
    except ImportError as e:
        raise ImportError(
            "The plot function requires matplotlib to be installed. "
            "See https://matplotlib.org/"
        ) from e

    def _key(item):
        # Form under which tokens and target words are compared.
        return item.casefold() if ignore_case else item

    # Rows are numbered from the last target word upwards, so the first
    # target word receives the highest row index.
    row_of = {}
    for row, word in enumerate(reversed(words)):
        row_of[_key(word)] = row

    offsets, rows = [], []
    for offset, token in enumerate(text):
        row = row_of.get(_key(token))
        if row is None:
            continue
        offsets.append(offset)
        rows.append(row)

    labels = words[::-1]
    axes = plt.subplots()[1]
    axes.plot(offsets, rows, "|")
    axes.set_yticks(list(range(len(labels))), labels, color="C0")
    axes.set_ylim(-1, len(labels))
    axes.set_title(title)
    axes.set_xlabel("Word Offset")
    return axes

# dispersion_plot(text, [str(w) for w, f in fdist.most_common(10)])

In [41]:
# geo_data[geo_data['ADMIN']=="Chile"][['ADMIN', 'POP_EST', 'POP_RANK', 'GDP_MD', 'ECONOMY', 'INCOME_GRP', 'CONTINENT', 'REGION_UN', 'SUBREGION', 'REGION_WB']]

In [42]:
# Preview the first rows of the country shapes table loaded into geo_data.
geo_data.head()

Unnamed: 0,featurecla,scalerank,LABELRANK,SOVEREIGNT,SOV_A3,ADM0_DIF,LEVEL,TYPE,TLC,ADMIN,ADM0_A3,GEOU_DIF,GEOUNIT,GU_A3,SU_DIF,SUBUNIT,SU_A3,BRK_DIFF,NAME,NAME_LONG,BRK_A3,BRK_NAME,BRK_GROUP,ABBREV,POSTAL,FORMAL_EN,FORMAL_FR,NAME_CIAWF,NOTE_ADM0,NOTE_BRK,NAME_SORT,NAME_ALT,MAPCOLOR7,MAPCOLOR8,MAPCOLOR9,MAPCOLOR13,POP_EST,POP_RANK,POP_YEAR,GDP_MD,GDP_YEAR,ECONOMY,INCOME_GRP,FIPS_10,ISO_A2,ISO_A2_EH,ISO_A3,ISO_A3_EH,ISO_N3,ISO_N3_EH,UN_A3,WB_A2,WB_A3,WOE_ID,WOE_ID_EH,WOE_NOTE,ADM0_ISO,ADM0_DIFF,ADM0_TLC,ADM0_A3_US,ADM0_A3_FR,ADM0_A3_RU,ADM0_A3_ES,ADM0_A3_CN,ADM0_A3_TW,ADM0_A3_IN,ADM0_A3_NP,ADM0_A3_PK,ADM0_A3_DE,ADM0_A3_GB,ADM0_A3_BR,ADM0_A3_IL,ADM0_A3_PS,ADM0_A3_SA,ADM0_A3_EG,ADM0_A3_MA,ADM0_A3_PT,ADM0_A3_AR,ADM0_A3_JP,ADM0_A3_KO,ADM0_A3_VN,ADM0_A3_TR,ADM0_A3_ID,ADM0_A3_PL,ADM0_A3_GR,ADM0_A3_IT,ADM0_A3_NL,ADM0_A3_SE,ADM0_A3_BD,ADM0_A3_UA,ADM0_A3_UN,ADM0_A3_WB,CONTINENT,REGION_UN,SUBREGION,REGION_WB,NAME_LEN,LONG_LEN,ABBREV_LEN,TINY,HOMEPART,MIN_ZOOM,MIN_LABEL,MAX_LABEL,LABEL_X,LABEL_Y,NE_ID,WIKIDATAID,NAME_AR,NAME_BN,NAME_DE,NAME_EN,NAME_ES,NAME_FA,NAME_FR,NAME_EL,NAME_HE,NAME_HI,NAME_HU,NAME_ID,NAME_IT,NAME_JA,NAME_KO,NAME_NL,NAME_PL,NAME_PT,NAME_RU,NAME_SV,NAME_TR,NAME_UK,NAME_UR,NAME_VI,NAME_ZH,NAME_ZHT,FCLASS_ISO,TLC_DIFF,FCLASS_TLC,FCLASS_US,FCLASS_FR,FCLASS_RU,FCLASS_ES,FCLASS_CN,FCLASS_TW,FCLASS_IN,FCLASS_NP,FCLASS_PK,FCLASS_DE,FCLASS_GB,FCLASS_BR,FCLASS_IL,FCLASS_PS,FCLASS_SA,FCLASS_EG,FCLASS_MA,FCLASS_PT,FCLASS_AR,FCLASS_JP,FCLASS_KO,FCLASS_VN,FCLASS_TR,FCLASS_ID,FCLASS_PL,FCLASS_GR,FCLASS_IT,FCLASS_NL,FCLASS_SE,FCLASS_BD,FCLASS_UA,geometry
0,Admin-0 country,1,6,Fiji,FJI,0,2,Sovereign country,1,Fiji,FJI,0,Fiji,FJI,0,Fiji,FJI,0,Fiji,Fiji,FJI,Fiji,,Fiji,FJ,Republic of Fiji,,Fiji,,,Fiji,,5,1,2,2,889953.0,11,2019,5496,2019,6. Developing region,4. Lower middle income,FJ,FJ,FJ,FJI,FJI,242,242,242,FJ,FJI,23424813,23424813,Exact WOE match as country,FJI,,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,FJI,-99,-99,Oceania,Oceania,Melanesia,East Asia & Pacific,4,4,4,-99,1,0.0,3.0,8.0,177.975427,-17.826099,1159320625,Q712,فيجي,ফিজি,Fidschi,Fiji,Fiyi,فیجی,Fidji,Φίτζι,פיג'י,फ़िजी,Fidzsi-szigetek,Fiji,Figi,フィジー,피지,Fiji,Fidżi,Fiji,Фиджи,Fiji,Fiji,Фіджі,فجی,Fiji,斐济,斐濟,Admin-0 country,,Admin-0 country,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,Admin-0 country,1,3,United Republic of Tanzania,TZA,0,2,Sovereign country,1,United Republic of Tanzania,TZA,0,Tanzania,TZA,0,Tanzania,TZA,0,Tanzania,Tanzania,TZA,Tanzania,,Tanz.,TZ,United Republic of Tanzania,,Tanzania,,,Tanzania,,3,6,2,2,58005463.0,16,2019,63177,2019,7. Least developed region,5. Low income,TZ,TZ,TZ,TZA,TZA,834,834,834,TZ,TZA,23424973,23424973,Exact WOE match as country,TZA,,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,TZA,-99,-99,Africa,Africa,Eastern Africa,Sub-Saharan Africa,8,8,5,-99,1,0.0,3.0,8.0,34.959183,-6.051866,1159321337,Q924,تنزانيا,তানজানিয়া,Tansania,Tanzania,Tanzania,تانزانیا,Tanzanie,Τανζανία,טנזניה,तंज़ानिया,Tanzánia,Tanzania,Tanzania,タンザニア,탄자니아,Tanzania,Tanzania,Tanzânia,Танзания,Tanzania,Tanzanya,Танзанія,تنزانیہ,Tanzania,坦桑尼亚,坦尚尼亞,Admin-0 country,,Admin-0 country,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,Admin-0 country,1,7,Western Sahara,SAH,0,2,Indeterminate,1,Western Sahara,SAH,0,Western Sahara,SAH,0,Western Sahara,SAH,1,W. Sahara,Western Sahara,B28,W. Sahara,,W. Sah.,WS,Sahrawi Arab Democratic Republic,,Western Sahara,,Self admin.; Claimed by Morocco,Western Sahara,,4,7,4,4,603253.0,11,2017,907,2007,7. Least developed region,5. Low income,WI,EH,EH,ESH,ESH,732,732,732,-99,-99,23424990,23424990,Exact WOE match as country,B28,,B28,SAH,MAR,SAH,SAH,SAH,SAH,MAR,SAH,SAH,SAH,SAH,SAH,SAH,MAR,MAR,SAH,MAR,SAH,SAH,SAH,SAH,SAH,MAR,MAR,MAR,SAH,SAH,MAR,SAH,SAH,SAH,-99,-99,Africa,Africa,Northern Africa,Middle East & North Africa,9,14,7,-99,1,4.7,6.0,11.0,-12.630304,23.967592,1159321223,Q6250,الصحراء الغربية,পশ্চিম সাহারা,Westsahara,Western Sahara,Sahara Occidental,صحرای غربی,Sahara occidental,Δυτική Σαχάρα,סהרה המערבית,पश्चिमी सहारा,Nyugat-Szahara,Sahara Barat,Sahara Occidentale,西サハラ,서사하라,Westelijke Sahara,Sahara Zachodnia,Sara Ocidental,Западная Сахара,Västsahara,Batı Sahra,Західна Сахара,مغربی صحارا,Tây Sahara,西撒哈拉,西撒哈拉,Admin-0 dependency,,Admin-0 dependency,,Unrecognized,,,,,Unrecognized,,,,,,,Unrecognized,Unrecognized,,Unrecognized,,,,,,Unrecognized,Unrecognized,Unrecognized,,,Unrecognized,,,,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,Admin-0 country,1,2,Canada,CAN,0,2,Sovereign country,1,Canada,CAN,0,Canada,CAN,0,Canada,CAN,0,Canada,Canada,CAN,Canada,,Can.,CA,Canada,,Canada,,,Canada,,6,6,2,2,37589262.0,15,2019,1736425,2019,1. Developed region: G7,1. High income: OECD,CA,CA,CA,CAN,CAN,124,124,124,CA,CAN,23424775,23424775,Exact WOE match as country,CAN,,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,CAN,-99,-99,North America,Americas,Northern America,North America,6,6,4,-99,1,0.0,1.7,5.7,-101.9107,60.324287,1159320467,Q16,كندا,কানাডা,Kanada,Canada,Canadá,کانادا,Canada,Καναδάς,קנדה,कनाडा,Kanada,Kanada,Canada,カナダ,캐나다,Canada,Kanada,Canadá,Канада,Kanada,Kanada,Канада,کینیڈا,Canada,加拿大,加拿大,Admin-0 country,,Admin-0 country,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,Admin-0 country,1,2,United States of America,US1,1,2,Country,1,United States of America,USA,0,United States of America,USA,0,United States,USA,0,United States of America,United States,USA,United States,,U.S.A.,US,United States of America,,United States,,,United States of America,,4,5,1,1,328239523.0,17,2019,21433226,2019,1. Developed region: G7,1. High income: OECD,US,US,US,USA,USA,840,840,840,US,USA,23424977,23424977,Exact WOE match as country,USA,,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA,-99,-99,North America,Americas,Northern America,North America,24,13,6,-99,1,0.0,1.7,5.7,-97.482602,39.538479,1159321369,Q30,الولايات المتحدة,মার্কিন যুক্তরাষ্ট্র,Vereinigte Staaten,United States of America,Estados Unidos,ایالات متحده آمریکا,États-Unis,Ηνωμένες Πολιτείες Αμερικής,ארצות הברית,संयुक्त राज्य अमेरिका,Amerikai Egyesült Államok,Amerika Serikat,Stati Uniti d'America,アメリカ合衆国,미국,Verenigde Staten van Amerika,Stany Zjednoczone,Estados Unidos,США,USA,Amerika Birleşik Devletleri,Сполучені Штати Америки,ریاستہائے متحدہ امریکا,Hoa Kỳ,美国,美國,Admin-0 country,,Admin-0 country,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."
