<a href="https://colab.research.google.com/github/gylam/siads696-sprsum2024-team05/blob/main/RP_StratchPad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#relevant links
# https://reliefweb.int/updates
# https://apidoc.reliefweb.int/

import requests
import pandas as pd
import numpy as np
import time

In [None]:
def _fetch_report_page(limit, offset, max_retries, timeout):
    """Request one page of reports; return the list under "data", or None if every attempt fails."""
    # Passing the query as a dict keeps newlines/indentation out of the URL.
    query = {
        "appname": "rwint-user-0",
        "profile": "full",
        "filter[field]": "language",
        "filter[value]": "English",
        "slim": 1,
        "limit": limit,
        "offset": offset,
    }
    for attempt in range(1, max_retries + 1):
        try:
            reply = requests.get("https://api.reliefweb.int/v1/reports", params=query, timeout=timeout)
            reply.raise_for_status()
            return reply.json()["data"]
        except (requests.RequestException, ValueError, KeyError):
            # ValueError: body was not JSON; KeyError: payload had no "data" entry.
            if attempt < max_retries:
                time.sleep(min(2 ** attempt, 30))  # back off instead of hammering the API
    return None


def _report_to_row(report):
    """Convert one API record into a row dict; return None when it has no theme label."""
    fields = report.get("fields") or {}
    try:
        # An article may carry several themes: one dictionary per theme.
        themes = [theme["name"] for theme in fields["theme"]]
    except (KeyError, TypeError):
        # No label for supervised learning, so the article is discarded.
        return None

    title = fields.get("title")
    body = fields.get("body")
    present_text = [str(part) for part in (title, body) if part is not None]
    return {
        "id": report["id"],
        "title": np.nan if title is None else str(title),  # NaN when the article has no title
        "body": np.nan if body is None else str(body),  # NaN when no body was provided
        "themes": themes,
        # Space-separated so the last title word does not fuse with the first body word;
        # missing parts are skipped rather than contributing the literal text "nan".
        "combined_text": " ".join(present_text),
    }


def data_extraction(limit=400, max_offset=200000, max_retries=5, timeout=60):
    """Download English ReliefWeb reports page by page and keep the ones that have themes.

    Pages are requested at offsets 0, limit, 2*limit, ... while offset <= max_offset.
    The defaults reproduce the pilot pull of roughly 200K articles.

    Parameters
    ----------
    limit : int
        Number of reports requested per API call. Must be positive.
    max_offset : int
        Last offset (inclusive) that is still requested.
    max_retries : int
        Attempts per page before the extraction gives up on that page.
    timeout : float
        Per-request timeout in seconds, passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``body``, ``themes`` (list of theme names) and
        ``combined_text`` (title and body joined by a space), with a unique 0..n-1 index.
        Reports without a theme are dropped.

    Raises
    ------
    ValueError
        If ``limit`` is not positive (the offset would never advance).

    Notes
    -----
    If a page still fails after ``max_retries`` attempts, a warning is issued and the rows
    collected so far are returned. An empty page also ends the extraction early.
    NOTE(review): the same id can appear on more than one page; this function does not
    deduplicate - confirm whether that should happen downstream.
    """
    import warnings

    if limit <= 0:
        raise ValueError("limit must be a positive integer")

    rows = []
    offset = 0
    while offset <= max_offset:
        page = _fetch_report_page(limit, offset, max_retries, timeout)
        if page is None:
            warnings.warn(
                f"ReliefWeb request at offset {offset} failed {max_retries} time(s); "
                "returning the rows collected so far."
            )
            break
        if not page:
            # Nothing left to download.
            break

        for report in page:
            row = _report_to_row(report)
            if row is not None:
                rows.append(row)
        offset += limit

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=["id", "title", "body", "themes", "combined_text"])

# Run the pilot extraction, then report the wall-clock duration (seconds) and the frame size.
start_time = time.time()
df = data_extraction()
end_time = time.time()

elapsed_seconds = end_time - start_time
print(elapsed_seconds)
print(df.shape)

# Preview the first rows of the extracted articles.
df.head()

292.6811957359314
(138585, 5)


Unnamed: 0,id,title,body,themes,combined_text
0,3914083,The H2H Network activates its fund mechanism t...,Humanitarian needs have never been higher. Inn...,[Humanitarian Financing],The H2H Network activates its fund mechanism t...
1,3913994,Reaching the final straw: Shedding light on al...,**EXECUTIVE SUMMARY**\n\nAlmost 12 years since...,"[Education, Food and Nutrition, Health, Protec...",Reaching the final straw: Shedding light on al...
2,3913981,UN High Commissioner for Human Rights Volker T...,"Kyiv, 7 December 2022\n\nGood afternoon and th...",[Protection and Human Rights],UN High Commissioner for Human Rights Volker T...
3,3913926,We Went Up to the Mountains: The Return of Dis...,It’s Monday afternoon and the school bell anno...,"[Education, Protection and Human Rights]",We Went Up to the Mountains: The Return of Dis...
4,3913806,UNICEF hands over 110 child-friendly classroom...,"*Over 8,000 Bangladeshi children in Cox’s Baza...","[Education, Water Sanitation Hygiene]",UNICEF hands over 110 child-friendly classroom...


In [None]:
# Share of distinct article ids among the extracted rows, followed by missing values per column.
row_count = df.shape[0]
distinct_ids = df["id"].nunique()
print(f"{round(distinct_ids/row_count*100, 2)}% unique articles")
df.isna().sum()

74.14% unique articles


id                   0
title                0
body             10941
themes               0
combined_text        0
dtype: int64

In [None]:
# Pilot bag-of-words features with CountVectorizer; performance with other vectorizers
# still needs to be checked.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(
    lowercase=True,
    min_df=5,
    stop_words="english",  # built-in list only: typos and foreign-language words are not removed
    # Keep purely alphabetic tokens of at least two letters. The trailing \b rejects the
    # alphabetic prefix of alphanumeric words (e.g. "covid" out of "covid19"), and {2,}
    # drops stray single letters such as the "s" left behind by possessives.
    # Typos are still not excluded.
    token_pattern=r"\b[a-zA-Z]{2,}\b",
    # TODO: do we want to include n-grams? e.g. ngram_range=(1, 2)
)

# fit_transform learns the vocabulary and builds the matrix in a single pass over the corpus.
df_cv = cv.fit_transform(df.combined_text)  # returns sparse matrix
print(df_cv.shape)

cv.get_feature_names_out()[:20] #[200:250]
# TODO: how to exclude misspelt words and repeated letters (e.g. "aaa")?

(138585, 75151)


array(['aa', 'aaa', 'aab', 'aac', 'aad', 'aadchit', 'aadjibade', 'aadmer',
       'aaf', 'aag', 'aah', 'aahi', 'aai', 'aairs', 'aaj', 'aal', 'aale',
       'aaley', 'aalisha', 'aalmsri'], dtype=object)

In [None]:
# Testing TfidfVectorizer with the same tokenization settings as the CountVectorizer pilot.

from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(
    lowercase=True,
    min_df=5,
    stop_words="english",  # built-in list only: typos and foreign-language words are not removed
    # Keep purely alphabetic tokens of at least two letters. The trailing \b rejects the
    # alphabetic prefix of alphanumeric words (e.g. "covid" out of "covid19"), and {2,}
    # drops stray single letters such as the "s" left behind by possessives.
    # Typos are still not excluded.
    token_pattern=r"\b[a-zA-Z]{2,}\b",
    # TODO: do we want to include n-grams? e.g. ngram_range=(1, 2)
)

# fit_transform learns vocabulary and idf weights and builds the matrix in a single pass.
df_tf = tf.fit_transform(df.combined_text)  # returns sparse matrix
print(df_tf.shape)

tf.get_feature_names_out()[:20] #[200:250]
# TODO: how to exclude misspelt words and repeated letters (e.g. "aaa")?

(138585, 75151)


array(['aa', 'aaa', 'aab', 'aac', 'aad', 'aadchit', 'aadjibade', 'aadmer',
       'aaf', 'aag', 'aah', 'aahi', 'aai', 'aairs', 'aaj', 'aal', 'aale',
       'aaley', 'aalisha', 'aalmsri'], dtype=object)

In [None]:
# Unsupervised learning - topic extraction on the climate-themed articles.

# Create the climate subset (author's note: ~3000 articles, a subset of the 200K pilot pull).
# These steps were previously commented out, so df_unsup_cv was undefined on a fresh kernel.
df["climate_theme_flag"] = df.themes.apply(lambda x: "Climate Change and Environment" in x)
df_unsup = df[df["climate_theme_flag"]]

# Vectorize with the CountVectorizer already fitted on the full corpus, so the columns of
# df_unsup_cv line up with cv.get_feature_names_out() below.
df_unsup_cv = cv.transform(df_unsup.combined_text)

# LDA topic extraction with 10 topics; random_state makes the fitted topics repeatable.
from sklearn.decomposition import NMF, LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components = 10, random_state = 0).fit(df_unsup_cv)

# Term-by-topic table: one row per vocabulary term, one column per topic
# (columns are named latentdirichletallocation0 ... latentdirichletallocation9).
topic_term_weights = pd.DataFrame(
    lda.components_.T,
    index = cv.get_feature_names_out(),
    columns = lda.get_feature_names_out(),
)
# The following cells read this table through `_`, so that name is kept as an alias.
_ = topic_term_weights


In [None]:
# Top words for one topic - change `topic_column` to inspect a different topic.
topic_column = "latentdirichletallocation1"
ranked_terms = _.sort_values(by=topic_column, ascending=False)
ranked_terms.head(20)

# Observations:
# 1. Significant word cleaning is still needed.
# 2. We should probably remove "climate" and "change", as they are not adding any new
#    information to the topics.

Unnamed: 0,latentdirichletallocation0,latentdirichletallocation1,latentdirichletallocation2,latentdirichletallocation3,latentdirichletallocation4,latentdirichletallocation5,latentdirichletallocation6,latentdirichletallocation7,latentdirichletallocation8,latentdirichletallocation9
women,254.524709,1875.735463,7.188495,70.708567,41.701597,2.295105,80.395042,25.003741,182.975833,241.471447
rights,8.028883,1402.682666,14.750268,266.044787,0.100005,0.100009,25.779905,136.393822,71.377589,12.742064
human,45.921254,931.225318,331.331952,781.896941,216.301419,0.10001,342.697625,42.917611,163.872921,105.73495
gender,64.017591,872.959324,0.100008,37.056546,1.770964,0.160024,20.52902,2.762365,83.656775,133.987384
s,432.45908,490.606088,2389.330851,2271.392847,1793.587062,28.398039,546.513314,442.281797,2890.697938,1450.732984
change,11.765742,488.005095,1357.773354,5084.343579,4560.547419,0.12151,564.202325,139.554065,2272.848387,813.838525
climate,18.073502,459.809284,1556.858355,8459.237945,9500.639227,0.109103,897.739282,283.964674,5598.684072,1178.884555
said,4.524003,433.312247,1351.152809,2562.016899,551.277677,0.100004,259.543769,138.432252,377.258474,443.381866
special,7.566183,329.906244,30.39395,256.56371,55.410761,0.100007,9.997418,0.100023,140.870949,7.090755
girls,70.988184,313.034432,0.100007,12.822656,0.10001,0.100043,0.100008,71.158654,107.248437,3.347569


In [None]:
# Preview the first ten rows of the term-by-topic table.
_.iloc[:10]