## Extracting relevant articles

In [1]:
#2-step keyword extraction
#Text Mining Domains, VU

import pickle
import glob
import pandas as pd
import json
import os
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def new_dict_concat_six(unpickled_dict):
    '''
    Flatten every article record of an unpickled dictionary into one text string.

    Each value is indexed positionally: element [1] is used as the headline and
    element [2] as a list of sentences (the first five, per the corpus preparation).
    The keys are kept as they are; each new value is the headline, followed by '. ',
    followed by the sentences joined with single spaces.
    No keyword filtering happens in this function.

    returns: new dictionary with the same keys and the concatenated
        headline + sentences string as value
    '''
    return {
        identifier: record[1] + '. ' + ' '.join(record[2])
        for identifier, record in unpickled_dict.items()
    }

def no_filter(infolder):
    '''Convert every pickled corpus file matched by a glob pattern to JSON.

    Each matching file is unpickled, flattened with new_dict_concat_six (headline and
    sentences joined into one string per article) and written to
    filteredpickles/unfiltered/<name>.json, where <name> is the basename of the input
    file with a trailing ".gz.pkl" removed.

    infolder: glob pattern selecting the pickle files (it is passed to glob.glob)
    returns: None; the JSON files on disk are the only result
    '''
    outdir = 'filteredpickles/unfiltered'
    suffix = '.gz.pkl'

    for file in glob.glob(infolder):
        basename = os.path.basename(file)
        # str.rstrip(".gz.pkl") removes any trailing run of the characters . g z p k l,
        # which also eats the end of stems such as "..._eng"; drop the exact suffix instead
        if basename.endswith(suffix):
            basename = basename[:-len(suffix)]

        # NOTE: pickle.load can execute arbitrary code - only use on trusted corpus files
        with open(file, 'rb') as infile:
            unpickled_dict = pickle.load(infile)

        # concatenate headline and first sentences into one string per article
        new_dict = new_dict_concat_six(unpickled_dict)

        # exist_ok makes the call idempotent, so no separate existence check is needed
        os.makedirs(outdir, exist_ok = True)

        # JSON instead of pickle so the articles can be inspected in an IDE
        with open(f'{outdir}/{basename}.json', 'w', encoding = 'utf-8') as outfile:
            json.dump(new_dict, outfile)
                  
def keyword_search(keywords, big_df, treshold = 0):
    '''Select the articles that mention enough members of a (hand-crafted) keyword list.

    A keyword counts as found when it occurs as a case-sensitive substring of the
    article text; every keyword is counted at most once per article.

    keywords: iterable of keyword strings
    big_df: dataframe with a 'text' column; its index labels are used as identifiers
    treshold: minimum number of distinct keywords an article must contain
        (the default of 0 keeps every article)
    returns: dataframe with the columns 'identifier', 'text' and 'keywords' (the set
        of keywords found), one row per selected article. The columns are present
        even when no article qualifies.
    '''
    columns = ['identifier', 'text', 'keywords']
    article_dictlist = []
    # pair each text with its own index label instead of maintaining a manual counter
    for identifier, text in zip(big_df.index, big_df['text']):
        keywords_found = {keyword for keyword in keywords if keyword in text}
        if len(keywords_found) >= treshold:
            article_dictlist.append(
                {'identifier' : identifier, 'text' : text, 'keywords' : keywords_found})
    # naming the columns keeps the schema stable for an empty result, so a later
    # set_index('identifier') does not fail when nothing was selected
    return pd.DataFrame(article_dictlist, columns = columns)

In [2]:
#create json files for all subcorpora
#needs to be run once on your system; define infolder first as a glob pattern matching the pickled corpus files
#no_filter(infolder)

## Retrieving Tf-idf scores for keywords

In [3]:
#Retrieve random articles to balance tf-idf scores for keywords

def random_sample(n, n_files = 50, path = "filteredpickles/unfiltered"):
    '''Returns a random sample, containing n articles.

    n_files file names are drawn (with replacement) from the directory path. Each
    drawn JSON file is loaded into a dataframe with a single 'text' column indexed
    by the JSON keys, the frames are concatenated, and n rows are sampled from the
    result without replacement.

    n: number of articles to return
    n_files: number of file draws that make up the sampling pool (default 50)
    path: directory containing the JSON files (default "filteredpickles/unfiltered")
    returns: dataframe with n rows and one column, 'text'
    raises: ValueError from DataFrame.sample when the pool holds fewer than n rows;
        IndexError from random.choice when path contains no files
    '''
    # list the directory once instead of on every draw
    filenames = os.listdir(path)
    df_list = []
    for _ in range(n_files):
        filename = random.choice(filenames)
        # the context manager closes the handle as soon as the file has been parsed
        with open(os.path.join(path, filename), 'rb') as infile:
            jsonners = json.load(infile)
        df_list.append(pd.DataFrame.from_dict(jsonners, orient = 'index', columns = ['text']))
    random_df = pd.concat(df_list)
    # returned directly: a local called random_sample would shadow this function
    return random_df.sample(n = n)

In [4]:
#create json files for all subcorpora, needed for random sampling
#no_filter(infolder)

Compute tf-idf scores for keywords in relevant articles

In [5]:
def compute_tf_idf(articles, keywordlist, n_random = 30000):
    '''Score every article by the summed tf-idf weight of the keywords it contains.

    The selected articles are mixed with n_random randomly sampled articles (drawn
    with random_sample) before the vectorizer is fitted, so that document frequencies
    are not estimated on keyword-selected texts only. A TfidfVectorizer over unigrams
    and bigrams, with NLTK's English stop words removed, is fitted on the mixed texts.
    Only keywords that occur in the fitted vocabulary can contribute to a score; the
    list of those keywords is printed.

    articles: dataframe with at least the columns 'identifier' and 'text'
    keywordlist: iterable of keyword strings
    n_random: number of random background articles to mix in (default 30000)
    returns: articles indexed by 'identifier', with an added 'Score' column and
        sorted by Score in descending order
    '''
    import numpy as np  # local import: only needed to flatten the row sums below

    random_df = random_sample(n_random)
    # the selected articles come first, so the first len(articles) rows of the
    # tf-idf matrix belong to them
    mixed_texts = pd.concat([articles['text'], random_df['text']])

    # recent scikit-learn releases validate stop_words and reject a set; a list is
    # accepted by old and new versions alike
    stop_words = list(stopwords.words('english'))
    #train and apply tfidf
    #may take a while
    vectorizer = TfidfVectorizer(stop_words = stop_words, ngram_range = (1,2))
    tf_idf_vecs = vectorizer.fit_transform(mixed_texts)

    # vocabulary_ maps each term to its column index: membership tests are O(1) and
    # it avoids get_feature_names(), which newer scikit-learn versions no longer provide
    vocabulary = vectorizer.vocabulary_
    keywords_corpus = [keyword for keyword in keywordlist if keyword in vocabulary]
    print('keywords used:', keywords_corpus)

    #last step: get tfidf scores for keywords in selected articles
    n_articles = len(articles)
    if keywords_corpus:
        keyword_columns = [vocabulary[keyword] for keyword in keywords_corpus]
        # tf-idf weights are never negative, so summing the keyword columns of a row
        # equals adding up the positive keyword scores one by one; slicing the sparse
        # matrix avoids building a dense frame over the whole vocabulary per article
        keyword_scores = tf_idf_vecs[:n_articles][:, keyword_columns]
        doc_scores = np.asarray(keyword_scores.sum(axis = 1)).ravel()
    else:
        doc_scores = np.zeros(n_articles)

    # assign attaches the scores by position, so the result does not depend on the
    # articles index being 0..n-1 (an index-aligned concat would misalign otherwise)
    articles = articles.assign(Score = doc_scores)
    articles = articles.set_index('identifier')
    articles = articles.sort_values(by="Score", ascending = False)
    return articles

In [6]:
def keyword_search_tf_idf(topic, treshold):
    '''Run the full two-step extraction for one topic: keyword search, then tf-idf ranking.

    1. reads the keyword list from keywords/<topic>.txt (one keyword per line)
    2. runs keyword_search on every file in filteredpickles/unfiltered and writes the
       combined hits to results/keyword_lookup_<topic>_t<treshold>.csv
    3. scores the hits with compute_tf_idf and writes the ranked table to
       results/keyword_lookup_tfidf_<topic>_t<treshold>_bigrams.csv

    topic: name of the keyword file (without .txt); also used in the output file names
    treshold: minimum number of distinct keywords an article must contain
    returns: the ranked dataframe produced by compute_tf_idf
    '''
    #retrieve keyword list
    with open(f'keywords/{topic}.txt', 'r', encoding = 'utf-8') as infile:
        # skip blank lines: an empty keyword is a substring of every text and would
        # count as a hit in every article
        keywords = [line for line in infile.read().splitlines() if line.strip()]

    #scrape relevant articles (keyword search)
    df_list = []
    # sorted: glob yields files in arbitrary order, which would make the row order of
    # the lookup csv differ between runs and systems
    for file in sorted(glob.glob('filteredpickles/unfiltered/*')):
        with open(file, 'rb') as infile:
            jsonners = json.load(infile)
        df = pd.DataFrame.from_dict(jsonners, orient='index', columns= ['text'])
        relevant_articles = keyword_search(keywords, df, treshold = treshold)
        df_list.append(relevant_articles)
    #concatenate all relevant articles into one dataframe
    big_df = pd.concat(df_list, ignore_index = True)

    # make sure the output directory exists before the first write
    os.makedirs('results', exist_ok = True)
    #write search results to a csv file
    with open(f'results/keyword_lookup_{topic}_t{treshold}.csv', 'w', encoding = 'utf-8') as outfile:
        outfile.write(big_df.to_csv())

    #compute tf-idf scores of keywords in relevant articles
    articles = compute_tf_idf(big_df, keywords)
    with open(f'results/keyword_lookup_tfidf_{topic}_t{treshold}_bigrams.csv', 'w', encoding = 'utf-8') as outfile:
        outfile.write(articles.to_csv())

    return articles

Keyword search and TF-IDF-based ranking

In [7]:
# Rank articles for the 'sustainable consumption' keyword list;
# an article must contain at least 3 distinct keywords to be selected.
topic, treshold = 'sustainable consumption', 3
articles = keyword_search_tf_idf(topic = topic, treshold = treshold)
articles

keywords used: ['sustainable consumption', 'sustainable production', 'efficiency', 'waste management', 'recycling', 'reuse', 'sustainability', 'procurement', 'awareness', 'sustainable tourism', 'local culture', 'local products', 'local production']


Unnamed: 0_level_0,text,keywords,Score
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CNA_ENG_20080516.0038,Taiwan's recycling shines on international sta...,"{efficiency, recycling, awareness}",0.617393
NYT_ENG_20020312.0012,"IN NEW YORK CITY, DOING WELL TRUMPS DOING GOOD...","{awareness, reuse, recycling}",0.445192
XIN_ENG_20050112.0044,HK to build Recovery Park for recycling indust...,"{procurement, reuse, recycling}",0.426368
XIN_ENG_20041030.0279,Clean-up campaign launched in Kathmandu Valley...,"{waste management, awareness, reuse, recycling}",0.385239
XIN_ENG_20070607.0278,IDB approves 60 million-dollar loan to attract...,"{waste management, awareness, sustainable tour...",0.349732
XIN_ENG_20021003.0434,Thailand supports local production of aromatic...,"{awareness, local products, local production}",0.299095
WPB_ENG_20100525.0072,Wal-Mart's `Green' Muscle Fuels Unruh's Goal o...,"{efficiency, sustainability, sustainable produ...",0.267863
XIN_ENG_20080627.0103,Feature: World's tourism industry grapples wit...,"{sustainable tourism, local culture, sustainab...",0.267011
XIN_ENG_20101106.0059,World dairy summit in New Zealand eyes sustain...,"{sustainability, reuse, recycling}",0.261911
XIN_ENG_20101013.0464,EU calls for cooperation with Asia for SMEs su...,"{sustainable consumption, awareness, sustainab...",0.261024


In [8]:
# Rank articles for the 'gender equality' keyword list;
# an article must contain at least 3 distinct keywords to be selected.
topic, treshold = 'gender equality', 3
articles = keyword_search_tf_idf(topic = topic, treshold = treshold)
articles

keywords used: ['gender equality', 'discrimination', 'violence', 'trafficking', 'exploitation', 'sexual exploitation', 'forced marriage', 'genital mutilation', 'domestic work', 'public services', 'shared responsibility', 'participation', 'equal opportunities', 'leadership']


Unnamed: 0_level_0,text,keywords,Score
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XIN_ENG_20020918.0091,Cambodia to enhance law enforcement against tr...,"{trafficking, sexual exploitation, exploitation}",0.585917
AFP_ENG_20051026.0765,US official slams UN on troops' child sex abus...,"{trafficking, sexual exploitation, exploitation}",0.563182
XIN_ENG_20030612.0151,US tags Philippines as main source of human tr...,"{trafficking, sexual exploitation, exploitation}",0.562675
AFP_ENG_20011220.0236,Youth speak out against sexual exploitation. Y...,"{sexual exploitation, discrimination, exploita...",0.534778
XIN_ENG_20011009.0137,"Cambodia Combats Trafficking, Sexual Exploitat...","{trafficking, sexual exploitation, exploitation}",0.525071
...,...,...,...
APW_ENG_20070601.1127,Philadelphia council says city can end Boy Sco...,"{participation, discrimination, leadership}",0.065091
LTW_ENG_19970208.0021,U.S. Revises Plan for All-African Military For...,"{decision-making, violence, participation}",0.063783
XIN_ENG_20030725.0130,Program of Action for Sustainable Development ...,"{public services, participation, decision-making}",0.062764
NYT_ENG_19990604.0340,EDITORIAL: EUROPE'S NEW MILITARY ASPIRATIONS. ...,"{participation, leadership, decision-making}",0.060213


In [9]:
# Rank articles for the 'poverty' keyword list; this list is longer, so an
# article must contain at least 5 distinct keywords to be selected.
topic, treshold = 'poverty', 5
articles = keyword_search_tf_idf(topic = topic, treshold = treshold)
articles

keywords used: ['poverty', 'social protection', 'protection systems', 'protection measures', 'vulnerable', 'equal rights', 'economic resources', 'poor', 'ownership', 'control', 'property', 'resilience', 'exposure', 'shocks', 'disasters', 'mobilization', 'development cooperation', 'adequate means', 'poverty eradication', 'human rights', 'working conditions', 'inhuman', 'workers', 'fair wage', 'extortion', 'slavery', 'famine', 'minimum wage', 'child labor', 'hunger', 'entrepreneur', 'free education', 'low wage', 'development', 'social security', 'natural disaster']


Unnamed: 0_level_0,text,keywords,Score
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XIN_ENG_20070628.0209,Officials say relocation only effective remedy...,"{workers, poverty eradication, poverty, develo...",0.656799
XIN_ENG_20071123.0250,Niger hosts UN strategic meeting on poverty er...,"{mobilization, poverty eradication, poverty, d...",0.616652
APW_ENG_19960920.0621,"Agricultural Workers Get Poor Pay, Poor Protec...","{social protection, workers, working condition...",0.604646
XIN_ENG_20081209.0269,Full Text: Wang Chen: China Registers Historic...,"{control, poverty, hunger, development, poor, ...",0.595886
APW_ENG_19960920.0682,"FOR RELEASE AT 0100 GMT MONDAY, TIME SET BY SO...","{social protection, workers, working condition...",0.594786
...,...,...,...
NYT_ENG_19990709.0227,UNDATED: productivity increases.. ``When there...,"{control, entrepreneur, poverty, development, ...",0.077821
NYT_ENG_20051003.0237,GAP WIDENS BETWEEN HAVES AND HAVE-NOTS. The in...,"{workers, control, entrepreneur, development, ...",0.073776
APW_ENG_20031209.0770,Recent winners of the Nobel Memorial Prize in ...,"{famine, control, property, development, poverty}",0.071433
XIN_ENG_20090511.0238,Full text: China's Actions for Disaster Preven...,"{disasters, workers, control, development, nat...",0.069701
