## Extracting relevant articles

In [1]:
#2-step keyword extraction
#Text Mining Domains, VU

import pickle
import glob
import pandas as pd
import json
import os
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def new_dict_concat_six(unpickled_dict):
    '''
    Merge each article's headline and leading sentences into one string.

    Every value of ``unpickled_dict`` is indexed positionally: element [1] is
    used as the headline and element [2] as a list of sentence strings (the
    first five sentences of the article).

    No keyword filtering happens here; every key of the input is kept.

    returns: new dictionary with the same keys, each mapped to
        "<headline>. <sentence 1> <sentence 2> ..."
    '''
    merged = {}
    for article_id, record in unpickled_dict.items():
        headline, sentences = record[1], record[2]
        merged[article_id] = headline + '. ' + ' '.join(sentences)
    return merged

def no_filter(infolder):
    '''Convert every pickled subcorpus matched by ``infolder`` to a .json file.

    infolder: glob pattern (handed straight to ``glob.glob``) that matches the
        pickle files to convert.

    Each pickle is loaded, flattened with ``new_dict_concat_six`` (headline +
    first sentences per article) and written to
    ``filteredpickles/unfiltered/<name>.json``, where <name> is the pickle's
    base name without a trailing ".gz.pkl".

    Returns nothing; the result is the files written to disk.
    '''
    suffix = '.gz.pkl'
    outfolder = 'filteredpickles/unfiltered'
    os.makedirs(outfolder, exist_ok = True)

    for file in glob.glob(infolder):
        basename = os.path.basename(file)
        # str.rstrip() removes a *set of characters*, not a suffix, so it also
        # eats trailing name characters such as 'g', 'k', 'l', 'p' or 'z'
        # (e.g. 'xin_eng.gz.pkl' -> 'xin_en'). Cut the exact suffix instead.
        if basename.endswith(suffix):
            basename = basename[:-len(suffix)]

        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file; only run this on pickles from a trusted source.
        with open(file, 'rb') as infile:
            unpickled_dict = pickle.load(infile)

        # concat headlines and first_five sentences per article
        new_dict = new_dict_concat_six(unpickled_dict)

        # json rather than pickle so the output can be inspected in an IDE
        with open(f'{outfolder}/{basename}.json', 'w', encoding = 'utf-8') as outfile:
            json.dump(new_dict, outfile)
                  
def keyword_search(keywords, big_df, treshold = 0):
    '''Select the articles that contain at least ``treshold`` distinct keywords.

    keywords: iterable of keyword strings (e.g. a hand-crafted SDG-specific
        list). Matching is a plain, case-sensitive substring test
        (``keyword in text``). Empty / whitespace-only entries are ignored.
    big_df: DataFrame with a 'text' column; its index values are used as the
        article identifiers.
    treshold: minimum number of distinct keywords an article must contain.
        The default of 0 keeps every article.

    Returns a dataframe with the columns 'identifier', 'text' and 'keywords'
    (the set of keywords found), one row per selected article, in input order.
    The columns are present even when no article qualifies.
    '''
    # An empty keyword (e.g. a blank line in the keyword file) is a substring
    # of every text and would count as a hit for every article.
    usable_keywords = [keyword for keyword in keywords if keyword.strip()]

    article_dictlist = []
    # pair each text with its own index label instead of keeping a manual counter
    for identifier, text in zip(big_df.index, big_df['text']):
        keywords_found = {keyword for keyword in usable_keywords if keyword in text}
        if len(keywords_found) >= treshold:
            article_dictlist.append(
                {'identifier' : identifier, 'text' : text, 'keywords' : keywords_found}
            )

    # explicit columns keep the schema stable when article_dictlist is empty
    return pd.DataFrame(article_dictlist, columns = ['identifier', 'text', 'keywords'])

In [12]:
# Create the json files for all subcorpora.
# Needs to be run only once on your system: uncomment the call below after
# setting `infolder` (not defined in the visible cells) to a glob pattern that
# matches the pickle files.
#no_filter(infolder)

# Selects the keyword file keywords/<topic>.txt and is used in the result file names.
topic = 'sustainable consumption'
# Minimum number of distinct keywords an article must contain to be kept.
treshold = 3

In [13]:
# Load the topic's keyword list, one keyword per line. Blank lines are dropped:
# an empty string is a substring of every text and would match every article.
with open(f'keywords/{topic}.txt', 'r', encoding = 'utf-8') as infile:
    keywords = [line for line in infile.read().splitlines() if line.strip()]

df_list = []
# sorted() makes the row order of the result independent of the order in which
# the file system happens to list the files
for file in sorted(glob.glob('filteredpickles/unfiltered/*')):
    # context manager so the handle is closed after each file
    with open(file, 'rb') as jsonfile:
        jsonners = json.load(jsonfile)
    df = pd.DataFrame.from_dict(jsonners, orient='index', columns= ['text'])
    relevant_articles = keyword_search(keywords, df, treshold = treshold)
    df_list.append(relevant_articles)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list
if not df_list:
    raise FileNotFoundError(
        "no files found in 'filteredpickles/unfiltered/' - run no_filter(infolder) first"
    )

#concatenate all relevant articles into one dataframe
big_df = pd.concat(df_list, ignore_index = True)

In [15]:
# Let pandas write the file itself. Pushing the string returned by to_csv()
# through a text-mode handle can translate the line endings a second time on
# Windows (pandas asks for file objects opened with newline='').
os.makedirs('results', exist_ok = True)  # the output folder may not exist yet
big_df.to_csv(f'results/keyword_lookup_{topic}_t{treshold}.csv', encoding = 'utf-8')

## Retrieving Tf-idf scores for keywords

In [5]:
#Retrieve random articles to balance tf-idf scores for keywords

def random_sample(n, n_files = 50, path = "filteredpickles/unfiltered", seed = None):
    '''Draw ``n`` random articles from randomly chosen subcorpus json files.

    n: number of articles (rows) to return.
    n_files: number of distinct files to read before sampling (default 50; all
        files are read when the folder holds fewer).
    path: folder holding the json files written by ``no_filter``.
    seed: optional integer seed; when given, both the file choice and the row
        sample are reproducible. With the default None the module-level
        ``random`` state and pandas' default random state are used.

    Returns a dataframe with a single 'text' column, indexed by the json keys
    (article identifiers).

    Raises FileNotFoundError when ``path`` contains no files, and ValueError
    (from ``DataFrame.sample``) when fewer than ``n`` articles were read.
    '''
    # sorted so that a fixed seed selects the same files on every system
    filenames = sorted(os.listdir(path))
    if not filenames:
        raise FileNotFoundError(f'no files found in {path!r} - run no_filter(infolder) first')

    rng = random if seed is None else random.Random(seed)
    # Pick distinct files: choosing with replacement could load the same file
    # twice, duplicating its articles (and index labels) in the pool.
    chosen = rng.sample(filenames, min(n_files, len(filenames)))

    df_list = []
    for filename in chosen:
        with open(f'{path}/{filename}', 'rb') as infile:
            jsonners = json.load(infile)
        df_list.append(pd.DataFrame.from_dict(jsonners, orient = 'index', columns = ['text']))

    random_df = pd.concat(df_list)
    return random_df.sample(n = n, random_state = seed)

In [6]:
# Create the json files for all subcorpora (required by random_sample); only needs to be run once.
#no_filter(infolder)

In [16]:
# Show the articles selected by the keyword search.
big_df

Unnamed: 0,identifier,text,keywords
0,AFP_ENG_20080809.0576,Olympics: Yao becomes UN environment campaigne...,"{awareness, efficiency, waste management}"
1,APW_ENG_20070905.1543,"Coca-Cola announces new plant, sets 100 percen...","{awareness, recycling, reuse}"
2,CNA_ENG_20050829.0018,EPA TRYING TO TACKLE ILLEGAL FACTORY DUMPING O...,"{efficiency, waste management, recycling}"
3,CNA_ENG_20080424.0039,Government to promote 'green technology' among...,"{efficiency, waste management, recycling}"
4,CNA_ENG_20080516.0038,Taiwan's recycling shines on international sta...,"{awareness, efficiency, recycling}"
5,NYT_ENG_20000204.0266,Mohawk to buy plastic bottles from Coke bottle...,"{procurement, recycling, waste management}"
6,NYT_ENG_20020312.0012,"IN NEW YORK CITY, DOING WELL TRUMPS DOING GOOD...","{awareness, recycling, reuse}"
7,NYT_ENG_20050704.0206,TRASH REMOVAL GOAL MIGHT BE DUMPED. The amount...,"{recycling, reuse, waste management}"
8,NYT_ENG_20070417.0259,WANT EPA HOTEL AND CONVENTION CENTER BUSINESS?...,"{procurement, efficiency, reuse, recycling}"
9,WPB_ENG_20100525.0072,Wal-Mart's `Green' Muscle Fuels Unruh's Goal o...,"{efficiency, sustainable production, sustainab..."


Compute tf-idf scores for keywords in relevant articles

In [8]:
def compute_tf_idf(articles, keywordlist, n_random = 30000):
    '''Score each article by the mean tf-idf weight of the keywords it contains.

    articles: dataframe with at least the columns 'identifier' and 'text'
        (as produced by ``keyword_search``).
    keywordlist: keyword strings. Only keywords that occur as a term in the
        fitted unigram vocabulary are used (they are printed), so multi-word
        keywords are not expected to contribute.
    n_random: number of randomly drawn background articles that are mixed in
        before fitting, to balance the idf statistics (default 30000).

    Returns a copy of ``articles`` with an extra 'Score' column, indexed by
    'identifier' and sorted by descending score. An article's score is the
    average tf-idf weight over the used keywords with a weight above zero, or
    0 when none of them occurs.
    '''
    random_df = random_sample(n_random)
    # The selected articles come first, so rows 0..len(articles)-1 of the
    # tf-idf matrix belong to them.
    mixed_df = pd.concat([articles,random_df])

    # Newer scikit-learn releases validate parameters and accept stop_words
    # only as 'english', a list or None - a set is rejected - so pass a list.
    stop_word_list = list(stopwords.words('english'))
    #train and apply tfidf
    #may take a while
    vectorizer = TfidfVectorizer(stop_words = stop_word_list, ngram_range = (1,1))
    tf_idf_vecs = vectorizer.fit_transform(mixed_df['text'])

    # vocabulary_ maps every term to its column in the tf-idf matrix. Using it
    # avoids get_feature_names() (removed in scikit-learn 1.2) and avoids
    # rebuilding the full feature list for every keyword and every article.
    vocabulary = vectorizer.vocabulary_
    keywords_corpus = [keyword for keyword in keywordlist if keyword in vocabulary]
    print('keywords used:', keywords_corpus)

    # one column per distinct keyword, so a keyword listed twice counts once
    keyword_columns = list(dict.fromkeys(vocabulary[keyword] for keyword in keywords_corpus))

    #last step: get tfidf scores for keywords in selected articles
    doc_scores = []
    if keyword_columns:
        # small dense block: one row per selected article, one column per keyword
        keyword_weights = tf_idf_vecs[:len(articles)][:, keyword_columns].toarray()
        for row in keyword_weights:
            found = row[row > 0]
            doc_scores.append(float(found.sum() / found.size) if found.size else 0)
    else:
        doc_scores = [0] * len(articles)

    # Assign by position: concatenating a separate score frame along axis=1
    # aligns on index labels and misplaces scores unless the index is 0..n-1.
    articles = articles.assign(Score = doc_scores)
    articles = articles.set_index('identifier')
    articles = articles.sort_values(by="Score", ascending = False)
    return articles

In [17]:
# Score the keyword-selected articles. This draws the random background sample
# and fits the tf-idf vectorizer, so it may take a while.
articles = compute_tf_idf(big_df, keywords)

keywords used: ['efficiency', 'recycling', 'reuse', 'sustainability', 'procurement', 'awareness']


In [18]:
# Let pandas write the file itself. Pushing the string returned by to_csv()
# through a text-mode handle can translate the line endings a second time on
# Windows (pandas asks for file objects opened with newline='').
os.makedirs('results', exist_ok = True)  # the output folder may not exist yet
articles.to_csv(f'results/keyword_lookup_tfidf_{topic}_t{treshold}_unigrams.csv', encoding = 'utf-8')

In [19]:
# Show the scored articles, highest tf-idf keyword score first.
articles

Unnamed: 0_level_0,text,keywords,Score
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CNA_ENG_20080516.0038,Taiwan's recycling shines on international sta...,"{awareness, efficiency, recycling}",0.302638
NYT_ENG_20020312.0012,"IN NEW YORK CITY, DOING WELL TRUMPS DOING GOOD...","{awareness, recycling, reuse}",0.246263
XIN_ENG_20050112.0044,HK to build Recovery Park for recycling indust...,"{procurement, recycling, reuse}",0.235516
XIN_ENG_20101013.0464,EU calls for cooperation with Asia for SMEs su...,"{sustainable consumption, sustainable producti...",0.171157
XIN_ENG_20101106.0059,World dairy summit in New Zealand eyes sustain...,"{recycling, reuse, sustainability}",0.149503
APW_ENG_20070905.1543,"Coca-Cola announces new plant, sets 100 percen...","{awareness, recycling, reuse}",0.125116
WPB_ENG_20101028.0037,Australian Landlords Prepare for New Energy Ef...,"{efficiency, sustainability, recycling}",0.12411
XIN_ENG_20080227.0121,Plan to recycle schoolbooks gets mixed recepti...,"{awareness, recycling, reuse}",0.120987
XIN_ENG_20090830.0231,"Beijing sets ""recycling"" day and offers door-t...","{awareness, efficiency, recycling}",0.119119
XIN_ENG_20041030.0279,Clean-up campaign launched in Kathmandu Valley...,"{awareness, recycling, reuse, waste management}",0.118422
