In [10]:
#2-step keyword extraction
#Text Mining Domains, VU

import pickle
import glob
import pandas as pd
import json
import os

def new_dict_concat_six (unpickled_dict):
    '''
    Build one searchable string per article.

    unpickled_dict: dictionary whose values are indexable, with the headline
        at index [1] and the list of first five sentences at index [2]
    returns: new dictionary with the same keys (in the same order); each value
        is the headline, followed by '. ', followed by the sentences joined
        with single spaces

    No keyword filtering happens here; this function only concatenates.
    '''
    return {
        article_id: fields[1] + '. ' + ' '.join(fields[2])
        for article_id, fields in unpickled_dict.items()
    }

def first_filter(keywords, infolder, outfolder = 'filteredpickles'):
    '''
    Rough first-pass filter over pickled article dictionaries.

    For every file matched by the glob pattern, the pickled dictionary is loaded,
    headline and first sentences are concatenated per article (new_dict_concat_six)
    and only the articles whose concatenated text contains at least one of the
    keywords (plain, case-sensitive substring match) are kept. The surviving
    articles are written to <outfolder>/<keywords[0]>/<name>.json, where <name>
    is the input file name without its '.gz.pkl' (or '.pkl') suffix.

    keywords: non-empty list of strings; the first one also names the output subfolder
    infolder: glob pattern of the pickle files to read, e.g. 'pickles/*'
    outfolder: root folder for the json output (default 'filteredpickles')
    returns: None, results are written to disk
    '''
    for file in glob.glob(infolder):
        basename = os.path.basename(file)

        # str.rstrip('.gz.pkl') would strip any trailing run of the characters
        # '.', 'g', 'z', 'p', 'k', 'l' (e.g. 'blog.gz.pkl' -> 'blo'), so the
        # suffix is removed explicitly instead.
        for suffix in ('.gz.pkl', '.pkl'):
            if basename.endswith(suffix):
                stem = basename[:-len(suffix)]
                break
        else:
            stem = basename

        # NOTE: pickle.load can execute arbitrary code; only run this on trusted files.
        # The context manager makes sure the file handle is closed again.
        with open(file, 'rb') as infile:
            unpickled_dict = pickle.load(infile)

        # concat headlines and first_five sentences into one string per article
        new_dict = new_dict_concat_six(unpickled_dict)

        # rough filter: keep an article as soon as any keyword occurs in its text
        output_dict = {
            k: v for k, v in new_dict.items()
            if any(keyword in v for keyword in keywords)
        }

        # write filtered articles - json so they can be inspected in an IDE
        target_dir = os.path.join(outfolder, keywords[0])
        os.makedirs(target_dir, exist_ok=True)

        with open(os.path.join(target_dir, f'{stem}.json'), 'w', encoding = 'utf-8') as outfile:
            json.dump(output_dict, outfile)

In [11]:
# Phase 1: rough pre-filter. Keep the articles whose headline + first sentences
# contain 'poverty' or ' aid' and write them to json files.
# NOTE(review): ' aid' carries a leading space, presumably so that it does not
# match inside words such as 'said' or 'paid' - confirm this is intended.
# first_filter names the output subfolder after keywords[0] (here 'poverty').
# Computationally this is the heaviest step.
keywords=['poverty', ' aid']
infolder = 'pickles/*'
first_filter(keywords, infolder)

In [3]:
#phase 2: fine-grained filter

# Load every pre-filtered json file into one DataFrame: the index holds the
# article keys, the single column 'text' holds the concatenated article text.
columns = ['text']
df_list = []
for file in glob.glob('filteredpickles/*/*'):
    # context manager so each file handle is closed right after parsing
    with open(file, 'rb') as infile:
        jsonners = json.load(infile)
    df_list.append(pd.DataFrame.from_dict(jsonners, orient='index', columns=columns))

# pd.concat raises ValueError when no file matched the pattern above
big_df = pd.concat(df_list)
# show the full article text instead of truncating long cells
pd.set_option('display.max_colwidth', None)

In [4]:
# sanity check: number of rows (pre-filtered articles) loaded into big_df
print(len(big_df))

195588


In [8]:
def keyword_search(keywords, big_df, treshold = 0):
    '''
    Fine-grained keyword filter over the 'text' column of big_df.

    keywords: iterable of strings, matched as plain case-sensitive substrings
    big_df: object whose ['text'] entry iterates over the article texts
    treshold: minimum number of distinct keywords an article must contain
        to be selected (with the default 0 every article is selected)
    returns: tuple (articles, article_keywords); two parallel lists holding the
        selected texts and, for each of them, the set of keywords found in it
    '''
    selected_texts = []
    selected_keywords = []
    for body in big_df['text']:
        hits = {term for term in keywords if term in body}
        # not enough distinct keywords in this article: skip it
        if len(hits) < treshold:
            continue
        selected_texts.append(body)
        selected_keywords.append(hits)
    return selected_texts, selected_keywords

In [12]:
# Topic whose keyword list is read from keywords/<topic>.txt.
# Currently works for: poverty, gender equality.
# TODO: the rough pre-filter still needs to be fixed for 'gender equality'.
topic = 'gender equality'

In [14]:
# Read one keyword per line. Blank lines are skipped: an empty string is a
# substring of every text, so it would count as a "found" keyword in every
# article and silently lower the effective treshold.
with open(f'keywords/{topic}.txt', 'r', encoding = 'utf-8') as infile:
    keywords = [line for line in infile.read().splitlines() if line.strip()]

#fine-grained search, kwarg treshold indicates the minimum number of distinct keywords that should be present in the text
articles, keywords_per_article = keyword_search(keywords, big_df, treshold = 3)

print('Number of articles found:', len(articles))

Number of articles found: 102


In [None]:
# Show every selected article, preceded by the set of keywords found in it.
for i, article in enumerate(articles):
    print(keywords_per_article[i])
    print(article)

In [7]:
def keyword_company_search(company, keywords, big_df, treshold = 0):
    '''
    Keyword filter restricted to the articles that mention a given company.

    company: string that must occur in the article text (case-sensitive substring)
    keywords: iterable of strings, matched as plain case-sensitive substrings
    big_df: object whose ['text'] entry iterates over the article texts
    treshold: minimum number of distinct keywords an article must contain
        to be selected (with the default 0 every article mentioning the
        company is selected)
    returns: tuple (articles, article_keywords); two parallel lists holding the
        selected texts and, for each of them, the set of keywords found in it
    '''
    articles = []
    article_keywords = []

    for text in big_df['text']:
        # articles that never mention the company are not considered at all
        if company not in text:
            continue
        keywords_found = {keyword for keyword in keywords if keyword in text}
        if len(keywords_found) >= treshold:
            articles.append(text)
            article_keywords.append(keywords_found)
    return articles, article_keywords

# Read one keyword per line. Blank lines are skipped: an empty string is a
# substring of every text, so it would count as a "found" keyword in every
# article and silently lower the effective treshold.
with open('keywords/poverty.txt', 'r', encoding = 'utf-8') as infile:
    keywordz = [line for line in infile.read().splitlines() if line.strip()]

# articles mentioning 'Shell' that contain at least 3 distinct poverty keywords
articles, keywords_per_article = keyword_company_search('Shell', keywordz, big_df, treshold = 3)

print(len(articles))

3
