In [1]:
#2-step keyword extraction
#Text Mining Domains, VU

import pickle
import glob
import pandas as pd
import json
import os

def new_dict_concat_six (unpickled_dict):
    '''
    takes an unpickled dictionary as input with headline as [1] and list of first five sentences as [2]
    converts to a new dict with same key but value is string of concatenated headline + first five sentences
    filters the values of the keywords
    reurns: new dictionary with keys and concatenated headlines and first_five sentences of those articles which 
        contain the keywords
    '''        
    output = [unpickled_dict[key][1]+ '. ' + ' '.join(unpickled_dict[key][2]) for key in unpickled_dict]
    
    # calling test_dict only gets keys
    new_dict = dict(zip(unpickled_dict, output))
    
    return new_dict

def first_filter(keywords, infolder):
    for file in glob.glob(infolder):
        #prep filepaths for writing files
        path = os.path.split(file)[0].split('/')[0]
        basename = os.path.basename(file)

        unpickled_dict = pickle.load(open (file, 'rb'))
        #print(unpickled_dict)
        # run the function above to concat headlines and first_five
        new_dict = new_dict_concat_six(unpickled_dict)
        #print(new_dict)
        #rough filter for only keywords (see above)
        output_dict = {k: v for k, v in new_dict.items() for keyword in keywords if keyword in v}

        #write filtered articles - choosing json this time so I can inspect them in IDE
        if not os.path.exists(f'filteredpickles/{keywords[0]}'):
            os.makedirs(f'filteredpickles/{keywords[0]}')

        with open(f'filteredpickles/{keywords[0]}/{basename.rstrip(".gz.pkl")}.json', 'w', encoding = 'utf-8') as outfile:
            json.dump(output_dict, outfile)

In [2]:
#rough filter, choose articles containing 'poverty' or 'aid',
#write them to json files
#computationally the most heavy step
keywords=['poverty', ' aid']
infolder = 'pickles/*'
first_filter(keywords, infolder)

In [3]:
#phase 2: fine-grained filter

df_list = []
for file in glob.glob('filteredpickles/*/*'):
    jsonners = json.load(open(file, 'rb'))
    columns=['text']
    df_list.append(pd.DataFrame.from_dict(jsonners, orient='index', columns= columns))
       
big_df = pd.concat(df_list)
pd.set_option('display.max_colwidth', None)

In [4]:
print(len(big_df))

304350


In [5]:
def keyword_search(keywords, big_df, treshold = 0):
    articles = []
    article_keywords = []
    #do a keyword search for the company
    for text in big_df['text']:
        keywords_found = set()
        for keyword in keywords:
            if keyword in text:
                keywords_found.add(keyword)
        if len(keywords_found) >= treshold:
            articles.append(text)
            article_keywords.append(keywords_found)
    return articles, article_keywords

In [6]:
#currently works for: poverty, gender equality (need to fix prefilter for the latter)
topic = 'poverty'

In [7]:
with open(f'keywords/{topic}.txt', 'r', encoding = 'utf-8') as infile:
    keywords = infile.read().splitlines()

#fine-grained search, kwarg treshold indicates the number of keywords that should be present in the text
articles, keywords_per_article = keyword_search(keywords, big_df, treshold = 3)

print('Number of articles found:', len(articles))

Number of articles found: 3136


In [8]:
for i in range(len(articles)):
    print(keywords_per_article[i])
    print(articles[i])

{'human rights', 'development', 'hunger'}
Mugabe outlines "new vision for Africa" by Lawrence Bartlett. Zimbabwean President Robert Mugabe outlined a "new vision for Africa" Thursday, in which the continent would no longer be synonymous with economic mismanagement, civil strife, or human rights abuses, nor with poverty, hunger, ignorance and disease. Opening a meeting of African presidents and representatives of Western donor nations he said that "perhaps at no other time in Africa's history has the continent felt so acutely the need for partnership and genuine friendship than today." Africa "is an essential component of the global polity. The failure or success of Africa affects the rest of the world, for the world is one," he said. "Global peace, economic prosperity and environmental regeneration cannot be sustained unless Africa is freed from the vicissitudes of poverty and war." Mugabe said the troubled continent was embarking on a three-pronged programme of economic structural adj

{'development', 'property', 'poor'}
Public Housing Being Built in Affluent Dallas Neighborhoods 
 (Dallas). For more than 40 years, the West Dallas projects have stood for two troubling things: the past fashion of placing public housing in the nation's poorest neighborhoods, and the sharp racial divisions that still endure in this very southern city. But fashions change and West Dallas, with the formal, forgotten name of Lake West, is about to enter a new era. Under federal court order, the Dallas Housing Authority is largely dismantling the barracks-like housing project, which has been home to the city's poorest blacks. In its place, smaller complexes will be built, tucked into more prosperous neighborhoods, part of a trend being championed nationwide by Housing and Urban Development Secretary Henry Cisneros. In Dallas, that means upsetting the longtime racial and economic patterns of the city -- blacks and the poor to the south of the dusty Trinity River, whites and the middle-class 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




{'vulnerable', 'natural disaster', 'disasters'}
NW China province sets up emergency aid scheme over west-east gas
pipeline. Northwest China's Shaanxi Province  has worked out an emergency aid scheme to ensure the safe  operation of the country's massive west-to-east gas pipeline that  runs through nine provinces and municipalities including Shaanxi. A 350-km section of the pipeline -- about 3,980 kilometers long and extending from northwest China's Xinjiang Uygur Autonomous  Region to Shanghai -- stretches across the province. It was put  into operation on January 1 this year, with a daily transmission  of 9.5 million cubic meters of gas. The pipeline, under high-pressure operation, is vulnerable to  natural disasters and man-made damages, the experts said, adding  that gas leaks could lead to fire, explosion or gas poisoning. There are a lot of hidden dangers due to insufficient  coordination between the local government and the pipeline  administration company, according to the expe

Annan. China is playing a  constructive and positive role in the world affairs both  economically and politically and it will have a lot to offer and  experience to share with other countries at the upcoming 2005  World Summit next week, UN Secretary General Kofi Annan said here  on Thursday. In a joint interview with China's Xinhua News Agency and the  China Central Television six days before the summit, scheduled for Sept. 14-16, Annan said China is a very important country in the  world today. Not just economically but also politically, he believed, it is  important that Chinese president Hu Jintao joins all the leaders  from around the world to review "how we improve our collective  security, how we improve the non-proliferation issue, how we  improve human rights, how we deal with development and help the  poor". "And one of the key issues at this conference is to look at and review what we have achieved in our efforts to implement the  Millennium Development Goals (MDGs) and the 

In [9]:
def keyword_company_search(company, keywords, big_df, treshold = 0):
    articles = []
    article_keywords = []
    
    found_company = False
    for text in big_df['text']:
        if company in text:   
            keywords_found = set()
            for keyword in keywords:
                if keyword in text:
                    keywords_found.add(keyword)
            if len(keywords_found) >= treshold:
                articles.append(text)
                article_keywords.append(keywords_found)
    return articles, article_keywords

with open('keywords/poverty.txt', 'r', encoding = 'utf-8') as infile:
    keywordz = infile.read().splitlines()

articles, keywords_per_article = keyword_company_search('Shell', keywordz, big_df, treshold = 3)

print(len(articles))

3
