In [1]:
#2-step keyword extraction
#Text Mining Domains, VU

import pickle
import glob
import pandas as pd
import json
import os

def new_dict_concat_six(unpickled_dict):
    '''
    Build one searchable string per article.

    unpickled_dict: dictionary whose values are indexable, holding the headline at
        position [1] and the list of first five sentences at position [2]
    returns: new dictionary with the same keys, where each value is the headline,
        followed by '. ' and the sentences joined with single spaces

    No keyword filtering is done here; the returned dictionary has an entry for
    every key of the input.
    '''
    new_dict = {}
    for key, article in unpickled_dict.items():
        # headline first, then the sentences as one space-separated string
        new_dict[key] = article[1] + '. ' + ' '.join(article[2])

    return new_dict

def first_filter(keywords, infolder):
    '''
    First, rough keyword filter over a set of pickled article dictionaries.

    For every file matching the glob pattern, the pickled dictionary is loaded, each
    article is reduced to one string (headline + first sentences, see
    new_dict_concat_six) and only the articles whose string contains at least one of
    the keywords are kept. The kept articles are written as json to
    filteredpickles/<keywords[0]>/<file name without '.gz.pkl'>.json
    (one json file per input file, also when no article matched).

    keywords: list of strings; matching is a case-sensitive substring test.
        The first keyword also names the output folder.
    infolder: glob pattern of the pickle files to read, e.g. 'pickles/*'
    returns: None
    '''
    suffix = '.gz.pkl'

    for file in glob.glob(infolder):
        #prep filename for writing the output file
        basename = os.path.basename(file)
        # str.rstrip('.gz.pkl') removes any trailing run of the characters . g z p k l,
        # which also eats the end of the real name ('xin_eng.gz.pkl' -> 'xin_en').
        # Cut off the exact suffix instead.
        if basename.endswith(suffix):
            basename = basename[:-len(suffix)]

        # NOTE: pickle.load can execute arbitrary code - only use on trusted files.
        # The context manager makes sure the file handle is closed again.
        with open(file, 'rb') as infile:
            unpickled_dict = pickle.load(infile)

        # concat headlines and first_five sentences into one string per article
        new_dict = new_dict_concat_six(unpickled_dict)

        #rough filter: keep an article as soon as any keyword occurs in its text
        output_dict = {k: v for k, v in new_dict.items()
                       if any(keyword in v for keyword in keywords)}

        #write filtered articles - choosing json this time so I can inspect them in IDE
        outfolder = f'filteredpickles/{keywords[0]}'
        os.makedirs(outfolder, exist_ok=True)

        with open(f'{outfolder}/{basename}.json', 'w', encoding = 'utf-8') as outfile:
            json.dump(output_dict, outfile)
                  
def keyword_search(keywords, big_df, treshold = 0):
    '''
    Phase 2 of the keyword search, returning parallel lists.

    keywords: iterable of strings; matching is a case-sensitive substring test
    big_df: DataFrame with a 'text' column; its index supplies the article ids
    treshold: minimum number of distinct keywords an article must contain to be kept
        (with the default of 0 every article is kept)
    returns: tuple (articles, article_ids, article_keywords) of equally long lists with
        the text, the index label and the set of keywords found for each kept article
    '''
    articles, article_ids, article_keywords = [], [], []

    for position, text in enumerate(big_df['text']):
        # distinct keywords occurring somewhere in this text
        keywords_found = {keyword for keyword in keywords if keyword in text}
        if len(keywords_found) >= treshold:
            articles.append(text)
            article_keywords.append(keywords_found)
            # position is the row number, so this is the index label of the current row
            article_ids.append(big_df.index[position])

    return articles, article_ids, article_keywords
                  
def keyword_search2(keywords, big_df, treshold = 0):
    '''
    Phase 2 of the keyword search, returning a DataFrame.

    keywords: iterable of strings; matching is a case-sensitive substring test
    big_df: DataFrame with a 'text' column; its index supplies the article ids
    treshold: minimum number of distinct keywords an article must contain to be kept
        (with the default of 0 every article is kept)
    returns: DataFrame with one row per kept article and the columns
        'Identifier' (index label in big_df), 'Text', 'Keywords' (set of keywords found)
        and 'Number' (row position of the article in big_df)
    '''
    columns = ['Identifier', 'Text', 'Keywords', 'Number']
    article_dictlist = []

    for position, text in enumerate(big_df['text']):
        # distinct keywords occurring somewhere in this text
        keywords_found = {keyword for keyword in keywords if keyword in text}
        if len(keywords_found) >= treshold:
            article_dictlist.append({'Identifier' : big_df.index[position],
                                     'Text' : text,
                                     'Keywords' : keywords_found,
                                     'Number' : position})

    # Passing the columns explicitly keeps them present when nothing matched; without
    # them an empty result has no columns and articles['Number'] raises a KeyError.
    return pd.DataFrame(article_dictlist, columns=columns)
                  
def first_filter2(keywords, infolder):
    '''
    Alias of first_filter, kept so that existing calls to first_filter2 keep working.

    This function used to be a line-for-line copy of first_filter; delegating keeps
    the two from drifting apart.

    keywords: list of strings; the first one also names the output folder
    infolder: glob pattern of the pickle files to read, e.g. 'pickles/*'
    returns: None
    '''
    return first_filter(keywords, infolder)

In [2]:
#define the keywords for the 1st rough search
#(the filter does a case-sensitive substring match, hence both capitalisations)
keywords=['poverty', 'Poverty']

#rough filter: choose the articles containing 'poverty' or 'Poverty' and
#write them to json files
#computationally the most heavy step

#glob pattern of the pickle files to filter
infolder = 'pickles/*'
#writes one json file per input pickle to filteredpickles/<first keyword>/,
#holding only the docs that contain one of the keyword(s)
first_filter(keywords, infolder)

In [3]:
#the first filter wrote its json files to a folder named after the first keyword
keyword = keywords[0]
columns = ['text']

#read in the filtered json files
df_list = []
for file in glob.glob(f'filteredpickles/{keyword}/*'):
    #context manager so that each file handle is closed right after parsing
    with open(file, 'rb') as infile:
        jsonners = json.load(infile)
    #one row per article: index = article id, 'text' = headline + first sentences
    df_list.append(pd.DataFrame.from_dict(jsonners, orient='index', columns=columns))

#pd.concat raises a ValueError on an empty list; fall back to an empty frame so the
#count below reports 0 when no filtered files were found
big_df = pd.concat(df_list) if df_list else pd.DataFrame(columns=columns)
pd.set_option('display.max_colwidth', None)
print(f'The number of articles left after the first filter is: {len(big_df)}')

The number of articles left after the first filter is: 54455


In [4]:
topic = 'poverty'

#one keyword (or key phrase) per line
with open(f'keywords/{topic}.txt', 'r', encoding = 'utf-8') as infile:
    #skip blank lines: an empty string is a substring of every text and a line of only
    #spaces matches nearly every text, so either would count as a keyword hit and
    #inflate the number of keywords found per article
    keywords2 = [line for line in infile.read().splitlines() if line.strip()]

#second filter: keep the articles that contain at least 3 different keywords
articles = keyword_search2(keywords2, big_df, treshold = 3)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

#to do: figure out how to apply vectorization to the entire corpus
#or: select a representative example to train the tfidfvectorizer!

#2nd to do: find a way to give tf-idf scores to 2-grams and 3-grams

#train and apply Tfidf
#stop_words is documented as 'english', a list or None; recent scikit-learn versions
#validate the parameter and reject a set, so hand over a (sorted, reproducible) list
vectorizer = TfidfVectorizer(use_idf = True, stop_words = sorted(stopWords))
#rows of tf_idf_vecs follow the row order of big_df
tf_idf_vecs = vectorizer.fit_transform(big_df['text'])

In [6]:
#keep only the keywords that are terms of the fitted vectorizer.
#vocabulary_ is the fitted term -> column mapping, so the membership test is a dict
#lookup; this avoids rebuilding the full feature list for every keyword and does not
#rely on get_feature_names(), which recent scikit-learn versions no longer provide
keywords_corpus = [candidate for candidate in keywords2 if candidate in vectorizer.vocabulary_]
print('keywords used:', keywords_corpus)
            

keywords used: ['vulnerable', 'poor', 'ownership', 'control', 'property', 'resilience', 'exposure', 'shocks', 'disasters', 'mobilization', 'inhuman', 'workers', 'extortion', 'slavery', 'famine', 'hunger', 'entrepreneur', 'development']


In [7]:
#third step:
#compute tf-idf scores for selected docs.
#the score of a doc is the sum of its tf-idf weights for the keywords in keywords_corpus

#look up the tf-idf column of every keyword once, via the fitted term -> column mapping.
#This replaces building and sorting a dense, vocabulary-sized DataFrame per document
#and does not need get_feature_names(), which recent scikit-learn versions removed.
keyword_columns = [vectorizer.vocabulary_[term] for term in keywords_corpus]

doc_scores = []
for i in articles['Number']:
    #'Number' is the row position of the article in big_df, i.e. its row in tf_idf_vecs.
    #Absent keywords contribute a weight of 0, so summing all keyword columns in
    #keywords_corpus order gives the same total as adding only the positive ones.
    doc_scores.append(sum(tf_idf_vecs[i, column] for column in keyword_columns))


In [8]:
#one-column frame holding the score of each selected article, in the order of doc_scores
scores_df = pd.DataFrame(doc_scores, columns = ['Score'])

In [9]:
#put the scores next to the articles; concat with axis = 1 aligns the rows on the index
articles_w_scores = pd.concat([articles, scores_df], axis = 1)

In [10]:
#the 'Number' helper column is not part of the final table, so leave it out
articles_complete = articles_w_scores.drop(columns='Number')

In [11]:
#rank the articles: highest score first
articles_complete = articles_complete.sort_values(by="Score", ascending = False)

In [12]:
#show the first 10 rows, i.e. the 10 best scoring articles
articles_complete.head(10)

Unnamed: 0,Identifier,Text,Keywords,Score
442,APW_ENG_20051120.0537,"Britain pledges money to international fund to help poor countries deal with economic shocks. Britain will contribute 50 million pounds (US$85 million; euro75 million) to a new international fund that helps poor countries deal with economic shocks, Treasury chief Gordon Brown announced Sunday. The International Monetary Fund scheme will offer relief in the wake of high oil prices, natural disasters, conflicts or other adverse developments. The fund is expected to be capable of subsidizing lending of more than 1.5 billion pounds (US$2.8 billion; euro2.3 billion). Brown said global action is needed in response to the doubling of oil prices over the last year _ ""the largest sustained oil price shock since the 1970s."" ""This new IMF facility will play a vital role supporting poverty reduction in some of the world's most vulnerable countries, ensuring that development is not undermined by changes in economic conditions, whether those come from terms of trade shocks, high oil prices, or natural disasters,"" he said.","{natural disaster, poor, shocks, vulnerable, disasters, development}",0.751235
436,APW_ENG_20050915.1138,"Venezuela expands land reform, claiming farmlands for poor and stirring some concerns. Venezuela has claimed about 600,000 hectares (about 1.5 million acres) of farmlands as state property to be turned over to poor farmers, giving hope to some while raising concern among others about the future of private property in the country. Vice President Jose Vicente Rangel defended the agrarian reform program Thursday, saying Venezuela ""is broadening the system of private property"" by making lands available to poor farmers. ""That is social justice,"" he said. The government's National Lands Institute so far has declared 21 ranches to be state property, saying they were determined to be ""idle"" and those who claimed ownership couldn't prove it through documents. The government has helped poor farmers form cooperatives to start to work the land, despite legal challenges from those who used to run the ranches. President Hugo Chavez, who on Thursday took his anti-poverty message to the United Nations, said earlier this week that ""private property has to be subordinate to the general interest."" However, he has insisted private property rights will be upheld, as outlined in Venezuela's constitution.","{property, ownership, poor}",0.704637
450,APW_ENG_20060222.0356,"Norway contributes to new IMF fund to help poor countries handle economic shocks. Norway pledged 240 million kroner (US$35.5 million or euro30 million) on Wednesday to a new global fund to help poor countries deal with economic shocks. The program was set up by the International Monetary Fund in November to provide highly subsidized loans to poor countries when their economies are hit by war, natural disaster or extreme fluctuations in import or export prices. ""This new scheme will act as a buffer and will help to ensure continuity in the fight against poverty, even in cases where vulnerable, poor countries have to deal with sudden severe economic shocks due to external circumstances,"" Norwegian Aid Minister Erik Solheim said. The fund, called the Exogenous Shocks Facility, was set up as part of the IMF's Poverty Reduction and Growth Facility Trust following a proposal from the world's richest nations, called the Group of Eight. Norway, which is rich on offshore oil, is the world's largest foreign aid donor per capita.","{shocks, natural disaster, poor, vulnerable}",0.683885
1701,XIN_ENG_20091202.0200,"UN chief calls for eradication of all slavery practices on\ninternational day of slavery abolition. Contemporary forms of slavery remain a ""grave and unresolved problem"" across all continents, UN Secretary-general Ban Ki-moon warned on Wednesday, calling for greater efforts to address poverty and social inequalities which leave people vulnerable to enslavement. In a message marking the International Day for the Abolition of Slavery, observed annually on Dec. 2, Ban said that the list of new and old forms of slavery is ""shockingly long."" That list includes debt bondage, serfdom, forced labor, child labor and servitude, trafficking of persons and human organs, sexual slavery, forced marriage, the exploitation of prostitutes and the use of child soldiers. ""The majority who suffer are the poor and socially excluded groups such as minorities and migrants,"" said Ban. ""The overlapping factors of poverty, class and race create structural problems and cycles of marginalization that are hard to break."" The secretary-general noted that gender inequalities, lack of education, desperation for work and demand for cheap labor also trap people in a life of subjugation, a vulnerability the global economic and financial crises threaten to heighten.","{slavery, child labor, poor, vulnerable}",0.621138
119,AFP_ENG_20051120.0373,"Britain pledges tens of millions to IMF to fight poverty. Britain plans to contribute as much as 50 million pounds (85 million dollars, 73 million euros) to the International Monetary Fund (IMF) to help poor countries deal with economic shocks, the British finance minister said Sunday. Chancellor of the Exchequer Gordon Brown said ""this new IMF facility will play a vital role supporting poverty reduction in some of the world's most vulnerable countries, ensuring that development is not undermined by changes in economic conditions, whether those come from terms of trade shocks, high oil prices or natural disasters."" Brown also said that France and Saudi Arabia also have promised to make contributions. ""But further contributions are needed from other donors -- including from oil producers -- to fund the new facility in full,"" Brown added.","{poor, shocks, vulnerable, disasters, development}",0.602788
470,APW_ENG_20061224.0241,"China ' s parliament takes up measures to update property, tax laws. Parliament on Sunday took up measures meant to bring Chinese law into line with a more open, capitalist-style economy by protecting private property and equalizing taxes for foreign and domestic companies. The proposed property law is the most controversial measure to come before parliament in recent years. Earlier versions prompted an outcry by leftists, who complained it would undermine state control of the economy and worsen the growing gap between an elite who have profited from China's reforms and the poor majority. The National People's Congress began considering a seventh draft on Sunday that ""strikes a balance between private property and state ownership,"" said the official Xinhua News Agency. It said backers hoped to pass it when the NPC holds its next full meeting in March. The Communist Party amended the constitution in 2004 to enshrine private property rights for the first time since its 1949 revolution. That followed two decades of reform that let hundreds of millions of Chinese lift themselves out of poverty as entrepreneurs started businesses, bought homes and traded stocks. The debate over legal changes meant to enforce such protections highlights enduring concern about the impact of China's rapid but uneven growth, which has set off protests over poverty, taxes and seizures of farmland for redevelopment.","{control, entrepreneur, property, poor, ownership, development}",0.59974
1613,XIN_ENG_20071209.0012,"Report links weather-related disasters with climate change. The equivalent of a third of the world's population has already been affected by weather- related disasters and this is set to soar because of climate change unless urgent international action is taken, according to a report issued here this week. Governments must commit at least 50 billion U.S. dollars every year to helping the world's most vulnerable communities prepare to save their own lives and livelihoods, says the report ""Climate of Disaster"" published by Tearfund, one of the UK's leading relief and development agencies. In the past 10 years, weather-related disasters have killed over 443,000 people, affected 2.5 billion people and cost an estimated 600 billion U.S. dollars in economic losses. With climate change increasing the number and intensity of extreme events such as floods and droughts, more and more people are becoming vulnerable to a range of environmental disasters, according to the report. Without urgent action, this trend is set to rise, leading to unprecedented levels of suffering and deaths. Poor people will be hit hardest as they are the least able to cope, and live in the most vulnerable areas of the world. With each new disaster, precious gains made in poverty eradication are swept away, warns the report. The following are the key highlights from the report ""Climate of Disaster"":","{poverty eradication, development, vulnerable, disasters}",0.593763
3,AFP_ENG_19941105.0221,"(picture) Clinton announces housing initiative. President Bill Clinton announced Saturday that he was asking a task force for ideas on how to expand home ownership to include more poor and minority buyers. In an address to the National Association of Realtors, the president said that home ownership had been sliding since 1980. ""We have to turn this around ... and I am convinced that we can do it,"" he said, saying his goal was to bring home ownership to ""an all time high in the US before the century is over."" To accomplish this, Clinton said he had asked Henry Cisneros, the secretary of housing and urban development, to create a task force that drew on such diverse groups as real estate agents, mortgage bankers, urban development experts and anti-poverty workers. They would draw up a strategy and report to the White House within six months, Clinton said.","{development, ownership, poor, workers}",0.58289
1351,XIN_ENG_20031008.0147,"Ugandans urged to reduce number, impact of disasters. Ugandans on Wednesday were called to reduce the number and impact of disasters by building sustainable communities that have the long-term capacity to live with risk. The call was contained in a statement issued by Lt. Gen. Moses Ali, first deputy prime minister and minister for disaster preparedness and refugees, to mark the International Day for Disaster Reduction which falls on Oct. 8, 2003. Ali said the theme of this year's International Day for Disaster Reduction is ""turning the tide on disasters towards sustainable development,"" adding that ""this theme is reminds us that the task is not just to effectively respond to disasters in order to save lives, but more to reduce the chances of hazards turning into disasters, taking life away and destroying development gains."" ""In Uganda, drought, floods, landslides, windstorms and hailstorms destroy an average of 800,000 hectares of crops making economic loses in excess of 60 million US dollars every year. Economic loses resulting from transport accidents and fires is estimated 25 million dollars annually. Death tolls resulting from natural and man-made disasters exceed 5,000 annually. Between 1980 and 2003, 1 in 30 people in Uganda were affected by a natural or man-made disaster and fell into the poverty bracket,"" he noted. ""At intervals of 5 to 10 years, an earthquake occurs in western Uganda destroying infrastructure and property estimated at over 40 million dollars. Also between 3 and 5 years periodically an El Nino rain phenomenon occurs followed by a severe drought in Uganda, also causing destruction to property and infrastructure estimated at over 200 million dollars,"" he said.","{development, property, disasters}",0.559259
346,APW_ENG_20020618.1182,"International Red Cross: spend before disasters, not just after Eds:\nEMBARGO set by source at 0001gmt, Wednesday, June 19. By spending more aid money on getting poor countries ready for disasters, rich nations could save thousands of lives, face smaller bills after floods or hurricanes, and boost the fight against poverty, the international Red Cross said Wednesday. The World Disasters Report, issued annually by the International Federation of Red Cross and Red Crescent Societies, said the economic and social cost of such events could be slashed if donors used their money more wisely _ before, rather than just after high profile disasters. ``Disasters aren't just little blips on the development curve,'' said Red Cross official Peter Walker, who edited the 239-page study. ``If we don't take them into account then we haven't got a chance'' of lifting poor countries out of poverty. Walker said donors should think like investors and target a larger proportion of their foreign aid to disaster preparedness. He did not suggest figures, but said the sums involved would be less than the current dlrs 6.3 billion a year it costs to rebuild after a catastrophe. The report said the number of people affected by natural and manmade disasters climbed to 170 million last year from an annual 70 million in the 1970s.","{development, poor, disasters}",0.55387
