In [66]:
#2-step keyword extraction
#Text Mining Domains, VU

import pickle
import glob
import pandas as pd
import json
import os

def new_dict_concat_six(unpickled_dict):
    '''
    Builds one text string per article out of its headline and its leading sentences.

    unpickled_dict: dictionary whose values are indexable, with the headline (a string) at
        position [1] and a list of sentence strings at position [2]
        (presumably the first five sentences of the article; the length is not checked here)
    returns: new dictionary with the same keys in the same order; each value is the headline,
        then '. ', then the sentences joined with single spaces.
        No keyword filtering happens in this function: every article is kept.
    '''
    new_dict = {}
    for key, article in unpickled_dict.items():
        headline = article[1]
        sentences = article[2]
        new_dict[key] = headline + '. ' + ' '.join(sentences)

    return new_dict

def first_filter(keywords, infolder):
    '''
    Rough first-pass filter: keeps the articles whose text contains at least one keyword.

    For every file matched by the glob pattern `infolder` the pickled dictionary is loaded,
    headline and sentences are concatenated with new_dict_concat_six, and the entries whose
    text contains any of `keywords` (case-sensitive substring match) are written to
    filteredpickles/<keywords[0]>/<file name without .gz.pkl>.json.
    A json file is written for every input file, even when no article matched.

    keywords: non-empty list of strings; the first one names the output folder
    infolder: glob pattern for the pickle files, e.g. 'pickles/*'
    returns: None
    '''
    for file in glob.glob(infolder):
        basename = os.path.basename(file)

        # Remove the extension as a suffix. str.rstrip(".gz.pkl") strips any trailing run of the
        # characters '.', 'g', 'z', 'p', 'k', 'l' and therefore also eats the end of the file stem.
        stem = basename
        for suffix in ('.gz.pkl', '.pkl'):
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
                break

        # NOTE: pickle.load can execute arbitrary code - only use on files from a trusted source.
        with open(file, 'rb') as infile:
            unpickled_dict = pickle.load(infile)

        # concat headlines and first sentences into one string per article
        new_dict = new_dict_concat_six(unpickled_dict)

        #rough filter: keep an article as soon as one keyword occurs in its text
        output_dict = {k: v for k, v in new_dict.items()
                       if any(keyword in v for keyword in keywords)}

        #write filtered articles - choosing json this time so I can inspect them in IDE
        outfolder = f'filteredpickles/{keywords[0]}'
        os.makedirs(outfolder, exist_ok=True)

        with open(f'{outfolder}/{stem}.json', 'w', encoding = 'utf-8') as outfile:
            json.dump(output_dict, outfile)
                  
def keyword_search(keywords, big_df, treshold = 0):
    '''
    Phase 2 of the keyword search: selects the articles that contain enough distinct keywords.

    keywords: iterable of strings; matching is a case-sensitive substring test, so 'poor'
        also matches inside 'poorest'
    big_df: DataFrame with a 'text' column; its index values are used as article identifiers
    treshold: minimum number of distinct keywords an article must contain to be kept
        (the default 0 keeps every article)
    returns: DataFrame with one row per kept article and the columns
        'Identifier' (index value in big_df), 'Text', 'Keywords' (set of keywords found) and
        'Big_df index' (row position in big_df). The columns are present even when no
        article qualifies, so downstream column access does not fail on an empty result.
    '''
    columns = ['Identifier', 'Text', 'Keywords', 'Big_df index']
    article_dictlist = []
    for i, (identifier, text) in enumerate(zip(big_df.index, big_df['text'])):
        keywords_found = {keyword for keyword in keywords if keyword in text}
        if len(keywords_found) >= treshold:
            article_dictlist.append({'Identifier' : identifier,
                                     'Text' : text,
                                     'Keywords' : keywords_found,
                                     'Big_df index' : i})
    return pd.DataFrame(article_dictlist, columns = columns)
                  
                  
def no_filter(infolder):
    '''
    Converts every pickle to json without any keyword filtering.

    For every file matched by the glob pattern `infolder` the pickled dictionary is loaded,
    headline and sentences are concatenated with new_dict_concat_six, and all articles are
    written to filteredpickles/unfiltered/<file name without .gz.pkl>.json.

    infolder: glob pattern for the pickle files, e.g. 'pickles/*'
    returns: None
    '''
    outfolder = 'filteredpickles/unfiltered'
    for file in glob.glob(infolder):
        basename = os.path.basename(file)

        # Remove the extension as a suffix. str.rstrip(".gz.pkl") strips any trailing run of the
        # characters '.', 'g', 'z', 'p', 'k', 'l' and therefore also eats the end of the file stem.
        stem = basename
        for suffix in ('.gz.pkl', '.pkl'):
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
                break

        # NOTE: pickle.load can execute arbitrary code - only use on files from a trusted source.
        with open(file, 'rb') as infile:
            unpickled_dict = pickle.load(infile)

        # concat headlines and first sentences into one string per article
        new_dict = new_dict_concat_six(unpickled_dict)

        #write all articles - choosing json this time so I can inspect them in IDE
        os.makedirs(outfolder, exist_ok=True)

        with open(f'{outfolder}/{stem}.json', 'w', encoding = 'utf-8') as outfile:
            json.dump(new_dict, outfile)

In [60]:
#define the keywords for the 1st rough search
keywords=['poverty', 'Poverty']

#rough filter: keep the articles containing 'poverty' or 'Poverty' (case-sensitive substring match)
#and write them to json files
#computationally the most heavy step

infolder = 'pickles/*'
#writes one json file per input file to filteredpickles/poverty/
#(a file holds an empty dict when none of its articles contain a keyword)
first_filter(keywords, infolder)

In [61]:
keyword = keywords[0]
#read in the filtered json files: one single-column ('text') frame per file, indexed by article id
columns=['text']
df_list = []
# sorted(): glob returns files in arbitrary order, and the row positions of big_df are used
# further down as row numbers into the tf-idf matrix, so the order has to be reproducible
for file in sorted(glob.glob(f'filteredpickles/{keyword}/*')):
    with open(file, 'rb') as infile:
        jsonners = json.load(infile)
    df_list.append(pd.DataFrame.from_dict(jsonners, orient='index', columns= columns))
big_df = pd.concat(df_list)
pd.set_option('display.max_colwidth', None)
print(f'The number of articles left after the first filter is: {len(big_df)}')

The number of articles left after the first filter is: 54455


In [None]:
# Writes every article of every pickle to filteredpickles/unfiltered (no keyword filter);
# the random background sample below is drawn from that folder.
# Needs `infolder` from the first-filter cell.
no_filter(infolder)

In [70]:
import random


# Background sample for tf-idf: all articles of up to N_RANDOM_FILES unfiltered files.
N_RANDOM_FILES = 50
RANDOM_SEED = 42

path = "filteredpickles/unfiltered"
# sorted listing + seeded generator -> the same files are drawn on every run
filenames = sorted(os.listdir(path))
# sample() draws without replacement; picking with random.choice in a loop could select the
# same file several times and duplicate all of its articles in the corpus
sampled_filenames = random.Random(RANDOM_SEED).sample(filenames, min(N_RANDOM_FILES, len(filenames)))

df_list = []
for filename in sampled_filenames:
    with open(f'{path}/{filename}', 'rb') as infile:
        jsonners = json.load(infile)
    df_list.append(pd.DataFrame.from_dict(jsonners, orient = 'index', columns = ['text']))

In [87]:
# df_list has to be the list built by the random-sample cell: the same name is also used when
# loading the keyword-filtered files, so this cell depends on which of the two ran last.
random_df = pd.concat(df_list)
print(len(random_df))

440432


In [62]:
topic = 'poverty'

# One keyword (or multi-word phrase) per line. Blank lines are dropped: an empty string is a
# substring of every text and would be counted as a found keyword in every article.
with open(f'keywords/{topic}.txt', 'r', encoding = 'utf-8') as infile:
    keywords2 = [line for line in infile.read().splitlines() if line.strip()]

# keep the articles that contain at least 3 distinct keywords
articles = keyword_search(keywords2, big_df, treshold = 3)

In [75]:
# big_df has to come first: keyword_search stores row positions within big_df ('Big_df index'),
# and those positions are later used as row numbers into the tf-idf matrix fitted on mixed_df.
mixed_df = pd.concat([big_df,random_df])

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

# TODO: figure out how to apply vectorization to the entire corpus,
#       or select a representative sample to fit the TfidfVectorizer on.
# TODO: find a way to give tf-idf scores to 2-grams and 3-grams
#       (only single-word keywords show up in the vocabulary right now).

# Fit tf-idf on the mixed corpus; row i of tf_idf_vecs belongs to row i of mixed_df.
# stop_words is passed as a list: scikit-learn accepts 'english', a list or None here, and
# recent versions validate the type and reject a set.
vectorizer = TfidfVectorizer(use_idf = True, stop_words = sorted(stopWords))
tf_idf_vecs = vectorizer.fit_transform(mixed_df['text'])

In [88]:
#print(tf_idf_vecs)

  (0, 133472)	0.11244104863496775
  (0, 214379)	0.05984875788515843
  (0, 53636)	0.08030536484157844
  (0, 67204)	0.04956203310996382
  (0, 29670)	0.10258812314721022
  (0, 104519)	0.09117467163100387
  (0, 86120)	0.06398747560684362
  (0, 227269)	0.08109966262456396
  (0, 159671)	0.17113556341819738
  (0, 191153)	0.037773790657286574
  (0, 125331)	0.10724811024403348
  (0, 35985)	0.14671437390850908
  (0, 195726)	0.06325560614025016
  (0, 81459)	0.057974556854915496
  (0, 4000)	0.07436434424040826
  (0, 302934)	0.08465238583875216
  (0, 3359)	0.0752879815054523
  (0, 99912)	0.10392272596878264
  (0, 159188)	0.06944762247255404
  (0, 216013)	0.10226390972652007
  (0, 197902)	0.09534031492539272
  (0, 296267)	0.11082363601558565
  (0, 93620)	0.07146050953555214
  (0, 114728)	0.14972901654043722
  (0, 227521)	0.09005140102988617
  :	:
  (494885, 21109)	0.04390941962899101
  (494885, 183020)	0.06155337276798539
  (494885, 140211)	0.053877872929335616
  (494885, 34690)	0.058873156108762544

In [78]:
# Keep only the keywords that made it into the fitted vocabulary.
# vocabulary_ maps every term to its column index, so the membership test is a dict lookup
# instead of rebuilding and scanning the full feature-name list once per keyword.
keywords_corpus = [keyword for keyword in keywords2 if keyword in vectorizer.vocabulary_]
print('keywords used:', keywords_corpus)
            

keywords used: ['vulnerable', 'poor', 'ownership', 'control', 'property', 'resilience', 'exposure', 'shocks', 'disasters', 'mobilization', 'inhuman', 'workers', 'extortion', 'slavery', 'famine', 'hunger', 'entrepreneur', 'development']


In [79]:
#third step:
#score every selected article with the mean tf-idf of the corpus keywords that occur in it
#(0 when none of them occurs)

# Column index of each keyword, looked up once. dict.fromkeys drops duplicate keywords while
# keeping their order, so a keyword listed twice is not summed twice.
keyword_cols = [vectorizer.vocabulary_[keyword] for keyword in dict.fromkeys(keywords_corpus)]

doc_scores = []
for i in articles['Big_df index']:
    # Row i of tf_idf_vecs is row i of big_df. Only the stored (non-zero) entries of that row
    # are read, instead of building and sorting a dense table over the whole vocabulary.
    row = tf_idf_vecs[i]
    row_scores = dict(zip(row.indices, row.data))
    found_scores = [row_scores[col] for col in keyword_cols if row_scores.get(col, 0) > 0]
    doc_score = sum(found_scores) / len(found_scores) if found_scores else 0
    doc_scores.append(doc_score)

# The inspection cell below prints `df`; keep it defined as the tf-idf table of the last
# scored article, sorted by descending score.
if len(articles) > 0:
    feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    last_position = articles['Big_df index'].iloc[-1]
    df = pd.DataFrame(tf_idf_vecs[last_position].T.toarray(), index=feature_names, columns=["tfidf"])
    df = df.sort_values(by=["tfidf"], ascending=False)


Empty DataFrame
Columns: []
Index: [00, 000, 0000, 0001, 0001gmt, 000466, 0005, 000696, 000896, 0009, 000m, 000th, 000us, 001, 0010, 0012, 0015, 001748, 002, 0024, 0026, 003, 0030, 0031, 0033, 0035, 0037, 0038, 0039, 004, 0040, 0041, 0044, 0045, 0046, 0047, 0049, 005, 0050, 0051, 0054, 0055, 0056, 0057, 0058, 0059, 006, 0060, 0062, 0063, 0065, 0066, 0069, 00696, 007, 0070, 0073, 0075, 0079, 008, 0081, 0086, 0087, 0089, 0090, 0091, 0097, 009704, 0098, 009905, 009908, 00pm, 00s, 01, 010, 0100, 010004, 010010, 0100gmt, 0101, 011, 0110, 0115, 01181906, 01181911, 0119, 012, 0127, 013, 0130, 0134, 0135, 0136, 0139, 014, 0140, 0141, 015, 0150, 0157, ...]

[107547 rows x 0 columns]


In [42]:
# `df` is whatever the scoring loop left behind: the tf-idf table of the last article it
# processed, sorted by descending score.
print(df['tfidf'])

fijian         0.656915
waqa           0.257551
dollars        0.255263
fiji           0.236556
labor          0.167536
                 ...   
floodplains    0.000000
floodlit       0.000000
floodlights    0.000000
floodings      0.000000
äcars          0.000000
Name: tfidf, Length: 107547, dtype: float64


In [80]:
# one score per row of `articles`, in the same order (doc_scores is filled while iterating articles['Big_df index'])
scores_df = pd.DataFrame(doc_scores, columns = ['Score'])

In [81]:
# side-by-side concat aligns on the index; both frames are built from plain lists and so
# both carry the default 0..n-1 index
articles_w_scores = pd.concat([articles, scores_df], axis = 1)

In [83]:
# 'Big_df index' was only needed to look up rows in the tf-idf matrix
articles_complete = articles_w_scores.drop('Big_df index', axis=1)

In [84]:
# highest mean keyword tf-idf first
articles_complete = articles_complete.sort_values(by="Score", ascending = False)

In [89]:
# the 50 best-scoring articles
articles_complete.head(50)

Unnamed: 0,Identifier,Text,Keywords,Score
410,APW_ENG_20041110.0267,"Brazilian government is failing to feed its people, human rights\ngroup says. A human rights organization criticized the Brazilian government's program to eliminate hunger, saying Wednesday the plan has delivered little in the way of real change. Speaking in Geneva before presenting the Germany-based group's findings to the United Nations, FIAN International said the South American country's government was as far away as ever from providing a lasting solution for the more than 50 million Brazilians living under the poverty line. Eradicating hunger was President Luiz Inacio Lula da Silva's main election promise when he swept into office two years ago. But his government now ""has very little time to prove that its Zero Hunger program does not turn into a zero solution to hunger,"" said Clovis Zimmermann, who headed the study. The aim of the much-heralded Zero Hunger program was to ensure that all Brazilians had enough to eat by redistributing land and providing monthly relief to the country's poorest families. ""If at the end of my mandate all Brazilians have the possibility to eat breakfast, lunch and dinner, I will have fulfilled the mission of my life,"" Silva said at his inauguration in January 2003.","{poor, human rights, hunger}",0.394969
293,APW_ENG_19990227.0166,"With BC-India-Economy Budget aims to help the poor. In a country where a third of the population lives in poverty, the 1999-2000 union budget presented Saturday unveiled modest measures to improve health, education and employment for the poor. ``This is an excellent, balanced budget, a pro-poor and pro-farmer budget. It will generate employment, reduce disparities and help the poor in meeting his or her daily requirements,'' said Prime Minister Atal Bihari Vajpayee. Finance Minister Yashwant Sinha proposed a program to create jobs by encouraging small-scale industry in 100 rural areas. Sinha said 1.8 million more schools would dot the countryside, where most Indians live, guaranteeing free education to poor children. The government would pay for the schools from existing education funds but would also encourage local communities to contribute in the effort. Local councils were also encouraged to collect funds from villagers to supplement federal funds for clinics.","{pro-poor, poor, free education}",0.313843
903,XIN_ENG_19950427.0232,"World Bank: 750 Million Go Hungry Every Day. The World Bank said that some 750 million people go hungry every day in today's world. The persistence of hunger in the world has posed one of the most daunting development challenges facing the global community today, the World Bank said in a strategy statement Wednesday. ""If we want to reduce hunger effectively, we have to reduce poverty,"" the Bank's President Lewis T. Preston said in a foreword to the report: ""The World Bank's Strategy for Reducing Poverty and Hunger."" Contrary to popular perception, not all hunger is caused by droughts, famines, and wars, the report said. The more subtle form of hunger, that is chronic, widespread and deep-rooted, is ""silent hunger"" caused by people lacking the capacity to produce food or the income to buy it. ""People-centered policies work best in combating hunger and poverty,"" said Ismail Serageldin, the Bank's vice-president, under whose direction the report was accomplished. ""Ample evidence exists to show that broad-based, sustainable economic growth coupled with improved access to education, health care and social services reduces poverty and goes a long way to fighting hunger,"" added Serageldin.","{famine, development, hunger}",0.300196
813,NYT_ENG_20021110.0186,"EDITORIAL: A BREAK FOR LOW-WAGE WORKERS. The New York Times said in an editorial for Monday, Nov. 11: The idea of compelling employers doing city business to pay workers a living wage _ sufficiently above the minimum wage to break the poverty line _ has taken hold in about 80 cities and communities around the nation. Now, New York has come up with its own modest version, which should be a first step toward helping some of the city's poorest workers. This is not a good time to be spending city money. But this is a worthy cause and we're glad the City Council has produced a prudent bill that should not cost the city more than $1 million the first year. The council's plan, passed last week with bipartisan support and sent to Mayor Michael Bloomberg, would raise the pay of 50,000 home health care attendants _ most of them black, Hispanic and immigrant women _ who help care for sick, elderly and disabled patients through home care agencies that contract with the city through Medicaid. Their pay would increase to $8.10 an hour with benefits (about 60 cents above the current average), or $9.60 an hour without benefits, rising to a minimum of $10 an hour by mid-2006. It's no coincidence that these workers are members of the powerful SEIU Local 1199, headed by Dennis Rivera, which led a coalition pressing for the bill. The measure reflects union muscle _ the American Federation of Teachers also won living-wage coverage for some of its child care workers. It's also politically pragmatic, seeking to cover workers in a way that is less likely to draw a mayoral veto. Bloomberg balked at any bill that would affect private firms that might move elsewhere, such as employers in commercial properties leased to the city or companies that receive city subsidies.","{minimum wage, poor, workers}",0.29865
1113,XIN_ENG_20000129.0124,"Trade Unions -- First Aid for Workers in Need. The provincial trade union of Liaoning, a heavy industry base in northeast China, has pledged to be the first source of aid for poverty-haunted workers. Liaoning will set up a network for these workers to ensure they receive all the benefits and social security they are entitled to. Trade unions at all level must guarantee that no poor workers or their families suffer, said Zhang Zhenxi, vice-chairman of the provincial trade union. Zhang said trade unions should spare no efforts to care for poor workers and provide their families with money, clothing, and food. He said that officials will aid 5,000 additional poor households this year in order to help them eventually escape poverty.","{poor, workers, social security}",0.291531
1236,XIN_ENG_20011015.0247,"National Workshop on 2nd Five-Year Social-Economic Development Plan\nHeld in Cambodia. Addressing the workshop, organized by the Ministry of Planning and the Asian Development Bank, Sok An, senior minister and cabinet minister, said the SEDPII must serve as an important guide to reduce poverty and improve living standard of millions of the people, particularly those who live in rural areas and often affected with natural disaster. The senior minister noted that the five-year plan will focus on the following priorities: long-term sustainable economic development at annual rate of 6 to 7 percent; effective and sustainable exploitation and management of natural and environment resources. To achieve the goal, firstly, the Royal government in the next five years would create a preferable environment for investment attraction by maintaining peace, stability, social order, human rights protection and democracy, as well as developing a general framework with transparency, accountability and predictability, Sok An said. Secondly, the government must make efforts to mobilize investment resources on priority areas, such as agriculture, tourism, human resource development, and bringing the role of women into play in development of agriculture and rural community, he added. Thirdly, institution building and good governance are the key foundation in the concept of sustainable development and social justice in order to strengthen rules of law, deepen public administrative reform, combat against corruption, the minister pointed out.","{natural disaster, development, human rights}",0.285366
1173,XIN_ENG_20001219.0309,"Full Text of White Paper on Population in China (2). 4. In line with the strategic goal of the nation's modernization drive and proceeding from national conditions, the Chinese Government has formulated and implemented a population policy which conforms to China's reality and has greatly contributed to the stabilization of the national and the world population and to the promotion of human development and progress. The Chinese Government is willing to continue its efforts together with the international community to effectively solve the problem of population and development. The Chinese Government firmly believes that China's population and development cause will develop further in the 21st century and that China will make still greater contribution to the civilization and progress of mankind! . Current Situation and Prospect 5. Since the 1970s, especially since the initiation of the reform and opening-up drive, China has formulated a basic state policy to promote family planning in an all-round way so as to slow down population growth and improve population quality in terms of health and education. The Government encourages late marriage and late childbearing, and advocates the practice of ""one couple, one child"" and of ""having a second child with proper spacing in accordance with the laws and regulations"". Family planning is also advocated among the ethnic minorities. Various provinces, autonomous regions and municipalities directly under the Central Government have formulated their own policies and regulations according to local conditions. 6. The Chinese Government pays great attention to the issue of population and development and has placed it on the agenda as an important part in the overall plan of China's national economic and social development. 
The Government consistently emphasizes that population growth should be compatible with socio-economic development and be in concert with resource utilization and environmental protection. Since the 1990s, the Central Government has convened a National Summit Meeting on the issue of population and development once a year for the sake of adopting important decisions and measures based upon discussion and analysis of the major problems. The Government organizes and coordinates the relevant departments and mass organizations to implement the population and family planning program, striving to integrate the family planning program with economic development, poverty eradication, protection of ecological environment, rational resource utilization, universal education, advancement of public health and social security, and improvement of women's status. This is aimed at seeking a thorough solution to the problem of population and development.","{development, social security, poverty eradication}",0.279855
1626,XIN_ENG_20080320.0286,"Full Text: China's economic, social development plan (8). II. Overall Requirements and Major Objectives for Economic and Social Development in 2008 In pursuing economic and social development in 2008, we must fully implement the guidelines adopted at the Seventeenth Party Congress, hold high the great banner of socialism with Chinese characteristics, take Deng Xiaoping Theory and the important thought of Three Represents as our guide and thoroughly apply the Scientific Outlook on Development. We will focus our efforts on transforming the pattern of economic development and improving the socialist market economy. We will continue to strengthen and improve macroeconomic regulation, work vigorously to advance reform and opening up and strengthen the country's capacity for independent innovation, and work hard to improve the economic structure and quality of economic growth. We must intensify efforts to save energy, reduce emissions and protect the environment. More attention will be paid to enhancing people's wellbeing and promoting social harmony as we strive for sound and fast economic development. In view of the above requirements and what is needed versus what is possible, and in conformity with the Outline of the Eleventh Five-Year Plan for National Economic and Social Development, we have set the following major objectives for economic and social development in 2008: - Raising the quality of economic growth. We need to further improve the economic structure, accelerate the development of tertiary industry, increase the contribution of high-tech industries to the economy and increase R&D expenditures as a percentage of GDP to 1.6%. Energy consumption per unit of GDP, sulfur dioxide emissions and chemical oxygen demand should fall by a larger margin than last year. Government revenue and corporate profits are expected to grow steadily. 
On the basis of structural improvement, improved efficiency, lower energy consumption and environmental protection measures, China's GDP is expected to grow by about 8% in 2008. A major reason for introducing the above targets for economic development is to communicate to society the government's regulatory intentions and guide all sectors to focus their work and attention on transforming the pattern of development, improving the quality of economic growth, saving energy and reducing emissions in order to achieve both sound and fast development. All local authorities should set targets for local GDP growth at an appropriate level in line with local conditions and avoid blind competition and setting targets at each administrative level that are too high. - Continuing to improve people's lives. Urban employment should expand by 10 million new jobs and the urban registered unemployment rate should stay at about 4.5%. The incomes of urban and rural residents should continue to increase fairly fast. Per capita net income of rural residents should increase by 6% or more in real terms. The new system of rural cooperative medical care should cover all rural areas and will receive more subsidies from the central and local government budgets. The number of rural people living in poverty should be reduced by 2 million or more. Fiscal guarantees to ensure adequate funding for compulsory education in rural areas will be strengthened. The program of free education for urban students taking compulsory education will be fully instituted. Natural population growth should stay within 0.7%. The major factors taken into consideration in setting the above goals are: national economic growth has been steady and fast for several years in a row, the financial strength of the country has further increased, the performance of enterprises has markedly improved and the impact of the vigorous employment policy is becoming increasingly noticeable. 
In addition, the implementation of policy measures to promote increase in people's incomes, especially that of farmers, and strengthen the social safety net has made it necessary and created the conditions for us to further expand employment, increase the incomes of urban and rural residents and address issues affecting people's wellbeing such as education and medical care. (More)","{development, protection measures, free education}",0.279491
301,APW_ENG_19990624.0855,"Report: Natural disasters, disease on rise worldwide By HENRY WASSWA. Epidemics and natural disasters are on the rise around the world, especially in underdeveloped countries where expanding urban slums pose health hazards, according to a Red Cross report presented Thursday. The 200-page 1999 World Disasters Report said global warming, deforestation and urban crowding and poverty in the developing world are escalating the spread of contagious and deadly diseases and have left populations vulnerable to natural disasters. ``The developing world will continue to be hardest hit by the escalating effects of human-driven climate change, environmental degradation and population pressures,'' said the report that was presented at the start of a two-day seminar in the Ugandan capital on preparing for and averting disasters. Red Cross regional spokesman John Sparrow said his agency had chosen to launch the report in Uganda because of the East African country's dedication to disaster-preparedness. Jane Francis Kuka, State Minister for Disaster Preparedness and Refugees, said she found the report disturbing.","{vulnerable, natural disaster, disasters}",0.262675
153,AFP_ENG_20061202.0007,"Disaster-prone Philippines suffering more and more: analysts\nby Karl Wilson. The Philippines is naturally prone to disasters but rising poverty and climate change are making the problem steadily worse, analysts said. Despite repeated disasters, many people are too poor to leave dangerous areas, they say. Some 30,000 people fled the Mayon volcano when it started rumbling in August only to return when the activity subsided. But this week, typhoon-triggered mudslides swept hundreds of these people to their deaths. Roger-Mark De Souza of the Washington-based Population Reference Bureau said the danger from natural disasters here has risen markedly in recent years. ""The risk to human life from natural disasters in the Philippines has increased dramatically over the past generation,"" he said in a recent report.","{natural disaster, poor, disasters}",0.261849
