# Preparation

In [1]:
import pandas as pd
import os, random, glob, json
from sentence_transformers import SentenceTransformer
import nltk
import numpy as np
from sklearn.svm import LinearSVC

# Get dataframe of related and unrelated texts

In [2]:
df_list=[]

for file in glob.glob('filteredpickles/poverty_pickles/*/*'):
    jsonners = json.load(open(file, 'rb'))
    columns=['text']
    df_list.append(pd.DataFrame.from_dict(jsonners, orient='index', columns= columns))

In [3]:
big_df = pd.concat(df_list)

# Jan's Keyword Lookup

In [4]:
def keyword_search(keywords, big_df, treshold = 0):
    articles = []
    article_keywords = []
    #do a keyword search for the company
    for text in big_df['text']:
        keywords_found = set()
        for keyword in keywords:
            if keyword in text:
                keywords_found.add(keyword)
        if len(keywords_found) >= treshold:
            articles.append(text)
            article_keywords.append(keywords_found)
    return articles, article_keywords

In [5]:
topic = 'poverty'

In [6]:
with open(f'keywords/{topic}.txt', 'r', encoding = 'utf-8') as infile:
    keywords = infile.read().splitlines()

#fine-grained search, kwarg treshold indicates the number of keywords that should be present in the text
articles, keywords_per_article = keyword_search(keywords, big_df, treshold = 4)

print('Number of articles found:', len(articles))

Number of articles found: 435


In [7]:
filtered_df = pd.DataFrame(articles)
# df

In [8]:
filtered_df.columns=['text']

### add labels column

In [23]:
filtered_df['label'] = 'related'

## Random Sampling

In [9]:
#Tip: to save memory write the line to a csv file instead of yield

def random_list (filename):
    with open (filename, 'r') as infile:
        print(filename)
        file = pd.read_json(infile, orient = 'index')
    random_index = random.randint(0,file.shape[0])
    if keyword not in file.iloc[random_index-1].text:
        #if memory is an issue, uncomment these 3 line:
#         text_only = file.iloc[random_index-1]['text'].rstrip('\n')+'\n'
#         with open ('balancing_data.csv', 'a') as outfile:
#             outfile.write(text_only)
        # and comment out this section
        yield file.iloc[random_index-1]
        
    else:
        pass

In [10]:
num = filtered_df.shape[0]
keyword = 'poverty'
syphon = []
while len(syphon) < num:
    filename = f'pickles/jsons2/{random.choice(os.listdir("pickles/jsons2"))}'
    syphon.extend(random_list(filename))
    

pickles/jsons2/pickle417.json
pickles/jsons2/pickle44.json
pickles/jsons2/pickle312.json
pickles/jsons2/pickle315.json
pickles/jsons2/pickle181.json
pickles/jsons2/pickle290.json
pickles/jsons2/pickle472.json
pickles/jsons2/pickle17.json
pickles/jsons2/pickle267.json
pickles/jsons2/pickle466.json
pickles/jsons2/pickle451.json
pickles/jsons2/pickle491.json
pickles/jsons2/pickle427.json
pickles/jsons2/pickle397.json
pickles/jsons2/pickle12.json
pickles/jsons2/pickle152.json
pickles/jsons2/pickle243.json
pickles/jsons2/pickle29.json
pickles/jsons2/pickle106.json
pickles/jsons2/pickle160.json
pickles/jsons2/pickle250.json
pickles/jsons2/pickle364.json
pickles/jsons2/pickle224.json
pickles/jsons2/pickle90.json
pickles/jsons2/pickle361.json
pickles/jsons2/pickle5.json
pickles/jsons2/pickle156.json
pickles/jsons2/pickle24.json
pickles/jsons2/pickle396.json
pickles/jsons2/pickle66.json
pickles/jsons2/pickle380.json
pickles/jsons2/pickle307.json
pickles/jsons2/pickle318.json
pickles/jsons2/pick

KeyboardInterrupt: 

In [26]:
list_for_balancing = []

for gen_list in syphon:
    list_for_balancing.append(gen_list)

In [30]:
balance_df = pd.DataFrame(list_for_balancing)

In [34]:
pd.set_option('display.max_colwidth', -1)
balance_df['text']

NYT_ENG_19990801.0137    COMMENTARY: VOODOO REDUX. Here we go again. Back in 1980, Ronald Reagan assured one and all that he could cut taxes sharply, increase defense spending substantially and balance the federal budget. If it had been a television commercial instead of real life we would have seen a close-up of Reagan happily saying, ``I can do that,'' followed by a shot of him, perplexed, asking, ``How am I gonna do that?'' George Herbert Walker Bush famously derided Reagan's supply-side fantasies as ``voodoo economics.'' And the veteran Washington Post reporter Lou Cannon, in his book ``President Reagan: The Role of a Lifetime,'' described the reaction of James Baker, Reagan's own chief-of-staff, to the transformation of economic fantasy into national policy. He wrote: ``Though not particularly well-versed in economics, Baker suspected that there was something screwy about the idea that massive tax cuts would increase government revenues. Later, he would privately express regrets t

In [35]:
balance_df.drop('date', inplace=True, axis = 1)

### add label

In [36]:
balance_df['label']= 'unrelated'
balance_df

Unnamed: 0,text,label
NYT_ENG_19990801.0137,"COMMENTARY: VOODOO REDUX. Here we go again. Back in 1980, Ronald Reagan assured one and all that he could cut taxes sharply, increase defense spending substantially and balance the federal budget. If it had been a television commercial instead of real life we would have seen a close-up of Reagan happily saying, ``I can do that,'' followed by a shot of him, perplexed, asking, ``How am I gonna do that?'' George Herbert Walker Bush famously derided Reagan's supply-side fantasies as ``voodoo economics.'' And the veteran Washington Post reporter Lou Cannon, in his book ``President Reagan: The Role of a Lifetime,'' described the reaction of James Baker, Reagan's own chief-of-staff, to the transformation of economic fantasy into national policy. He wrote: ``Though not particularly well-versed in economics, Baker suspected that there was something screwy about the idea that massive tax cuts would increase government revenues. Later, he would privately express regrets that the deficits had `gotten away' from the administration and wished he had paid more attention to the consequences of the tax cuts.''",unrelated
NYT_ENG_19980119.0621,"ISLAND LIVING AT BARGAIN PRICES. Show me a better deal than a 600-square-foot cottage next to San Francisco Bay for $400, including a three-car garage, garden, private security and quick access to the Financial District, and I'll pay the rent for a month. Four hundred a month is what Wendy Linka, the $65,858-a-year director of marketing for Treasure Island, pays for the caretaker's cottage behind Admiral Nimitz's old mansion on Yerba Buena Island. Now that the Navy is gone, the mayor's office is the landlord. The rent, figured by the city of San Francisco at $1 per square foot, is $600. Linka pays $200 less because of duties as the mansion's caretaker. In October I wrote that Linka lived in the cottage rent-free. Sorry, I was a few bucks and a few days off. Documents show that the city began deducting the $400 from her paycheck on Oct. 1, 1997, with additional funds deducted for rent back to May 1, when her lease began.",unrelated
APW_ENG_20061117.1112,"One child abused every hour in Zimbabwe, child protection groups report. New data shows a child is abused every hour in Zimbabwe and more than half the reported cases involve sexual abuse, a coalition of child protection groups said Friday. ""Are Zimbabweans really horrified by these statistics?"" said Childline director Audrey Gumbo. ""Are we really being jolted into action? Because this is what is needed -- action,"" including the strict enforcement of existing child protection laws. The Child Protection Working Group said sexual abuse was worsened by beliefs that sexually transmitted diseases, including HIV/AIDS, could be cured by having sex with a virgin. ""This is the most repulsive of myths,"" said Betty Makoni, director of Girl Child Network, a member of the Child Protection Working Group. Data released ahead of the World Day for the Prevention of Child Abuse on Sunday showed 8,600 cases of abuse -- or 24 a day, or one an hour -- were reported last year across this country of nearly 12 million people. Other cases went unreported. There was no indication trends had changed.",unrelated
AFP_ENG_19970131.0060,"South Korean authorities arrest Hanbo chief. Chung Tae-Soo, head of South Korea's debt-stricken Hanbo Group, was arrested Friday on charges of issuing unpaid promissory notes, a television report said. Following overnight questioning at the Prosecutor General's Office, Chung was charged with fraud for allegedly issuing promissory notes worth millions of dollars while fully aware they could not be honoured. The Hanbo chief was also alleged to have embezzled some 50 million dollars in illegal loans from a mutual trust fund of which he was the largest shareholder. The Hanbo Group's four major subsidiaries -- Hanbo Energy, Sang-A Pharmaceutical Co., Hanbo Steel and General Construction Co. and Hanbo Corp. -- have sought bankruptcy protection. The Hanbo debt troubles have triggered allegations that the group used high-level connections to obtain loans without sufficient collateral.",unrelated
APW_ENG_20070804.0559,"Finnish Rally Results. Results Saturday from the Finnish Rally world championship event after the second leg, 20 of 23 special stages, 319.6 kilometers: 1. Marcus Gronholm, Finland, Ford, 2 hours, 36 minutes, 45.2 seconds. 2. Mikko Hirvonen, Finland, Ford, 20.4 seconds behind. 3. Sebastien Loeb, France, Citroen, 1:00.5. 4. Chris Atkinson, Australia, Subaru, 2:41.3.",unrelated
...,...,...
APW_ENG_20100920.0398,"Fashion greats remember McQueen in London service. Fashionistas celebrated the life and legacy of Alexander McQueen in a solemn ceremony at St. Paul's Cathedral on Monday, seven months after his suicide shocked the designing world. The service for McQueen briefly brought London Fashion Week to a halt as top designers, models and editors came to pay their respects to the enfant terrible of British fashion, who was 40 when he died in early February. He had a history of depression and was said to be devastated by the recent loss of his mother. His suicide has deprived the British fashion world of its biggest and most controversial star. The service was attended by many of the models and actresses who loved McQueen's work, including Naomi Campbell, Kate Moss, Sarah Jessica Parker, Jade Parfitt and others, including designer Stella McCartney. Anna Wintour, editor-in-chief of the American edition of Vogue magazine, was the first of McQueen's admirers to speak. Wearing an elegant black-and-gold outfit, she praised his ""exceptional legacy of brilliant inspiration.""",unrelated
AFP_ENG_19941115.0022,"Tamils urge reciprocal gesture to Tiger truce. Tamil politicians urged the government Tuesday to formally respond to a unilateral ceasefire declared over the weekend by separatist Tamil Tiger guerrillas. The Liberation Tigers of Tamil Eelam (LTTE) called the one-week truce to mark Saturday's inauguration of new President Chandrika Kumaratunga, who had initiated talks with the rebels to end ethnic bloodletting. ""The LTTE by its ceasefire has demonstrated that it wants the peace process to continue. What is going to be the government's response?"" Tamil legislator Dharmalingam Sidhathan said. Sidhathan leads the anti-LTTE Democratic People's Liberation Front (DPLF). President Kumaratunga suspended talks with Tigers after the rebels were implicated in killing 57 people at an opposition rally last month, including opposition leader, Gamini Dissanayake.",unrelated
NYT_ENG_19990517.0460,"GENERAL DYNAMICS TO BUY GULFSTREAM, MAKER OF JET PLANES. The General Dynamics Corp., a military contractor that specializes in making nuclear submarines, surface ships and tanks, announced Monday that it would acquire the Gulfstream Aerospace Corp., a maker of corporate jets, for $5.3 billion in a deal that returns General Dynamics to a business it once shed. The Gulfstream acquisition, a 1-for-1 stock swap that would give General Dynamics shares to Gulfstream shareholders, would add a new and potentially cyclical business to General Dynamics' product line, which is heavily laden with armaments that are steady cash generators. It would also move General Dynamics back to the aerospace industry, a business it stepped away from in the early 1990s when it sold Cessna Aircraft, a maker of business jets, to Textron and when it sold its F-16 fighter jet operations to Lockheed Martin as well as its missile division to Hughes Electronics. The deal for Gulfstream elicited mix reaction on Wall Street. General Dynamics tumbled nearly 10 percent, or $6.1875, to $65.25, after having traded as low as $63.25 by midday. Still, while many on Wall Street were skeptical about the deal, others said Gulfstream would add a strong cash generator to the company, even if it took General Dynamics in a new direction. ``The deal makes a lot of sense from a financial perspective,'' said Pierre Chao, an analyst with Morgan Stanley Dean Witter. ``General Dynamics is adding a company with a solid background and great cash flow, which will allow them to fuel further defense acquisitions.'' He said the market was forgetting that General Dynamics used to be in the aerospace business.",unrelated
AFP_ENG_20090729.0244,"Croatia, Slovenia PMs to meet over border row. Croatia said Wednesday that its new prime minister will meet this week with her counterpart from Slovenia, which is blocking Zagreb's EU accession over a long- running border row. Jadranka Kosor, who became Croatia's first woman premier earlier this month, was to hold talks Friday with Slovenia's Borut Pahor in Trakoscan, some 70 kilometres (43 miles) north of Zagreb, said a government statement. ""With this meeting, a bilateral dialogue on the level of the premiers which started in February will continue,"" it said referring to a meeting in Slovenia between Pahor and Kosor's predecessor Ivo Sanader. Slovenia has been blocking Croatia's talks to join the European Union since December over a border dispute dating back to 1991, when both countries proclaimed independence from the former Yugoslavia. Kosor said Wednesday she was ""very optimisic"" of resolving the issue, which concerns a small slice of land and sea crucial for Slovenia's access to the Adriatic Sea and international shipping waters.",unrelated


## concat related and unrelated

In [37]:
balanced_labeled_df = pd.concat([filtered_df, balance_df])

In [38]:
# the id column is not an issue since they are not used for representation
balanced_labeled_df

Unnamed: 0,text,label
0,"(picture) Clinton announces housing initiative. President Bill Clinton announced Saturday that he was asking a task force for ideas on how to expand home ownership to include more poor and minority buyers. In an address to the National Association of Realtors, the president said that home ownership had been sliding since 1980. ""We have to turn this around ... and I am convinced that we can do it,"" he said, saying his goal was to bring home ownership to ""an all time high in the US before the century is over."" To accomplish this, Clinton said he had asked Henry Cisneros, the secretary of housing and urban development, to create a task force that drew on such diverse groups as real estate agents, mortgage bankers, urban development experts and anti-poverty workers. They would draw up a strategy and report to the White House within six months, Clinton said.",related
1,"Sanctions a last resort against exploitation of Asia's children. Human rights, including those of children and workers, should be defended by trade and aid sanctions only as a last resort and never applied unilaterally, a child rights' activist said here on Thursday. There is a role for sanctions, said Vitit Muntarbhorn, executive director of Child Rights ASIANET and former United Nations special rapporteur on the sale of children, citing their contributions to ending apartheid in South Africa. ""There is one country in the region for which sanctions are appropriate now. It is Burma. The junta is violating civil, economic, political, social and human rights, and using forced porterage and child labor near the border,"" Vitit said. In most cases, however, exploitation of child labor must be considered against a country's history of human rights, and its efforts to re- educate former child laborers, he said at the fourth annual Child Workers in Asia (CWA) Regional Seminar on Child Labor here. ""The best interests of the child must come first. In some instances the threat of trade sanctions has led only to children losing their income or being pushed into the informal sectors (of employment) where they are even more vulnerable,"" he added.",related
2,"China turns tables, details ""terrible plight"" of US children. China turned the tables Thursday on US critics of its treatment of orphans, issuing a lengthy article detailing the ""terrible plight"" of numerous children facing poverty and violence in the United States. The report, signed by Ren Yenshi and to be published in the party mouthpiece People's Daily Friday, was released Thursday by Xinhua, accusing US human rights organisations of resorting to the ""mean trick"" of fabricating allegations about other countries while ignoring problems at home. ""It is in the world's only superpower that the social tragedies of ill-treating and cruelly injuring children, as well as of hurting them physically and psychologically in various ways takes place every day,"" the article said. ""The US has a very poor record on social protection of children,"" Xinhua said, mimicking international human rights' organisations frequent appeals to China in urging the US government to ""take measures to improve its record."" The publication of the article comes some six weeks after New York- based Human Rights Watch/Asia released a 350-page report accusing workers in China's state orphanages of routinely neglecting children and leaving many to starve to death in a policy that received tacit state sanction.",related
3,"China turns tables, details ""terrible plight"" of US children. China turned the tables Thursday on US critics of its treatment of orphans, issuing a lengthy article detailing the ""terrible plight"" of numerous children facing poverty and violence in the United States. The report, signed by Ren Yenshi and to be published on Friday in the party mouthpiece the People's Daily, was released Thursday by Xinhua, accusing US human rights organisations of resorting the ""mean trick"" of fabricating allegations about other countries while ignoring problems at home. ""It is in the world's only superpower that the social tragedies of ill-treating and cruelly injuring children, as well as of hurting them physically and psychologically in various ways takes place every day,"" the article said. ""The US has a very poor record on social protection of children,"" Xinhua said, mimicking international human rights' organisations frequent appeals to China in urging the US government to ""take measures to improve its record."" The publication of the article comes some six weeks after New York- based Human Rights Watch/Asia released a 350-page report accusing workers in China's state orphanages of routinely neglecting children and leaving many to starve to death in a policy that received tacit state sanction.",related
4,"Aid agency targets women in Afghanistan's poorest society by Marc Lavine. Some of Afghanistan's poorest women are for the first time being given a chance to control their family lives by aid agency workers in this traditionally male-dominated society. The British group Oxfam has defied Afghan custom, which keeps females firmly in the background of family management, by launching a distribution drive which targets the women of Kabul's poorest families. Women representing about 8,000 families in two of the city's most war- devastated districts have been issued blankets, children's clothes and heavy plastic sheeting, to be used for insulation or covering doors and windows left gaping by factional fighting. For most of the women who turned up to receive the aid, wearing the traditional burkas -- Islamic head-to-toe body covers -- it was the first time they had acquired property, and domestic rights, since their marriages. ""This makes us feel that our value is at last being appreciated and that we actually matter and are a crucial part of the family unit,"" Soraya, a 25-year-old widow and mother of six told AFP.",related
...,...,...
APW_ENG_20100920.0398,"Fashion greats remember McQueen in London service. Fashionistas celebrated the life and legacy of Alexander McQueen in a solemn ceremony at St. Paul's Cathedral on Monday, seven months after his suicide shocked the designing world. The service for McQueen briefly brought London Fashion Week to a halt as top designers, models and editors came to pay their respects to the enfant terrible of British fashion, who was 40 when he died in early February. He had a history of depression and was said to be devastated by the recent loss of his mother. His suicide has deprived the British fashion world of its biggest and most controversial star. The service was attended by many of the models and actresses who loved McQueen's work, including Naomi Campbell, Kate Moss, Sarah Jessica Parker, Jade Parfitt and others, including designer Stella McCartney. Anna Wintour, editor-in-chief of the American edition of Vogue magazine, was the first of McQueen's admirers to speak. Wearing an elegant black-and-gold outfit, she praised his ""exceptional legacy of brilliant inspiration.""",unrelated
AFP_ENG_19941115.0022,"Tamils urge reciprocal gesture to Tiger truce. Tamil politicians urged the government Tuesday to formally respond to a unilateral ceasefire declared over the weekend by separatist Tamil Tiger guerrillas. The Liberation Tigers of Tamil Eelam (LTTE) called the one-week truce to mark Saturday's inauguration of new President Chandrika Kumaratunga, who had initiated talks with the rebels to end ethnic bloodletting. ""The LTTE by its ceasefire has demonstrated that it wants the peace process to continue. What is going to be the government's response?"" Tamil legislator Dharmalingam Sidhathan said. Sidhathan leads the anti-LTTE Democratic People's Liberation Front (DPLF). President Kumaratunga suspended talks with Tigers after the rebels were implicated in killing 57 people at an opposition rally last month, including opposition leader, Gamini Dissanayake.",unrelated
NYT_ENG_19990517.0460,"GENERAL DYNAMICS TO BUY GULFSTREAM, MAKER OF JET PLANES. The General Dynamics Corp., a military contractor that specializes in making nuclear submarines, surface ships and tanks, announced Monday that it would acquire the Gulfstream Aerospace Corp., a maker of corporate jets, for $5.3 billion in a deal that returns General Dynamics to a business it once shed. The Gulfstream acquisition, a 1-for-1 stock swap that would give General Dynamics shares to Gulfstream shareholders, would add a new and potentially cyclical business to General Dynamics' product line, which is heavily laden with armaments that are steady cash generators. It would also move General Dynamics back to the aerospace industry, a business it stepped away from in the early 1990s when it sold Cessna Aircraft, a maker of business jets, to Textron and when it sold its F-16 fighter jet operations to Lockheed Martin as well as its missile division to Hughes Electronics. The deal for Gulfstream elicited mix reaction on Wall Street. General Dynamics tumbled nearly 10 percent, or $6.1875, to $65.25, after having traded as low as $63.25 by midday. Still, while many on Wall Street were skeptical about the deal, others said Gulfstream would add a strong cash generator to the company, even if it took General Dynamics in a new direction. ``The deal makes a lot of sense from a financial perspective,'' said Pierre Chao, an analyst with Morgan Stanley Dean Witter. ``General Dynamics is adding a company with a solid background and great cash flow, which will allow them to fuel further defense acquisitions.'' He said the market was forgetting that General Dynamics used to be in the aerospace business.",unrelated
AFP_ENG_20090729.0244,"Croatia, Slovenia PMs to meet over border row. Croatia said Wednesday that its new prime minister will meet this week with her counterpart from Slovenia, which is blocking Zagreb's EU accession over a long- running border row. Jadranka Kosor, who became Croatia's first woman premier earlier this month, was to hold talks Friday with Slovenia's Borut Pahor in Trakoscan, some 70 kilometres (43 miles) north of Zagreb, said a government statement. ""With this meeting, a bilateral dialogue on the level of the premiers which started in February will continue,"" it said referring to a meeting in Slovenia between Pahor and Kosor's predecessor Ivo Sanader. Slovenia has been blocking Croatia's talks to join the European Union since December over a border dispute dating back to 1991, when both countries proclaimed independence from the former Yugoslavia. Kosor said Wednesday she was ""very optimisic"" of resolving the issue, which concerns a small slice of land and sea crucial for Slovenia's access to the Adriatic Sea and international shipping waters.",unrelated


# add Bert Transformers

In [39]:
# bert-large-nli-mean-tokens is also usable with small improvements
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [40]:
texts = balanced_labeled_df.text.to_list()

In [41]:
embedded_texts = []
for text in texts:
    sentences = nltk.sent_tokenize(text)
    if len(sentences) >=6:
        sentence_embeddings = model.encode(sentences[:6])
    else:
        for n in range (6-len(sentences)):
            sentences.append(sentences[0])
    embedded_texts.append(sentence_embeddings)
    

## concatenated and mean embeddings

In [42]:
# use concatenated list or mean list in the train_test_split function below

concatenated_list = []
mean_list=[]

for six_embeds in embedded_texts:
    new = np.concatenate(six_embeds)
    concatenated_list.append(new)
    mean_embeds_per_text = np.mean(six_embeds, axis = 0)
    mean_list.append(mean_embeds_per_text)
    
    

# SVM classifier

In [43]:
training_labels = list(balanced_labeled_df.label)

# Test

In [47]:
from sklearn.model_selection import train_test_split
# mean_list can be exchanged for concatenated_list
X_train, X_test, y_train, y_test = train_test_split(mean_list, training_labels, test_size=0.33, random_state=42)

In [48]:
# change this to a cross-fold validation for more robust metrics

svm_var = LinearSVC(random_state=0, tol=1e-5, max_iter = 5000)
svm_var.fit(X_train, y_train)
predictions = list(svm_var.predict(X_test))

In [None]:
predictions

In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

     related       0.92      0.87      0.90       151
   unrelated       0.87      0.92      0.89       135

    accuracy                           0.90       286
   macro avg       0.90      0.90      0.90       286
weighted avg       0.90      0.90      0.90       286

