In [2]:
import requests
from bs4 import BeautifulSoup
import re
import sklearn
import time
import numpy as np
import json
import pandas as pd
pd.set_option("max_colwidth", 500)

# import sys   
# sys.setrecursionlimit(25000)

from pandarallel import pandarallel


from newspaper import Article
def extract_text(url):
    try:
        time.sleep(2)
        a = Article(url)
        a.download()
        a.parse()
        text = a.text
    except:
        text = ''
    return text

def get_all_links(href,number):
    return [href.replace('0.html', '%s.html'%i) for i in range(0, number-number%10+1, 10)]

In [3]:
df = []

for url in ['https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,,,,0.html',\
            'https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,,,,0.html',\
            'https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,,,,0.html']:

    res=requests.get(url, timeout=20)
    soup = BeautifulSoup(res.content,'html.parser')

    ul = soup.find_all('ul', class_ = 'rwbullets')[0]
    country_list = ul.find_all('li')

    outputs = []
    for country_link in country_list:
        href = 'https://www.refworld.org'+ country_link.a['href']
        country = country_link.a.text
        number = country_link.text.split()[-1]
        number = int(number)
        output = {'country':country,'number':number,'base_href':href}
        outputs.append(output)

    df.append(pd.json_normalize(outputs))

df= pd.concat(df)
df

Unnamed: 0,country,number,base_href
0,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html"
1,Albania,24,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,ALB,,,0.html"
2,Algeria,28,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,DZA,,,0.html"
3,Angola,26,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AGO,,,0.html"
4,Antigua and Barbuda,1,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,ATG,,,0.html"
...,...,...,...
177,Viet Nam,24,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,VNM,,,0.html"
178,Western Sahara Territory,7,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ESH,,,0.html"
179,Yemen,17,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,0.html"
180,Zambia,3,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,,,0.html"


In [4]:
df['href']= df.apply(lambda x: get_all_links(x['base_href'], x['number']), axis=1)
df= df.explode('href').reset_index(drop=True)
df

Unnamed: 0,country,number,base_href,href
0,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html"
1,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,10.html"
2,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,20.html"
3,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,30.html"
4,Albania,24,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,ALB,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,ALB,,,0.html"
...,...,...,...,...
1754,Yemen,17,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,10.html"
1755,Zambia,3,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,,,0.html"
1756,Zimbabwe,26,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,,,0.html"
1757,Zimbabwe,26,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,,,10.html"


In [5]:
def extract_row(url):
    try:
        time.sleep(1)
        res=requests.get(url, timeout=20)
        soup = BeautifulSoup(res.content,'html.parser')
    except:
        print('cant connect',url)
        return   
    
    rows = soup.find_all('td', {'class':'html'})
    
    if len(rows):
        outputs = []
        for row in rows:
            try:
                title = row.a.text
                title = re.sub("^\s+|\s+$", "", title, flags=re.UNICODE)
                href = row.a['href']
                if href.startswith('/'):
                    href = 'https://www.refworld.org' + href

                p = soup.find_all('td', {'class':'html'})[0].p.text.split('|')
                date = re.sub("^\s+|\s+$", "", p[0], flags=re.UNICODE)
                Publisher = re.sub("^\s+|\s+$", "", p[1], flags=re.UNICODE)
                types = re.sub("^\s+|\s+$", "", p[2], flags=re.UNICODE) 

            except:
                print(url)

            output = {'title':title, 'date':date,'Publisher':Publisher,'types':types, 'href': href}
            outputs.append(output)
    else:
        outputs = None      
    return outputs

In [6]:
df1= df.copy()

In [7]:
pandarallel.initialize(nb_workers=64, progress_bar=False)
batch_size = 64*8
i=0
while i < df1.shape[0]:
    print(i, end=',')
    df1.loc[i:i+batch_size-1, 'rows']=\
    df1.loc[i:i+batch_size-1, 'href'].parallel_apply(extract_row).values
    i = i + batch_size 
df1.to_csv('refword_amnesty_rows.csv', index=False)

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
0,512,1024,1536,

In [8]:
df1[df1.rows.isnull()]

Unnamed: 0,country,number,base_href,href,rows
3,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,30.html",
79,Canada,20,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,CAN,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,CAN,,,20.html",
131,Egypt,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,EGY,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,EGY,,,30.html",
210,Iraq,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,IRQ,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,IRQ,,,30.html",
296,Mongolia,20,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,MNG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,MNG,,,20.html",
...,...,...,...,...,...
1744,Uzbekistan,40,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,UZB,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,UZB,,,20.html",
1746,Uzbekistan,40,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,UZB,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,UZB,,,40.html",
1749,Viet Nam,24,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,VNM,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,VNM,,,0.html",
1753,Yemen,17,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,0.html",


In [9]:
df1 = df1[~df1.rows.isnull()]
df1.shape

(1467, 5)

In [10]:
df1 = df1.explode('rows').reset_index(drop=True)
df1

Unnamed: 0,country,number,base_href,href,rows
0,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","{'title': 'Amnesty International Report 2017/18 - Afghanistan', 'date': '22 February 2018', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,5a99395da,0.html'}"
1,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","{'title': 'Amnesty International Report 2016/17 - Afghanistan', 'date': '22 February 2018', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,58b034294,0.html'}"
2,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","{'title': 'Amnesty International Report 2015/16 - Afghanistan', 'date': '22 February 2018', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,56d05b7cc,0.html'}"
3,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","{'title': 'Amnesty International Report 2014/15 - Afghanistan', 'date': '22 February 2018', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,54f07e2215,0.html'}"
4,Afghanistan,30,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,,,0.html","{'title': 'Death Sentences and Executions 2013 - Asia-Pacific', 'date': '22 February 2018', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,53bd2e278,0.html'}"
...,...,...,...,...,...
9750,Yemen,17,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,10.html","{'title': 'Ratification Without Implementation: The State of Human Rights in Yemen', 'date': '7 July 1999', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Country Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,3ae6a9e90,0.html'}"
9751,Yemen,17,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,,,10.html","{'title': 'Human Rights Concerns Following Recent Armed Conflict', 'date': '7 July 1999', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Country Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,3ae6a9ae0,0.html'}"
9752,Zambia,3,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,,,0.html","{'title': 'Applying the law fairly or fatally? Police violation of human rights in Zambia', 'date': '1 April 1999', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Country Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,3ae6a9c718,0.html'}"
9753,Zimbabwe,26,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,,,0.html","https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,,,0.html","{'title': 'Zimbabwe: Between a rock and a hard place - women human rights defenders at risk', 'date': '25 July 2007', 'Publisher': 'Publisher: Amnesty International', 'types': 'Document type: Country Reports', 'href': 'https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,46a75af62,0.html'}"


In [11]:
df1[df1.rows.isnull()]

Unnamed: 0,country,number,base_href,href,rows


In [12]:
df1 = pd.json_normalize(df1['rows']).reset_index(drop=True)
# df1[df1.rows.duplicated()]
df1

Unnamed: 0,title,date,Publisher,types,href
0,Amnesty International Report 2017/18 - Afghanistan,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,5a99395da,0.html"
1,Amnesty International Report 2016/17 - Afghanistan,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,58b034294,0.html"
2,Amnesty International Report 2015/16 - Afghanistan,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,56d05b7cc,0.html"
3,Amnesty International Report 2014/15 - Afghanistan,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,54f07e2215,0.html"
4,Death Sentences and Executions 2013 - Asia-Pacific,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,53bd2e278,0.html"
...,...,...,...,...,...
9750,Ratification Without Implementation: The State of Human Rights in Yemen,7 July 1999,Publisher: Amnesty International,Document type: Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,3ae6a9e90,0.html"
9751,Human Rights Concerns Following Recent Armed Conflict,7 July 1999,Publisher: Amnesty International,Document type: Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,3ae6a9ae0,0.html"
9752,Applying the law fairly or fatally? Police violation of human rights in Zambia,1 April 1999,Publisher: Amnesty International,Document type: Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,3ae6a9c718,0.html"
9753,Zimbabwe: Between a rock and a hard place - women human rights defenders at risk,25 July 2007,Publisher: Amnesty International,Document type: Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,46a75af62,0.html"


In [13]:
df1.Publisher.unique(),df1.types.unique(), df1[df1.href.duplicated()].shape

(array(['Publisher: Amnesty International'], dtype=object),
 array(['Document type: Annual Reports', 'Document type: Country News',
        'Document type: Country Reports'], dtype=object),
 (0, 5))

In [14]:
pandarallel.initialize(nb_workers=64, progress_bar=False)
batch_size = 64*8

i=resume=0
while i < df1.shape[0]:
    df1.loc[i:i+batch_size-1, 'text']=\
    df1.loc[i:i+batch_size-1, 'href'].parallel_apply(extract_text).values

    print(i, end=',')
    
    if (i - resume)%(batch_size * 10) ==0:
        df1.to_csv('refworld_amnesty_text.csv', index=False)
        
    i = i + batch_size
    
df1.to_csv('refworld_amnesty_text.csv', index=False)

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
0,512,1024,1536,2048,2560,3072,3584,4096,4608,5120,5632,6144,6656,7168,7680,8192,8704,9216,9728,

In [15]:
quit()

# clean

In [3]:
df = pd.read_csv('refworld_amnesty_text.csv', header=0)
df

Unnamed: 0,title,date,Publisher,types,href,text
0,Amnesty International Report 2017/18 - Afghanistan,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,5a99395da,0.html","Amnesty International Report 2017/18 - Afghanistan\n\nPublisher Amnesty International Publication Date 22 February 2018 Cite as Amnesty International, Amnesty International Report 2017/18 - Afghanistan, 22 February 2018, available at: https://www.refworld.org/docid/5a99395da.html [accessed 14 April 2021] Disclaimer This is not a UNHCR publication. UNHCR is not responsible for, nor does it necessarily endorse, its content. Any views expressed are solely those of the author or publisher and do..."
1,Amnesty International Report 2016/17 - Afghanistan,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,58b034294,0.html","Amnesty International Report 2016/17 - Afghanistan\n\nPublisher Amnesty International Publication Date 22 February 2017 Cite as Amnesty International, Amnesty International Report 2016/17 - Afghanistan, 22 February 2017, available at: https://www.refworld.org/docid/58b034294.html [accessed 14 April 2021] Disclaimer This is not a UNHCR publication. UNHCR is not responsible for, nor does it necessarily endorse, its content. Any views expressed are solely those of the author or publisher and do..."
2,Amnesty International Report 2015/16 - Afghanistan,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,56d05b7cc,0.html","Amnesty International Report 2015/16 - Afghanistan\n\nPublisher Amnesty International Publication Date 24 February 2016 Cite as Amnesty International, Amnesty International Report 2015/16 - Afghanistan, 24 February 2016, available at: https://www.refworld.org/docid/56d05b7cc.html [accessed 14 April 2021] Disclaimer This is not a UNHCR publication. UNHCR is not responsible for, nor does it necessarily endorse, its content. Any views expressed are solely those of the author or publisher and do..."
3,Amnesty International Report 2014/15 - Afghanistan,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,54f07e2215,0.html","Amnesty International Report 2014/15 - Afghanistan\n\nPublisher Amnesty International Publication Date 25 February 2015 Cite as Amnesty International, Amnesty International Report 2014/15 - Afghanistan, 25 February 2015, available at: https://www.refworld.org/docid/54f07e2215.html [accessed 14 April 2021] Disclaimer This is not a UNHCR publication. UNHCR is not responsible for, nor does it necessarily endorse, its content. Any views expressed are solely those of the author or publisher and d..."
4,Death Sentences and Executions 2013 - Asia-Pacific,22 February 2018,Publisher: Amnesty International,Document type: Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,53bd2e278,0.html","Death Sentences and Executions 2013 - Asia-Pacific\n\nPublisher Amnesty International Publication Date 27 March 2014 Cite as Amnesty International, Death Sentences and Executions 2013 - Asia-Pacific, 27 March 2014, available at: https://www.refworld.org/docid/53bd2e278.html [accessed 14 April 2021] Disclaimer This is not a UNHCR publication. UNHCR is not responsible for, nor does it necessarily endorse, its content. Any views expressed are solely those of the author or publisher and do not n..."
...,...,...,...,...,...,...
9750,Ratification Without Implementation: The State of Human Rights in Yemen,7 July 1999,Publisher: Amnesty International,Document type: Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,3ae6a9e90,0.html","Ratification Without Implementation: The State of Human Rights in Yemen\n\nPublisher Amnesty International Publication Date 1 March 1997 Citation / Document Symbol MDE/31/01/97 Reference Amnesty International is a worldwide voluntary movement that works to prevent some of the gravest violations by governments of people's fundamental human rights. The main focus of its campaigning is to: free all prisoners of conscience people detained an Cite as Amnesty International, Ratification Without Im..."
9751,Human Rights Concerns Following Recent Armed Conflict,7 July 1999,Publisher: Amnesty International,Document type: Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,3ae6a9ae0,0.html","Human Rights Concerns Following Recent Armed Conflict\n\nPublisher Amnesty International Publication Date 1 September 1994 Citation / Document Symbol MDE/31/06/94 Cite as Amnesty International, Human Rights Concerns Following Recent Armed Conflict, 1 September 1994, MDE/31/06/94, available at: https://www.refworld.org/docid/3ae6a9ae0.html [accessed 14 April 2021] Comments An Amnesty International delegation visited Yemen between 9 and 23 July 1994, to carry out research into allegations of h..."
9752,Applying the law fairly or fatally? Police violation of human rights in Zambia,1 April 1999,Publisher: Amnesty International,Document type: Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,3ae6a9c718,0.html","Applying the law fairly or fatally? Police violation of human rights in Zambia\n\nPublisher Amnesty International Publication Date 1 April 1999 Citation / Document Symbol AFR 63/01/99 Cite as Amnesty International, Applying the law fairly or fatally? Police violation of human rights in Zambia, 1 April 1999, AFR 63/01/99, available at: https://www.refworld.org/docid/3ae6a9c718.html [accessed 14 April 2021] Comments Amnesty International, together with human rights organizations in Zambia, hav..."
9753,Zimbabwe: Between a rock and a hard place - women human rights defenders at risk,25 July 2007,Publisher: Amnesty International,Document type: Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,46a75af62,0.html","Zimbabwe: Between a rock and a hard place - women human rights defenders at risk\n\nPublisher Amnesty International Publication Date 25 July 2007 Citation / Document Symbol AFR 46/017/2007 Reference http://web.amnesty.org/library/Index/ENGAFR460172007 Cite as Amnesty International, Zimbabwe: Between a rock and a hard place - women human rights defenders at risk, 25 July 2007, AFR 46/017/2007, available at: https://www.refworld.org/docid/46a75af62.html [accessed 14 April 2021] Disclaimer This..."


In [4]:
disclaimer = 'Any views expressed are solely those of the author or publisher and do not necessarily reflect those of UNHCR, the United Nations or its Member States.'

In [10]:
df.Publisher = df.Publisher.apply(lambda x: x.replace('Publisher:',''))
df.types = df.types.apply(lambda x: x.replace('Document type:',''))
df = df[~df.text.isnull()]
df = df[~df.text.str.contains('Search Refworld\n\nand / or country ')]
# df = df[df.text.str.contains(disclaimer)]
df = df.reset_index(drop=True)
df

Unnamed: 0,title,date,Publisher,types,href,text
0,Amnesty International Report 2017/18 - Afghanistan,22 February 2018,Amnesty International,Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,5a99395da,0.html","Islamic Republic of Afghanistan\n\nHead of state and government: Muhammad Ashraf Ghani\n\nThe civilian population suffered widespread human rights abuses as a result of the continuing conflict. Conflict-related violence led to deaths, injuries and displacement. Civilian casualties continued to be high; the majority were killed or injured by armed insurgent groups, but a significant minority by pro-government forces. The number of people internally displaced by conflict rose to more than 2 mi..."
1,Amnesty International Report 2016/17 - Afghanistan,22 February 2018,Amnesty International,Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,58b034294,0.html","Islamic Republic of Afghanistan\n\nHead of state and government: Mohammad Ashraf Ghani\n\nThe intensifying conflict resulted in widespread human rights violations and abuses. Thousands of civilians were killed, injured or displaced in the violence, while ongoing insecurity restricted access to education, health and other services. While armed insurgent groups were responsible for the majority of civilian casualties, pro-government forces also killed and injured civilians. Anti- and pro-gover..."
2,Amnesty International Report 2015/16 - Afghanistan,22 February 2018,Amnesty International,Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,56d05b7cc,0.html","Islamic Republic of Afghanistan\n\nHead of state and government: Muhammad Ashraf Ghani Ahmadzai\n\nThere was growing insecurity with insurgency and criminal activity worsening across the country. The first three months of 2015 were the most violent of any equivalent period on record. The UN Assistance Mission in Afghanistan (UNAMA) recorded 1,592 civilians killed and 3,329 injured in the first six months of 2015, while 70% of civilian casualties were attributed to Taliban and other armed ins..."
3,Amnesty International Report 2014/15 - Afghanistan,22 February 2018,Amnesty International,Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,54f07e2215,0.html","Islamic Republic of Afghanistan\n\nHead of state and government: Muhammad Ashraf Ghani Ahmadzai (replaced Hamid Karzai in September)\n\nThere was growing insecurity throughout the country in expectation of the planned withdrawal of 86,000 foreign troops in December, as the mandate of NATO's International Security Assistance Force (ISAF) ended. The USA committed its troops to remain engaged in combat until the end of 2015. The UN Assistance Mission in Afghanistan (UNAMA) reported that casualt..."
4,Death Sentences and Executions 2013 - Asia-Pacific,22 February 2018,Amnesty International,Annual Reports,"https://www.refworld.org/publisher,AMNESTY,ANNUALREPORT,AFG,53bd2e278,0.html","While some setbacks were recorded in the Asia-Pacific region last year, positive steps in a number of countries showed that, even among traditional supporters of the death penalty, progress towards abolition is tangible.\n\nTen countries were known to have carried out executions, two more than in 2012. China once again executed more people than the rest of the world put together, but it was not possible to obtain an accurate picture of the reality of capital punishment there. Amnesty Interna..."
...,...,...,...,...,...,...
8813,Ratification Without Implementation: The State of Human Rights in Yemen,7 July 1999,Amnesty International,Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,3ae6a9e90,0.html","I. INTRODUCTION\n\nIn a letter dated 19 October 1996 addressed to the Secretary General of Amnesty International, Dr Abdul Karim al-Eryani, Yemen's Deputy Prime Minister and Minister of Foreign Affairs wrote:\n\n""I have received a press release issued by Amnesty International following the recent visit to Yemen. I was surprised at the harsh criticism of my country's human rights record mentioned in this document....While the Yemeni Government is trying to correct any mistakes in this field t..."
8814,Human Rights Concerns Following Recent Armed Conflict,7 July 1999,Amnesty International,Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,YEM,3ae6a9ae0,0.html","I. Introduction\n\nAn Amnesty International delegation visited Yemen between 9 and 23 July 1994, to carry out research into allegations of human rights abuses, which were reported to have been committed since 4 May. The delegates interviewed over 60 prisoners of conscience and political detainees in Political Security detention centres in Sana'a and Ta'iz. They met with the families of some detainees, members of the Bar Association, human rights activists, members of Parliament and leading s..."
8815,Applying the law fairly or fatally? Police violation of human rights in Zambia,1 April 1999,Amnesty International,Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZMB,3ae6a9c718,0.html","ZAMBIA: Applying the law fairly or fatally? Police violation of human rights in Zambia\n\n1. Introduction\n\nThere is political meddling in the way Zambia's 13,000 to 14,000 police officers carry out their duties. Paramilitary police use teargas and batons to break up peaceful public meetings and marches by opposition political parties and non-governmental organizations. Journalists are arrested for carrying out their legitimate work of reporting the news to the nation. Politicians are detai..."
8816,Zimbabwe: Between a rock and a hard place - women human rights defenders at risk,25 July 2007,Amnesty International,Country Reports,"https://www.refworld.org/publisher,AMNESTY,COUNTRYREP,ZWE,46a75af62,0.html","Introduction\n\nThe human rights situation in Zimbabwe has been deteriorating rapidly since 2000. Human rights violations are taking place in a context characterised by a fast-shrinking economy that is being accelerated by government policies. Those policies, particularly on land reform and forced evictions, have contributed significantly to reducing the entire population's capacity to obtain access to their rights to food,1 health, education and housing.\n\nZimbabwean women, who are active ..."


In [7]:
pandarallel.initialize()

# df.text = df.text.parallel_apply(lambda x: x.split(disclaimer)[1])
df.loc[df.text.str.contains(disclaimer), 'text']=\
    df.loc[df.text.str.contains(disclaimer), 'text'].parallel_apply(lambda x: x.split(disclaimer)[1])

#urls
df.text = df.text.apply(lambda x: re.sub(r"http\S+", "", x))
# email
df.text = df.text.apply(lambda x: re.sub("\S+@\S+(?:\.\S+)+",'',x))
#tel
df.text = df.text.apply(lambda x: re.sub('\(\+( |-|\d)+\)( |-|\d)+',' ',x))
df.text = df.text.apply(lambda x: re.sub('\+( |-|\d)+',' ',x))


df.text = df.text.apply(lambda x: re.sub("^\s+|\s+$", "", x, flags=re.UNICODE))
df.text = df.text.apply(lambda x: re.sub('\n\n+', "\n\n", x, flags=re.UNICODE))

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [12]:
df = df[df.text.str.len()>200]

Unnamed: 0,title,date,Publisher,types,href,text
3093,Afghan citizens beaten in detention at Ukraine airport,13 September 2012,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,AFG,4d82fa241e,0.html",
3388,Bahrain witnesses describe bloody crackdown,17 March 2011,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,BHR,4d82fa201a,0.html",
3389,Bahraini protesters tell of bloodshed as crackdown escalates,17 March 2011,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,BHR,4d82fa25c,0.html",
3390,Violent crackdown in Bahrain condemned,17 March 2011,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,BHR,4d82fa261e,0.html",
3483,Belarus urged to release election protesters after student convicted,30 June 2011,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,BLR,4d95726dc,0.html",
3795,China: Further information: Uighur academic sentenced to life: Ilham Tohti,20 November 2014,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,CHN,54254d4e4,0.html",
4418,El Salvador: Critical opportunity to put an end to total criminalization of abortion,23 April 2018,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,SLV,591303764,0.html",
4692,Guatemala: Submission to the UN Human Rights Commitee for the 104th Session of the Human Rights Committee (12-30 March 2012),29 March 2012,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,GTM,4f48e20e2,0.html",
4947,'They are throwing us on the street like dogs' - Europe abandons the Roma in Italy,18 April 2017,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,IRN,58eb3bc74,0.html",
5066,Deaths in Iranian prison must be investigated,27 April 2011,Amnesty International,Country News,"https://www.refworld.org/publisher,AMNESTY,COUNTRYNEWS,IRN,4d82fa23c,0.html",


In [5]:
df= df.rename(columns={'href': 'url'})
df.to_csv('refworld_amnesty_text.csv', index=False)