In [1]:
import requests
from bs4 import BeautifulSoup
import re
import sklearn
import time
import numpy as np
import json
import pandas as pd
pd.set_option("max_colwidth", 500)

# import sys   
# sys.setrecursionlimit(25000)

from pandarallel import pandarallel

In [2]:
def extract_row(url):
    try:
        time.sleep(1)
        res=requests.get(url)
        soup = BeautifulSoup(res.content,'html.parser')
    except:
        print('cant connect',url)
        return   
    
    rows = soup.find_all('td', {'class':'html'})
    
    if len(rows):
        outputs = []
        for row in rows:
            try:
                title = row.a.text
                title = re.sub("^\s+|\s+$", "", title, flags=re.UNICODE)
                href = row.a['href']
                if href.startswith('/'):
                    href = 'https://www.refworld.org' + href

                p = soup.find_all('td', {'class':'html'})[0].p.text.split('|')
                date = re.sub("^\s+|\s+$", "", p[0], flags=re.UNICODE)
                Publisher = re.sub("^\s+|\s+$", "", p[1], flags=re.UNICODE)
                types = re.sub("^\s+|\s+$", "", p[2], flags=re.UNICODE) 

            except:
                print(url)

            output = {'title':title, 'date':date,'Publisher':Publisher,'types':types, 'href': href}
            outputs.append(output)
    else:
        outputs = None      
    return outputs

from newspaper import Article
def extract_text(url):
    try:
        time.sleep(2)
        a = Article(url)
        a.download()
        a.parse()
        text = a.text
    except:
        text = ''
    return text

In [2]:
prefix = 'https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip='
suffix = '&publisher=USCRI&searchin=fulltext&sort=date'

df = pd.DataFrame(range(0, 1321,10), columns = ['page'])
df.page = prefix + df.page.astype('str') + suffix
df

Unnamed: 0,page
0,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=0&publisher=USCRI&searchin=fulltext&sort=date
1,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=1&publisher=USCRI&searchin=fulltext&sort=date
2,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=2&publisher=USCRI&searchin=fulltext&sort=date
3,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=3&publisher=USCRI&searchin=fulltext&sort=date
4,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=4&publisher=USCRI&searchin=fulltext&sort=date
...,...
1316,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=1316&publisher=USCRI&searchin=fulltext&sort=date
1317,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=1317&publisher=USCRI&searchin=fulltext&sort=date
1318,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=1318&publisher=USCRI&searchin=fulltext&sort=date
1319,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&skip=1319&publisher=USCRI&searchin=fulltext&sort=date


In [5]:
pandarallel.initialize(nb_workers=32)
batch_size = 32*8

i=0
while i<= df.shape[0]:
    print(i, end=',')
    df.loc[i: i+ batch_size-1, 'rows']= df.loc[i: i+ batch_size-1, 'page'].parallel_apply(extract_row).values
    i += batch_size

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
0,256,512,768,1024,1280,

In [6]:
df.to_csv('refword_raw.csv', index=False)

In [22]:
df1 = df.explode('rows').reset_index(drop=True)
df1 = pd.json_normalize(df1['rows']).sort_values('date')
# df1 = df1[df1.text.str.len()>200].reset_index(drop=True)
df1 =df1.reset_index(drop=True)
df1

Unnamed: 0,title,date,Publisher,types,href
0,Israel Escalates Demolition of Palestinian Refugee Homes,1 August 2001,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=3c58099a11&skip=0&publisher=USCRI&searchin=fulltext&sort=date
1,U.S. Committee for Refugees World Refugee Survey 2001 - Burkina Faso,1 August 2001,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=3ca988865&skip=0&publisher=USCRI&searchin=fulltext&sort=date
2,U.S. Committee for Refugees World Refugee Survey 2001 - Eritrea,1 August 2001,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=3b31e1620&skip=0&publisher=USCRI&searchin=fulltext&sort=date
3,U.S. Committee for Refugees World Refugee Survey 2001 - Haiti,1 August 2001,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=3c56c11521&skip=0&publisher=USCRI&searchin=fulltext&sort=date
4,Newly Displaced in the Balkans: Macedonia Erupts in Violence,1 August 2001,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=3c58099a5&skip=0&publisher=USCRI&searchin=fulltext&sort=date
...,...,...,...,...,...
13100,U.S. Committee for Refugees World Refugee Survey 2004 - Cameroon,25 May 2004,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=40b459358&skip=0&publisher=USCRI&searchin=fulltext&sort=date
13101,U.S. Committee for Refugees World Refugee Survey 2004 - Chad,25 May 2004,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=40b459360&skip=0&publisher=USCRI&searchin=fulltext&sort=date
13102,U.S. Committee for Refugees World Refugee Survey 2004 - Bosnia and Herzegovina,25 May 2004,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=40b459344&skip=0&publisher=USCRI&searchin=fulltext&sort=date
13103,U.S. Committee for Refugees World Refugee Survey 2004 - Bulgaria,25 May 2004,Publisher: United States Committee for Refugees and Immigrants,Document type: Annual Reports,https://www.refworld.org/cgi-bin/texis/vtx/rwmain?page=publisher&docid=40b4593410&skip=0&publisher=USCRI&searchin=fulltext&sort=date


In [28]:
pandarallel.initialize(nb_workers=8, progress_bar=False)
batch_size = 8*5


# i=2400
while i < df1.shape[0]:
    df1.loc[i:i+batch_size-1, 'text']=\
    df1.loc[i:i+batch_size-1, 'href'].parallel_apply(extract_text).values

    print(i, end=',')
    
    if i% 100 ==0:
        df1.to_csv('refword_text.csv', index=False)
        
    i = i + batch_size
    
df1.to_csv('refword_text.csv', index=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
3520,3560,3600,3640,3680,3720,3760,3800,3840,3880,3920,3960,4000,4040,4080,4120,4160,4200,4240,4280,4320,4360,4400,4440,4480,4520,4560,4600,4640,4680,4720,4760,4800,4840,4880,4920,4960,5000,5040,5080,5120,5160,5200,5240,5280,5320,5360,5400,5440,5480,5520,5560,5600,5640,5680,5720,5760,5800,5840,5880,5920,5960,6000,6040,6080,6120,6160,6200,6240,6280,6320,6360,6400,6440,6480,6520,6560,6600,6640,6680,6720,6760,6800,6840,6880,6920,6960,7000,7040,7080,7120,7160,7200,7240,7280,7320,7360,7400,7440,7480,7520,7560,7600,7640,7680,7720,7760,7800,7840,7880,7920,7960,8000,8040,8080,8120,8160,8200,8240,8280,8320,8360,8400,8440,8480,8520,8560,8600,8640,8680,8720,8760,8800,8840,8880,8920,8960,9000,9040,9080,9120,9160,9200,9240,9280,9320,9360,9400,9440,9480,9520,9560,9600,9640,9680,9720,9760,9800,9840,9880,9920,9960,10000,10040,10080,10120,10160,10200,10240,10280,

# United State 

In [3]:
# prefix = 'https://www.refworld.org/publisher,USDOS,,,,,'
# suffix = '.html'

# df = pd.DataFrame(range(0, 13681,10), columns = ['page'])
# df.page = prefix + df.page.astype('str') + suffix
# df

In [4]:
# # df=pd.read_csv('US_raw.csv', header=[0])
# df1 = df.loc[df.rows.isnull()].copy().reset_index(drop=True)
# df1

In [5]:
# pandarallel.initialize(nb_workers=4)
# batch_size = 4

# i=resume=0

# while i< df1.shape[0]:

#     df1.loc[i: i+ batch_size-1, 'rows']= df1.loc[i: i+ batch_size-1, 'page'].parallel_apply(extract_row).values
    
#     df2 = df1.loc[i: i+ batch_size-1, 'rows']
#     valid_len =  df2[~df2.isnull()].shape[0]
    
#     print(i,':',valid_len, end='\t')
    
#     i += batch_size
#     if (i-resume)%(batch_size*5)==0:   
#         df1.to_csv('US_null.csv', index=False)
# df1.to_csv('US_null.csv', index=False)

In [6]:
def get_all_links(href,number):
    return [href.replace('0.html', '%s.html'%i) for i in range(0, number-number%10+1, 10)]

In [7]:
url = 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,,,,0.html'
res=requests.get(url, timeout=20)
soup = BeautifulSoup(res.content,'html.parser')

ul = soup.find_all('ul', class_ = 'rwbullets')[0]
country_list = ul.find_all('li')

outputs = []
for country_link in country_list:
    href = 'https://www.refworld.org'+ country_link.a['href']
    country = country_link.a.text
    number = country_link.text.split()[-1]
    number = int(number)
    output = {'country':country,'number':number,'base_href':href}
    outputs.append(output)
    
df= pd.json_normalize(outputs)
df

Unnamed: 0,country,number,base_href
0,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html"
1,Albania,78,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ALB,,,0.html"
2,Algeria,88,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,DZA,,,0.html"
3,American Samoa,5,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ASM,,,0.html"
4,Andorra,44,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AND,,,0.html"
...,...,...,...
207,Wallis and Futuna,3,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,WLF,,,0.html"
208,Western Sahara Territory,42,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ESH,,,0.html"
209,Yemen,96,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,YEM,,,0.html"
210,Zambia,63,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZMB,,,0.html"


In [9]:
df['href']= df.apply(lambda x: get_all_links(x['base_href'], x['number']), axis=1)
df= df.explode('href').reset_index(drop=True)
df

Unnamed: 0,country,number,base_href,href
0,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html"
1,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,10.html"
2,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,20.html"
3,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,30.html"
4,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,40.html"
...,...,...,...,...
1533,Zimbabwe,64,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,20.html"
1534,Zimbabwe,64,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,30.html"
1535,Zimbabwe,64,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,40.html"
1536,Zimbabwe,64,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,50.html"


In [10]:
def extract_row(url):
    try:
        time.sleep(1)
        res=requests.get(url, timeout=20)
        soup = BeautifulSoup(res.content,'html.parser')
    except:
        print('cant connect',url)
        return   
    
    rows = soup.find_all('td', {'class':'html'})
    
    if len(rows):
        outputs = []
        for row in rows:
            try:
                title = row.a.text
                title = re.sub("^\s+|\s+$", "", title, flags=re.UNICODE)
                href = row.a['href']
                if href.startswith('/'):
                    href = 'https://www.refworld.org' + href

                p = soup.find_all('td', {'class':'html'})[0].p.text.split('|')
                date = re.sub("^\s+|\s+$", "", p[0], flags=re.UNICODE)
                Publisher = re.sub("^\s+|\s+$", "", p[1], flags=re.UNICODE)
                types = re.sub("^\s+|\s+$", "", p[2], flags=re.UNICODE) 

            except:
                print(url)

            output = {'title':title, 'date':date,'Publisher':Publisher,'types':types, 'href': href}
            outputs.append(output)
    else:
        outputs = None      
    return outputs

In [11]:
df1= df.copy()

In [12]:
pandarallel.initialize(nb_workers=32, progress_bar=False)
batch_size = 32*10
i=0
while i < df1.shape[0]:
    print(i, end=',')
    df1.loc[i:i+batch_size-1, 'rows']=\
    df1.loc[i:i+batch_size-1, 'href'].parallel_apply(extract_row).values
    i = i + batch_size 
df1.to_csv('US_rows.csv', index=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
0,320,640,960,1280,

In [13]:
df1[df1.rows.isnull()]

Unnamed: 0,country,number,base_href,href,rows
43,Angola,70,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AGO,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AGO,,,70.html",
66,Armenia,70,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ARM,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ARM,,,70.html",
76,Australia,70,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AUS,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AUS,,,70.html",
166,Bolivia,70,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,BOL,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,BOL,,,70.html",
175,Bosnia and Herzegovina,80,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,BIH,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,BIH,,,80.html",
348,Cyprus,80,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,CYP,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,CYP,,,80.html",
363,Côte d'Ivoire,70,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,CIV,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,CIV,,,70.html",
399,Ecuador,70,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ECU,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ECU,,,70.html",
423,Equatorial Guinea,60,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,GNQ,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,GNQ,,,60.html",
490,Gambia,60,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,GMB,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,GMB,,,60.html",


In [14]:
df1 = df1[~df1.rows.isnull()]
df1.shape

(1522, 5)

In [15]:
df1 = df1.explode('rows').reset_index(drop=True)
df1

Unnamed: 0,country,number,base_href,href,rows
0,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","{'title': 'Country Reports on Terrorism 2017 - Afghanistan', 'date': '19 September 2018', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1fbea,0.html'}"
1,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","{'title': 'Country Reports on Terrorism 2017 - Foreign Terrorist Organizations: Tehrik-e Taliban Pakistan', 'date': '19 September 2018', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1f2847,0.html'}"
2,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","{'title': 'Country Reports on Terrorism 2017 - Foreign Terrorist Organizations: Lashkar e-Tayyiba', 'date': '19 September 2018', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1f354,0.html'}"
3,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","{'title': 'Country Reports on Terrorism 2017 - Foreign Terrorist Organizations: Jundallah', 'date': '19 September 2018', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1f3813,0.html'}"
4,Afghanistan,125,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,,,0.html","{'title': 'Country Reports on Terrorism 2017 - Foreign Terrorist Organizations: Jaish-e-Mohammed', 'date': '19 September 2018', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1f3dc,0.html'}"
...,...,...,...,...,...
14126,Zimbabwe,64,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,50.html","{'title': 'U.S. Department of State Country Report on Human Rights Practices 1997 - Zimbabwe', 'date': '31 March 2003', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,3ae6aa1f28,0.html'}"
14127,Zimbabwe,64,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,60.html","{'title': 'U.S. Department of State Country Report on Human Rights Practices 1996 - Zimbabwe', 'date': '30 January 1997', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,3ae6aa3134,0.html'}"
14128,Zimbabwe,64,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,60.html","{'title': 'U.S. Department of State Country Report on Human Rights Practices 1995 - Zimbabwe', 'date': '30 January 1997', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,3ae6aa40c,0.html'}"
14129,Zimbabwe,64,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,0.html","https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,,,60.html","{'title': 'U.S. Department of State Country Report on Human Rights Practices 1994 - Zimbabwe', 'date': '30 January 1997', 'Publisher': 'Publisher: United States Department of State', 'types': 'Document type: Annual Reports', 'href': 'https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,3ae6aa4a2c,0.html'}"


In [16]:
 df1[df1.rows.isnull()]

Unnamed: 0,country,number,base_href,href,rows


In [21]:
df1 = pd.json_normalize(df1['rows']).reset_index(drop=True)
# df1[df1.rows.duplicated()]
df1

Unnamed: 0,title,date,Publisher,types,href
0,Country Reports on Terrorism 2017 - Afghanistan,19 September 2018,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1fbea,0.html"
1,Country Reports on Terrorism 2017 - Foreign Terrorist Organizations: Tehrik-e Taliban Pakistan,19 September 2018,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1f2847,0.html"
2,Country Reports on Terrorism 2017 - Foreign Terrorist Organizations: Lashkar e-Tayyiba,19 September 2018,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1f354,0.html"
3,Country Reports on Terrorism 2017 - Foreign Terrorist Organizations: Jundallah,19 September 2018,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1f3813,0.html"
4,Country Reports on Terrorism 2017 - Foreign Terrorist Organizations: Jaish-e-Mohammed,19 September 2018,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,AFG,5bcf1f3dc,0.html"
...,...,...,...,...,...
14126,U.S. Department of State Country Report on Human Rights Practices 1997 - Zimbabwe,31 March 2003,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,3ae6aa1f28,0.html"
14127,U.S. Department of State Country Report on Human Rights Practices 1996 - Zimbabwe,30 January 1997,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,3ae6aa3134,0.html"
14128,U.S. Department of State Country Report on Human Rights Practices 1995 - Zimbabwe,30 January 1997,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,3ae6aa40c,0.html"
14129,U.S. Department of State Country Report on Human Rights Practices 1994 - Zimbabwe,30 January 1997,Publisher: United States Department of State,Document type: Annual Reports,"https://www.refworld.org/publisher,USDOS,ANNUALREPORT,ZWE,3ae6aa4a2c,0.html"


In [24]:
df1.Publisher.unique(),df1.types.unique(), df1[df1.href.duplicated()].shape

(array(['Publisher: United States Department of State'], dtype=object),
 array(['Document type: Annual Reports'], dtype=object),
 (0, 5))

In [30]:
pandarallel.initialize(nb_workers=16, progress_bar=False)
batch_size = 16*10

i=resume=5760
while i < df1.shape[0]:
    df1.loc[i:i+batch_size-1, 'text']=\
    df1.loc[i:i+batch_size-1, 'href'].parallel_apply(extract_text).values

    print(i, end=',')
    
    if (i - resume)%(batch_size * 10) ==0:
        df1.to_csv('US_text.csv', index=False)
        
    i = i + batch_size
    
df1.to_csv('US_text.csv', index=False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
5760,5920,6080,6240,6400,6560,6720,6880,7040,7200,7360,7520,7680,7840,8000,8160,8320,8480,8640,8800,8960,9120,9280,9440,9600,9760,9920,10080,10240,10400,10560,10720,10880,11040,11200,11360,11520,11680,11840,12000,12160,12320,12480,12640,12800,12960,13120,13280,13440,13600,13760,13920,14080,

In [31]:
quit()