# Packages 

In [14]:
import json
import pandas as pd
pd.set_option("max_colwidth", 600)
import ast
from bs4 import BeautifulSoup
import re
import requests
import time
import numpy as np
import zipfile
import os
import html
import re
import glob
import pathlib
import unicodedata
import tarfile

from pandarallel import pandarallel
pandarallel.initialize()
from unidecode import unidecode

def get_csv_size(csv_name):
    """Return the size of a file in mebibytes, rounded to one decimal place.

    Args:
        csv_name: Path to the file. A relative path is resolved against the
            current working directory; an absolute path is used as given.

    Returns:
        The file size in MiB (bytes / 1024**2), rounded to one decimal.

    Raises:
        OSError: If the file does not exist or cannot be accessed.
    """
    # os.path.join drops the cwd prefix when csv_name is already absolute,
    # whereas plain string concatenation would build a non-existent path.
    size_in_bytes = os.path.getsize(os.path.join(os.getcwd(), csv_name))
    return round(size_in_bytes / (1024 * 1024), 1)

def get_attribute(filename):
    """Return the column names of a CSV or JSON-lines file.

    Args:
        filename: Path to the file. Names ending in 'csv' are read as CSV
            with the first row as header; anything else is read as
            line-delimited JSON records.

    Returns:
        A list with the column names in file order.
    """
    if filename.endswith('csv'):
        # nrows=0 parses only the header row, so large files are not loaded
        # into memory just to list their columns.
        df1 = pd.read_csv(filename, header=[0], nrows=0)
    else:
        # Records may carry different keys, so the whole file is still read
        # to collect every column.
        df1 = pd.read_json(filename, orient="records", lines=True)
    return list(df1.columns)

def split_large_file(filename, source, output, size=None, chunk_mb=20):
    """Split a CSV into roughly equal parts of about ``chunk_mb`` MB each.

    The parts are written as ``<output>/<source>_NNN.csv`` (zero-padded,
    starting at 000) without the index column.

    Args:
        filename: Path of the CSV to split; the first row is the header.
        source: Prefix used for the names of the generated files.
        output: Directory receiving the parts; created when missing.
        size: Size of the input in MB. When None it is measured with
            get_csv_size.
        chunk_mb: Target size of one part in MB.

    Raises:
        ValueError: If ``chunk_mb`` is not positive.
    """
    if chunk_mb <= 0:
        raise ValueError("chunk_mb must be positive")

    df1 = pd.read_csv(filename, header=[0])
    if size is None:
        size = get_csv_size(filename)

    # Files smaller than one chunk are still written out as a single part.
    num_chunks = max(1, int(size // chunk_mb))

    if output:
        os.makedirs(output, exist_ok=True)

    # Split row positions rather than the frame itself: same part sizes as
    # np.array_split on the DataFrame, without the deprecated code path.
    row_groups = np.array_split(np.arange(len(df1)), num_chunks)
    for idx, rows in enumerate(row_groups):
        part_path = '%s/%s_%03d.csv' % (output, source, idx)
        df1.iloc[rows].to_csv(part_path, index=False)
        
def show_all_files(folder):
    """List the entries directly inside ``folder`` with name parts and size.

    Args:
        folder: Directory to scan, written with '/' separators.

    Returns:
        A DataFrame sorted by 'source' with the columns 'path', 'root'
        (first path component), 'source' (text before the first underscore
        of the path relative to ``folder``), 'filename' (last path
        component) and 'size' (as returned by get_csv_size).
    """
    prefix = '%s/' % folder

    def _first_component(path):
        return path.split('/')[0]

    def _source_label(path):
        return path.replace(prefix, '').split('_')[0]

    def _last_component(path):
        return path.replace(prefix, '').split('/')[-1]

    listing = pd.DataFrame(glob.glob('%s/*' % folder), columns=['path'])
    listing['root'] = listing['path'].apply(_first_component)
    listing['source'] = listing['path'].apply(_source_label)
    listing['filename'] = listing['path'].apply(_last_component)

    listing = listing.sort_values('source')
    listing = listing.reset_index(drop=True)

    # Sizes are measured by the pandarallel workers.
    listing['size'] = listing['path'].parallel_apply(get_csv_size)
    return listing

def unicodetoascii(text):
    r"""Replace escaped UTF-8 byte sequences with ASCII look-alikes.

    The input is expected to contain the bytes spelled out as text (a
    literal backslash followed by ``xe2``, and so on), as left behind when
    a bytes repr is stored as a string. Typographic quotes, dashes, the
    ellipsis, primes and superscript operators are mapped to plain ASCII.

    Args:
        text: The string to clean.

    Returns:
        The string with every listed sequence substituted.
    """
    # Applied strictly in this order; one substitution can expose a
    # sequence that a later entry then replaces.
    substitutions = (
        ('\\xe2\\x80\\x99', "'"),
        ('\\xc3\\xa9', 'e'),
        ('\\xe2\\x80\\x90', '-'),
        ('\\xe2\\x80\\x91', '-'),
        ('\\xe2\\x80\\x92', '-'),
        ('\\xe2\\x80\\x93', '-'),
        ('\\xe2\\x80\\x94', '-'),
        ('\\xe2\\x80\\x98', "'"),
        ('\\xe2\\x80\\x9b', "'"),
        ('\\xe2\\x80\\x9c', '"'),
        ('\\xe2\\x80\\x9d', '"'),
        ('\\xe2\\x80\\x9e', '"'),
        ('\\xe2\\x80\\x9f', '"'),
        ('\\xe2\\x80\\xa6', '...'),
        ('\\xe2\\x80\\xb2', "'"),
        ('\\xe2\\x80\\xb3', "'"),
        ('\\xe2\\x80\\xb4', "'"),
        ('\\xe2\\x80\\xb5', "'"),
        ('\\xe2\\x80\\xb6', "'"),
        ('\\xe2\\x80\\xb7', "'"),
        ('\\xe2\\x81\\xba', "+"),
        ('\\xe2\\x81\\xbb', "-"),
        ('\\xe2\\x81\\xbc', "="),
        ('\\xe2\\x81\\xbd', "("),
        ('\\xe2\\x81\\xbe', ")"),
    )
    cleaned = text
    for escaped, plain in substitutions:
        cleaned = cleaned.replace(escaped, plain)
    return cleaned

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Check CSV files

In [2]:
# Re-initialise pandarallel with 8 workers and no progress bar.
pandarallel.initialize(nb_workers=8, progress_bar=False) 

# One row per CSV found exactly two levels below 2.Organization/.
df = pd.DataFrame(glob.glob('2.Organization/*/*.csv'), columns = ['path'])
# The second path component (the sub-folder name) is used as the source label.
df['source'] = df.path.apply(lambda x: x.split('/')[1].replace('.csv',''))
df = df.sort_values('source').reset_index(drop=True)
# Column names and size (MB) of every file, computed by the workers.
df['attributes']= df.path.parallel_apply(get_attribute)
df['size']= df.path.parallel_apply(get_csv_size)
df

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0,path,source,attributes,size
0,2.Organization/amnesty/amnesty.csv,amnesty,"[date, title, abstract, text, types, topic, url, read_more_link, pdf_link]",86.3
1,2.Organization/crf/cfr.csv,crf,"[topic, title, date, url, text]",159.9
2,2.Organization/hrw/hrw.csv,hrw,"[Type, title, date, abstract, text, url]",317.0
3,2.Organization/newhuman/newhuman.csv,newhuman,"[title, date, topic, url, text]",155.9
4,2.Organization/ohchr/ohchr.csv,ohchr,"[title, text, country, url, country_url, news_search_link]",100.3
5,2.Organization/phr/phr.csv,phr,"[category, title, url, date, tags, text]",8.4
6,2.Organization/refworld-amnesty/refworld-amnesty.csv,refworld-amnesty,"[title, date, Publisher, types, url, text]",73.5
7,2.Organization/refworld-ca/refword-ca.csv,refworld-ca,"[title, date, Publisher, types, url, text]",92.6
8,2.Organization/refworld-uscri/refword-uscri.csv,refworld-uscri,"[title, date, Publisher, types, url, text]",8.8
9,2.Organization/refworld-usds/refword-usds.csv,refworld-usds,"[title, date, Publisher, types, url, text]",325.7


# Split

In [7]:
pandarallel.initialize(nb_workers=8, progress_bar=False)       
# Split every source CSV into parts inside 'split/'. No size is passed, so
# split_large_file measures each file itself. The result is discarded.
_ = df.parallel_apply(lambda x: split_large_file(x['path'],x['source'],'split'), axis=1) 

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [8]:
# Inventory of the files written to 'split/'.
df = show_all_files('split')
df

Unnamed: 0,path,root,source,filename,size
0,split/amnesty_002.csv,split,amnesty,amnesty_002.csv,21.7
1,split/amnesty_003.csv,split,amnesty,amnesty_003.csv,22.3
2,split/amnesty_000.csv,split,amnesty,amnesty_000.csv,20.5
3,split/amnesty_001.csv,split,amnesty,amnesty_001.csv,21.8
4,split/crf_005.csv,split,crf,crf_005.csv,24.1
...,...,...,...,...,...
74,split/unnews_003.csv,split,unnews,unnews_003.csv,20.8
75,split/unnews_006.csv,split,unnews,unnews_006.csv,17.0
76,split/unnews_007.csv,split,unnews,unnews_007.csv,15.8
77,split/unnews_004.csv,split,unnews,unnews_004.csv,18.4


In [9]:
# Summary statistics of the per-file sizes computed by get_csv_size.
df['size'].describe()

count    79.000000
mean     20.786076
std       4.716413
min       2.900000
25%      19.450000
50%      20.900000
75%      22.600000
max      37.600000
Name: size, dtype: float64

# Step 1: Clean

In [15]:
# Rebuild the inventory of 'split/' as the input list for the cleaning step.
df = show_all_files('split')
# Optional (disabled): also collect the column names of every part.
# df['attribute'] = df.path.parallel_apply(get_attribute)
df

Unnamed: 0,path,root,source,filename,size
0,split/amnesty_002.csv,split,amnesty,amnesty_002.csv,21.7
1,split/amnesty_003.csv,split,amnesty,amnesty_003.csv,22.3
2,split/amnesty_000.csv,split,amnesty,amnesty_000.csv,20.5
3,split/amnesty_001.csv,split,amnesty,amnesty_001.csv,21.8
4,split/crf_005.csv,split,crf,crf_005.csv,24.1
...,...,...,...,...,...
74,split/unnews_003.csv,split,unnews,unnews_003.csv,20.8
75,split/unnews_006.csv,split,unnews,unnews_006.csv,17.0
76,split/unnews_007.csv,split,unnews,unnews_007.csv,15.8
77,split/unnews_004.csv,split,unnews,unnews_004.csv,18.4


In [16]:
# Distinct source labels present in the inventory.
df.source.unique()

array(['amnesty', 'crf', 'hrw', 'newhuman', 'ohchr', 'phr',
       'refworld-amnesty', 'refworld-ca', 'refworld-uscri',
       'refworld-usds', 'rescue', 'satp', 'unchr', 'unnews', 'unodc'],
      dtype=object)

In [17]:
# Combined size of all split files, followed by the per-source totals.
print('total size:', df['size'].sum())
# Aggregate only 'size': a bare .sum() would also try to aggregate the
# string columns (path, root, filename).
df.groupby(by=["source"], dropna=False)[['size']].sum()

total size: 1642.1


Unnamed: 0_level_0,size
source,Unnamed: 1_level_1
amnesty,86.3
crf,159.8
hrw,317.1
newhuman,156.0
ohchr,100.3
phr,8.4
refworld-amnesty,73.4
refworld-ca,92.6
refworld-uscri,8.8
refworld-usds,325.7


In [18]:
def _clean_text(text):
    """Apply the step-1 cleaning rules to one document string."""
    # Convert escaped byte sequences, then normalise compatibility characters.
    text = unicodetoascii(text)
    text = unicodedata.normalize("NFKD", text)

    # E-mail addresses.
    text = re.sub(r"\S+@\S+(?:\.\S+)+", '', text)

    # Telephone numbers: "(+NN) NNN-NNN" first, then any remaining "+NNN ...".
    text = re.sub(r'\(\+( |-|\d)+\)( |-|\d)+', ' ', text)
    text = re.sub(r'\+( |-|\d)+', ' ', text)

    # Stand-alone noise lines such as "ad", "advertisement:" or "tweet".
    text = re.sub(r'\n(ad|advertisement|tweet):?\n', "", text, flags=re.IGNORECASE)

    # URLs.
    text = re.sub(r"http\S+", "", text)

    # Collapse runs of blank lines into a single blank line.
    text = re.sub(r'\n\n+', "\n\n", text)

    # Leading and trailing whitespace.
    return re.sub(r"^\s+|\s+$", "", text, flags=re.UNICODE)


def basic_process(filename, output_folder):
    """Clean one CSV and write the result under ``output_folder``.

    Rows with missing or duplicated 'text' are dropped, missing 'title',
    'url' and 'abstract' values are replaced by a newline, the text is
    cleaned (see _clean_text) and rows whose cleaned text has 100
    characters or fewer are removed. The row counts before and after are
    printed as ``<file name>:\t[before, after]``.

    Args:
        filename: Path of the input CSV; it must contain a 'text' column.
        output_folder: Directory for the cleaned file, which keeps the
            input's base name.

    Returns:
        None.
    """
    sizes = []

    df1 = pd.read_csv(filename, header=[0])
    sizes.append(df1.shape[0])

    # Make sure the optional columns exist so later steps can rely on them.
    for column in ('url', 'title'):
        if column not in df1.columns:
            df1[column] = '\n'

    df1 = df1[df1['text'].notna()]
    # .copy() so the assignments below modify an independent frame instead
    # of a view of the filtered one.
    df1 = df1[~df1['text'].duplicated()].copy()

    for column in ('title', 'url', 'abstract'):
        if column in df1.columns:
            df1[column] = df1[column].fillna('\n')

    # ------------------- Start cleaning  --------------------------#

    # A single pass per document instead of one pass per rule.
    df1['text'] = df1['text'].apply(_clean_text)

    df1 = df1[df1['text'].str.len() > 100]

    sizes.append(df1.shape[0])

    # ------------------- Ending cleaning  --------------------------#

    # basename works for any directory depth; split('/')[1] only returned
    # the file name for paths of the form 'folder/name.csv'.
    filename = os.path.basename(filename)
    new_filename = os.path.join(output_folder, filename)

    print('%s:\t%s'%(filename, sizes))
    df1.to_csv(new_filename, index=False)
    return

In [19]:
pandarallel.initialize(nb_workers=8, progress_bar=False) 
# Clean every listed file in parallel and write the results to 'step1/'.
# basic_process returns nothing, so the result is discarded.
_ = df['path'].parallel_apply(basic_process, output_folder='step1')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
refworld-amnesty_001.csv:	[2931, 2721]
hrw_014.csv:	[2425, 2425]
refworld-usds_010.csv:	[806, 786]amnesty_002.csv:	[8097, 8087]

refworld-usds_006.csv:	[806, 799]
newhuman_006.csv:	[6092, 6089]
crf_003.csv:	[1790, 1790]
unnews_000.csv:	[7939, 7938]
hrw_013.csv:	[2425, 2425]
amnesty_003.csv:	[8097, 8071]
refworld-usds_003.csv:	[807, 804]
refworld-amnesty_002.csv:	[2931, 2667]
newhuman_002.csv:	[6093, 6090]
refworld-usds_011.csv:	[806, 800]
hrw_004.csv:	[2425, 2425]
amnesty_000.csv:	[8098, 8085]
hrw_001.csv:	[2426, 2426]
refworld-usds_008.csv:	[806, 790]
unnews_001.csv:	[7939, 7939]
refworld-usds_014.csv:	[806, 799]
newhuman_004.csv:	[6092, 6092]
refworld-ca_000.csv:	[7471, 7109]
rescue_000.csv:	[705, 702]
hrw_006.csv:	[2425, 2425]
hrw_002.csv:	[2425, 2425]
refworld-usds_004.csv:	[807, 796]
amnesty_001.csv:	[8098, 8081]
ohchr_001.csv:	[1839, 1249]


In [22]:
# Inventory of the cleaned files written to 'step1/'.
df = show_all_files('step1')
df

Unnamed: 0,path,root,source,filename,size
0,step1/amnesty_002.csv,step1,amnesty,amnesty_002.csv,21.6
1,step1/amnesty_003.csv,step1,amnesty,amnesty_003.csv,22.2
2,step1/amnesty_000.csv,step1,amnesty,amnesty_000.csv,20.5
3,step1/amnesty_001.csv,step1,amnesty,amnesty_001.csv,21.7
4,step1/crf_005.csv,step1,crf,crf_005.csv,24.1
...,...,...,...,...,...
74,step1/unnews_003.csv,step1,unnews,unnews_003.csv,20.8
75,step1/unnews_006.csv,step1,unnews,unnews_006.csv,17.0
76,step1/unnews_007.csv,step1,unnews,unnews_007.csv,15.7
77,step1/unnews_004.csv,step1,unnews,unnews_004.csv,18.4


In [23]:
# Combined size of all cleaned files, followed by the per-source totals.
print('total size:', df['size'].sum())
# Aggregate only 'size': a bare .sum() would also try to aggregate the
# string columns (path, root, filename).
df.groupby(by=["source"], dropna=False)[['size']].sum()

total size: 1590.6


Unnamed: 0_level_0,size
source,Unnamed: 1_level_1
amnesty,86.0
crf,159.8
hrw,316.7
newhuman,156.0
ohchr,63.3
phr,8.3
refworld-amnesty,62.7
refworld-ca,89.6
refworld-uscri,8.8
refworld-usds,324.8


# Save to JSON

In [17]:
# Build the conversion manifest: one row per CSV located two levels below
# the working directory (<root>/<source>/<file>.csv).
# Only *.csv is matched, so re-running this cell after json/ and tar/ have
# been filled does not pick up the generated outputs.
df = pd.DataFrame(glob.glob('*/*/*.csv'), columns = ['path'])

# Second path component is the source folder; last one is the file name.
df['source'] = df.path.apply(lambda x: x.split('/')[1])
df['filename'] = df.path.apply(lambda x: x.split('/')[-1])

# Swap only the trailing extension, leaving any other '.csv' in the name alone.
df['json_file'] = df['filename'].apply(lambda x: os.path.splitext(x)[0] + '.json')

# Targets: json/<source>/<name>.json and tar/<source>/<name>.json.tar.gz
df['json_file'] = df.source +'/'+ df.json_file
df['tar_file'] = 'tar/'+ df.json_file+'.tar.gz'
df.json_file = 'json/'+ df.json_file
df

Unnamed: 0,path,source,filename,json_file,tar_file
0,csv/frus/024.csv,frus,024.csv,json/frus/024.json,tar/frus/024.json.tar.gz
1,csv/frus/053.csv,frus,053.csv,json/frus/053.json,tar/frus/053.json.tar.gz
2,csv/frus/066.csv,frus,066.csv,json/frus/066.json,tar/frus/066.json.tar.gz
3,csv/frus/050.csv,frus,050.csv,json/frus/050.json,tar/frus/050.json.tar.gz
4,csv/frus/037.csv,frus,037.csv,json/frus/037.json,tar/frus/037.json.tar.gz
...,...,...,...,...,...
154,csv/refworld-uscri/refworld-uscri_000.csv,refworld-uscri,refworld-uscri_000.csv,json/refworld-uscri/refworld-uscri_000.json,tar/refworld-uscri/refworld-uscri_000.json.tar.gz
155,csv/refworld-amnesty/refworld-amnesty_001.csv,refworld-amnesty,refworld-amnesty_001.csv,json/refworld-amnesty/refworld-amnesty_001.json,tar/refworld-amnesty/refworld-amnesty_001.json.tar.gz
156,csv/refworld-amnesty/refworld-amnesty_002.csv,refworld-amnesty,refworld-amnesty_002.csv,json/refworld-amnesty/refworld-amnesty_002.json,tar/refworld-amnesty/refworld-amnesty_002.json.tar.gz
157,csv/refworld-amnesty/refworld-amnesty_000.csv,refworld-amnesty,refworld-amnesty_000.csv,json/refworld-amnesty/refworld-amnesty_000.json,tar/refworld-amnesty/refworld-amnesty_000.json.tar.gz


In [18]:
# Remember the working directory; the output folders are created beneath it.
cwd = os.getcwd()
cwd

'/home/yibo/桌面/Politic_Conflict_Datasets/2.Organization'

In [19]:
# Create json/<source> and tar/<source> for every source. makedirs with
# exist_ok=True also creates the json/ and tar/ parents and makes the cell
# safe to re-run, where os.mkdir would raise.
for i in df.source.unique():
    os.makedirs(os.path.join(cwd, 'json', i), exist_ok=True)
    os.makedirs(os.path.join(cwd, 'tar', i), exist_ok=True)

In [12]:
# Preview the first file in the manifest.
df1 = pd.read_csv(df.path[0], header=[0])
df1

Unnamed: 0,id,text,date
0,frus1922v02/d876,"211.31/–\nThe Secretary of State to the Minister in Venezuela (McGoodwin)\nWashington, April 18, 1916.\nNo. 183\nSir: The Department acknowledges receipt of your despatch No. 621, of March 23, 1916,Not printed. in which you state that it might be opportune to suggest the possible conclusion of an extradition treaty between the United States and Venezuela.\nBefore giving further consideration, however, to the question of the possible conclusion of an extradition treaty with the Government of Venezuela, the Department desires to be informed whether the laws of Venezuela prohibit capital puni...",1916-04-18
1,frus1939v05/d42,"740.00111 A.R./97a: Circular telegram\n\nThe Secretary of State to Chiefs of Diplomatic Missions in the American Republics\n\n\n\n\n\nWashington, September 8, 1939—4 p.m.\n\n\nThe President has announced that the Under Secretary of StateSumner Welles, Representative of the Secretary of State. will represent the United States at the conference in Panama. He will be assisted by the following as advisers: The Honorable Edwin C. Wilson, Minister Designate to Uruguay; and Dr. Herbert Feis,Herbert Feis, Adviser; Adviser on International Economic Affairs, Department of State. Dr. Warren Kelchner,...",1939-09-08
2,frus1951v05/d537,"320/11–2951: Telegram\n\nThe United States Representative at the United Nations (Austin) to the Department of State\n\n\n\n\nsecret\n\npriority\n\n\n\nParis, November 29, 1951—11 p. m.\n\n\n\nDelga 424. Re Palestine: US GADel staff offers for Dept’s consideration fol analysis of situation after discussion with Riley and Blandford (Delga’s 305 and 307).Both dated November 22, pp. 941 and 942.\n\n1. Re position paper proposal for transfer general political functions including mediation to Riley and repatriation and compensation to Blandford under resolution such as contained Dept’s position ...",1951-11-29
3,frus1951v07p1/d697,"Lot 55D128: Black Book, Tab 72: Telegram\n\nThe Commander in Chief, Far East (Ridgway) to the Joint Chiefs of Staff\n\n\n\n\nsecret priority\n\n\n\nTokyo, 11 November 1951—9:27 p. m.\n\n\nC–57058. For your info CINCUNC Adv msg HNC 435 is quoted.\n“1. The sub-delegation reports the following during the 10 November meetings.\n“a. Morning and part of the afternoon sessions devoted to debate on the subject of adjustments and merits of each side’s previous proposals. Crux of today’s session came in the late afternoon when Lee made the following proposal:\n\n“‘The proposals our side made and t...",1951-11-11
4,frus1969-76v36/d124,"124. Intelligence Note Prepared in the Bureau of Intelligence and ResearchSource: National Archives, RG 59, Central Files 1970–73, PET 3 OPEC. Confidential; No Foreign Dissem. Drafted by Leo F. Cecchini, Jr. (INR); approved by Ghiardi; and released by Weiss.\n\nRECN–15\nWashington, June 27, 1972.\n\nOPEC OPENS OIL MINISTERS’ MEETING IN ATMOSPHERE OF UNCERTAINTY\nThe OPECOPEC members are: Abu Dhabi, Algeria, Indonesia, Iran, Iraq, Kuwait, Libya, Nigeria, Qatar, Saudi Arabia, and Venezuela. [Footnote in the original.] Ordinary Ministerial Council Conference opened June 27 in Vienna with impo...",1972-06-27
...,...,...,...
3853,frus1944Quebec/d225,"Hopkins Papers: Telegram\nThe President’s Special Assistant (Hopkins) to the President\n\ntop secret\n[Washington,] September 11, 1944.\n\nMR–out—385. Personal and top secret to the President from Harry Hopkins.\nHave discussed all of the cables relative to Poland with the State Department and Marshall. On the basis of these conferences, would suggest that you send substantially the following messages— \n1.\nTo the President of Poland:At the request of Charles E. Bohlen, an unnumbered telegram was sent to Quebec by the White House Map Room correcting “President of Poland” to “Prime Ministe...",1944-09-11
3854,frus1866p3/d131,"[Translation.\nSeñor Romero to Mr. Seward\n\n\nMexican Legation in the United States of America, Washington, November 20, 1866.\nMr. Secretary: I have the honor to transmit to you, for the information of the United States government, the accompanying documents, containing official reports of recent events in the eastern military division of the Mexican republic.\nI call your special attention to two important victories of General Diaz over the French and Austrians on the 3d and 18th of October last, at Miahuatlan and Carbonera, in the State of Oaxaca.\nI avail myself of this occasion to r...",1866-11-20
3855,frus1977-80v18/d111,"111. Summary of Conclusions of a Meeting of the Special Coordination CommitteeSource: Carter Library, National Security Council, Institutional Files, Box 88, SCC 026, 8/24/77, Indian Ocean. Top Secret. The meeting took place in the White House Situation Room. The minutes are not attached and were not found. Sick sent the Summary of Conclusions to Brzezinski under an August 25 memorandum. (Ibid.) Papers prepared for the meeting are ibid.\n\n\n\nWashington, August 24, 1977, 9–10:05 a.m.\n\n\n\nSUBJECT\nIndian Ocean Arms Control\n\n\nPARTICIPANTS\n\n\n\nState\n\n\nWarren Christopher\n\n\nRegi...",1977-08-24
3856,frus1864p4/d276,"[Translation.]\nThe Duke de Soulé to Mr. Harvey.\n\n\nDepartment of Foreign Affairs,\nJanuary 23, 1864.\nWith reference to the note which you were pleased to address me, under date of the 14th instant, requesting that preventive measures might be adopted with regard to the English bark Agrippina, which, according to information, purposed carrying to the Azores a cargo of munitions of war for supplying the confederate cruisers, it is my duty to inform you that the minister of the interior has advised me, in his communication of the 20th instant, as having forwarded, on that same date, to t...",1864-01-23


In [20]:
def convert_json_tar(filename, json_file, tar_file):
    """Convert a CSV to JSON lines and pack the result into a .tar.gz.

    Args:
        filename: Path of the input CSV; the first row is the header.
        json_file: Path of the JSON-lines file to write (one record per row).
        tar_file: Path of the gzip-compressed tar archive to write. It holds
            the JSON file under its base name only, without directories.

    Returns:
        None.
    """
    # Create the target directories up front so the function does not depend
    # on a separate set-up step; exist_ok keeps parallel workers from racing.
    for target in (json_file, tar_file):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    df1 = pd.read_csv(filename, header=[0])
    df1.to_json(json_file, orient="records", lines=True)

    with tarfile.open(tar_file, "w:gz") as tar:
        tar.add(json_file, arcname=os.path.basename(json_file))

pandarallel.initialize(nb_workers=8, progress_bar=True) 
# Convert every manifest row in parallel: CSV -> JSON lines -> .tar.gz.
# convert_json_tar returns nothing, so the result is discarded.
_ = df.parallel_apply(lambda x: convert_json_tar(x['path'], x['json_file'], x['tar_file']), axis=1)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=20), Label(value='0 / 20'))), HBox…

In [21]:
pandarallel.initialize() 

# Recompute the source label from the second path component and add each
# CSV's size as computed by get_csv_size.
df.source = df.path.apply(lambda x: x.split('/')[1]).values
df['size'] = df['path'].parallel_apply(get_csv_size)
df

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0,path,source,filename,json_file,tar_file,size
0,csv/frus/024.csv,frus,024.csv,json/frus/024.json,tar/frus/024.json.tar.gz,15.2
1,csv/frus/053.csv,frus,053.csv,json/frus/053.json,tar/frus/053.json.tar.gz,15.5
2,csv/frus/066.csv,frus,066.csv,json/frus/066.json,tar/frus/066.json.tar.gz,14.7
3,csv/frus/050.csv,frus,050.csv,json/frus/050.json,tar/frus/050.json.tar.gz,15.9
4,csv/frus/037.csv,frus,037.csv,json/frus/037.json,tar/frus/037.json.tar.gz,16.0
...,...,...,...,...,...,...
154,csv/refworld-uscri/refworld-uscri_000.csv,refworld-uscri,refworld-uscri_000.csv,json/refworld-uscri/refworld-uscri_000.json,tar/refworld-uscri/refworld-uscri_000.json.tar.gz,8.8
155,csv/refworld-amnesty/refworld-amnesty_001.csv,refworld-amnesty,refworld-amnesty_001.csv,json/refworld-amnesty/refworld-amnesty_001.json,tar/refworld-amnesty/refworld-amnesty_001.json.tar.gz,10.6
156,csv/refworld-amnesty/refworld-amnesty_002.csv,refworld-amnesty,refworld-amnesty_002.csv,json/refworld-amnesty/refworld-amnesty_002.json,tar/refworld-amnesty/refworld-amnesty_002.json.tar.gz,29.6
157,csv/refworld-amnesty/refworld-amnesty_000.csv,refworld-amnesty,refworld-amnesty_000.csv,json/refworld-amnesty/refworld-amnesty_000.json,tar/refworld-amnesty/refworld-amnesty_000.json.tar.gz,22.5


In [22]:
# Overall and per-source size. The total is labelled and rounded like the
# per-source figures so float noise is not printed.
print('total size:', round(df['size'].sum(), 1))
df.groupby(by=["source"], dropna=False)['size'].sum().round(1)

2842.2999999999997


source
amnesty               86.0
crf                  159.8
frus                1251.7
hrw                  316.7
newhuman             156.0
ohchr                 63.3
phr                    8.3
refworld-amnesty      62.7
refworld-ca           89.6
refworld-uscri         8.8
refworld-usds        324.8
rescue                 2.9
satp                 108.4
unchr                 28.1
unnews               165.9
unodc                  9.3
Name: size, dtype: float64