# Exploration of Dask

In [1]:
from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT
from dask import dataframe as ddf

In [2]:
PATH = '/scratch/students/justine/'
FILE = 's3-impresso-stats'

In [3]:
test_df = ddf.read_csv(
    PATH+FILE+"/content-item-stats/*",
    storage_options=IMPRESSO_STORAGEOPT
)

## Explore dataframe

### Snapshots

In [4]:
test_df.head(5)

Unnamed: 0,id,year,newspaper,type,n_tokens,title_length
0,BDC-1839-01-20-a-i0001,1839,BDC,ar,250.0,4.0
1,BDC-1839-01-20-a-i0002,1839,BDC,ar,758.0,
2,BDC-1839-01-20-a-i0003,1839,BDC,ar,14.0,31.0
3,BDC-1839-01-20-a-i0004,1839,BDC,ar,349.0,22.0
4,BDC-1839-01-20-a-i0005,1839,BDC,ar,193.0,


In [5]:
test_df.tail(5)

Unnamed: 0,id,year,newspaper,type,n_tokens,title_length
173,waeschfra-1884-07-05-a-i0017,1884,waeschfra,ar,19.0,
174,waeschfra-1884-07-05-a-i0018,1884,waeschfra,img,,
175,waeschfra-1884-07-05-a-i0019,1884,waeschfra,img,,
176,waeschfra-1884-07-05-a-i0020,1884,waeschfra,img,,
177,waeschfra-1884-07-05-a-i0021,1884,waeschfra,img,,


In [7]:
%time test_df.head(1000,npartitions=3)

CPU times: user 3.21 s, sys: 603 ms, total: 3.81 s
Wall time: 2.12 s


Unnamed: 0,id,year,newspaper,type,n_tokens,title_length
0,BDC-1839-01-20-a-i0001,1839,BDC,ar,250.0,4.0
1,BDC-1839-01-20-a-i0002,1839,BDC,ar,758.0,
2,BDC-1839-01-20-a-i0003,1839,BDC,ar,14.0,31.0
3,BDC-1839-01-20-a-i0004,1839,BDC,ar,349.0,22.0
4,BDC-1839-01-20-a-i0005,1839,BDC,ar,193.0,
...,...,...,...,...,...,...
538,BNN-1886-01-30-a-i0017,1886,BNN,ar,102.0,14.0
539,BNN-1886-01-30-a-i0018,1886,BNN,ar,136.0,35.0
540,BNN-1886-01-30-a-i0019,1886,BNN,ar,8.0,13.0
541,BNN-1886-01-30-a-i0020,1886,BNN,ar,39.0,19.0


### Info in the type column

In [8]:
%time test_df.type.unique().compute()

CPU times: user 3min 4s, sys: 5min 36s, total: 8min 40s
Wall time: 1min 25s


0         ar
1        img
2         ad
3    section
4    picture
5       page
6         tb
7         ob
8          w
Name: type, dtype: object

### Try filtering

In [9]:
%%time 
BDC_ar_df = test_df[(test_df.newspaper=='BDC') & (test_df.type=='ar')].compute()

CPU times: user 3min 15s, sys: 5min 57s, total: 9min 13s
Wall time: 1min 18s


## Statistics

### Number of rows

In [10]:
%time test_df.shape[0].compute()

CPU times: user 3min 2s, sys: 5min 47s, total: 8min 49s
Wall time: 1min 10s


47876994

### Number of content items per np

In [11]:
count_rows_np = test_df.groupby('newspaper').id.count()

In [12]:
%%time 
count_rows_np_pd = count_rows_np.compute()

CPU times: user 3min 44s, sys: 8min, total: 11min 44s
Wall time: 1min 19s


In [13]:
count_rows_np_pd

newspaper
BDC                 146
BLB                 429
BNN               50500
CDV               16939
CON                2842
                  ...  
schmiede           2940
tageblatt        993158
volkfreu1869      34410
waechtersauer     37123
waeschfra         19953
Name: id, Length: 76, dtype: int64

### Join with issues table

#### Create column issue id

In [4]:
test_df['issue_id'] = test_df.id.apply(lambda x: x[:x.rfind("-")], meta=str)

In [5]:
test_df.head()

Unnamed: 0,id,year,newspaper,type,n_tokens,title_length,issue_id
0,BDC-1839-01-20-a-i0001,1839,BDC,ar,250.0,4.0,BDC-1839-01-20-a
1,BDC-1839-01-20-a-i0002,1839,BDC,ar,758.0,,BDC-1839-01-20-a
2,BDC-1839-01-20-a-i0003,1839,BDC,ar,14.0,31.0,BDC-1839-01-20-a
3,BDC-1839-01-20-a-i0004,1839,BDC,ar,349.0,22.0,BDC-1839-01-20-a
4,BDC-1839-01-20-a-i0005,1839,BDC,ar,193.0,,BDC-1839-01-20-a


#### Join with issues table

In [9]:
from sql import db_engine, read_table

In [10]:
engine = db_engine()
issues_df = read_table('impresso.issues', engine)

In [11]:
issues_df

Unnamed: 0,id,year,month,day,edition,access_rights,created,last_modified,is_damaged,s3_version,newspaper_id
0,actionfem-1927-10-15-a,1927,10,15,a,Closed,2019-06-15 12:22:38,NaT,0,,actionfem
1,actionfem-1927-11-15-a,1927,11,15,a,Closed,2019-06-15 12:22:38,NaT,0,,actionfem
2,actionfem-1927-12-15-a,1927,12,15,a,Closed,2019-06-15 12:22:38,NaT,0,,actionfem
3,actionfem-1928-01-15-a,1928,1,15,a,Closed,2019-06-15 12:22:41,NaT,0,,actionfem
4,actionfem-1928-02-15-a,1928,2,15,a,Closed,2019-06-15 12:22:41,NaT,0,,actionfem
...,...,...,...,...,...,...,...,...,...,...,...
441856,waeschfra-1884-06-07-a,1884,6,7,a,Closed,2019-06-15 15:07:34,NaT,0,,waeschfra
441857,waeschfra-1884-06-14-a,1884,6,14,a,Closed,2019-06-15 15:07:34,NaT,0,,waeschfra
441858,waeschfra-1884-06-21-a,1884,6,21,a,Closed,2019-06-15 15:07:34,NaT,0,,waeschfra
441859,waeschfra-1884-06-28-a,1884,6,28,a,Closed,2019-06-15 15:07:34,NaT,0,,waeschfra


In [16]:
ar_content_item = test_df.merge(issues_df, left_on='issue_id', right_on='id', suffixes=('_ci', '_issue'))

In [17]:
ar_content_item.head()

Unnamed: 0,id_ci,year_ci,newspaper,type,n_tokens,title_length,issue_id,id_issue,year_issue,month,day,edition,access_rights,created,last_modified,is_damaged,s3_version,newspaper_id
0,BDC-1839-01-20-a-i0001,1839,BDC,ar,250.0,4.0,BDC-1839-01-20-a,BDC-1839-01-20-a,1839,1,20,a,OpenPublic,2019-06-17 11:53:25,NaT,0,,BDC
1,BDC-1839-01-20-a-i0002,1839,BDC,ar,758.0,,BDC-1839-01-20-a,BDC-1839-01-20-a,1839,1,20,a,OpenPublic,2019-06-17 11:53:25,NaT,0,,BDC
2,BDC-1839-01-20-a-i0003,1839,BDC,ar,14.0,31.0,BDC-1839-01-20-a,BDC-1839-01-20-a,1839,1,20,a,OpenPublic,2019-06-17 11:53:25,NaT,0,,BDC
3,BDC-1839-01-20-a-i0004,1839,BDC,ar,349.0,22.0,BDC-1839-01-20-a,BDC-1839-01-20-a,1839,1,20,a,OpenPublic,2019-06-17 11:53:25,NaT,0,,BDC
4,BDC-1839-01-20-a-i0005,1839,BDC,ar,193.0,,BDC-1839-01-20-a,BDC-1839-01-20-a,1839,1,20,a,OpenPublic,2019-06-17 11:53:25,NaT,0,,BDC


In [18]:
%time ar_content_item.shape[0].compute()

CPU times: user 9min 22s, sys: 14min, total: 23min 22s
Wall time: 5min 10s


33707113

## Others

In [66]:
def np_pd(ddf, npID: str, max_rows: int):
    if len(ddf.loc[ddf.newspaper == npID] < max_rows) :
        return ddf.loc[ddf.newspaper == npID].compute()
    else :
        print('Length of dataframe exceeds max_rows > cannot be converted to pandas df.')
        return None

In [35]:
def num_content_items(npID: str):
    return test_df.loc[test_df.newspaper == npID].count().compute()

In [37]:
def num_content_items2(npID: str):
    return test_df.filter(test_df.newspaper == npID).count().compute()

In [38]:
%time num_content_items2('BDC')

AttributeError: 'DataFrame' object has no attribute 'filter'

In [49]:
%time num_content_items('BLB')

CPU times: user 3min 23s, sys: 6min 52s, total: 10min 15s
Wall time: 1min 2s


id              429
year            429
newspaper       429
type            429
n_tokens        429
title_length    429
dtype: int64

In [26]:
%time test_df.loc[test_df.newspaper == 'BDC'].compute()

CPU times: user 3min 24s, sys: 6min 44s, total: 10min 9s
Wall time: 1min 1s


Unnamed: 0,id,year,newspaper,type,n_tokens,title_length
0,BDC-1839-01-20-a-i0001,1839,BDC,ar,250.0,4.0
1,BDC-1839-01-20-a-i0002,1839,BDC,ar,758.0,
2,BDC-1839-01-20-a-i0003,1839,BDC,ar,14.0,31.0
3,BDC-1839-01-20-a-i0004,1839,BDC,ar,349.0,22.0
4,BDC-1839-01-20-a-i0005,1839,BDC,ar,193.0,
...,...,...,...,...,...,...
7,BDC-1839-04-04-a-i0002,1839,BDC,ar,713.0,
8,BDC-1839-04-04-a-i0003,1839,BDC,ar,773.0,
9,BDC-1839-04-04-a-i0004,1839,BDC,ar,396.0,
10,BDC-1839-04-04-a-i0005,1839,BDC,ar,336.0,
