# Export data to Parquet

In [1]:
# Load the shared notebook defaults: study dates, common SQL exclusions and
# the parquet files for anonymized data.
# `-i` makes IPython run the script inside this notebook's namespace, so the
# names it defines stay available to the cells below.
# NOTE(review): the `spark` session used later is presumably created by this
# script -- confirm against data-defaults.py.
%run -i 'data-defaults.py'

### Show tables to export

In [2]:
# Switch to the project database so unqualified table names resolve against it.
spark.sql("use ryanmax")

# Pull the full table listing to the driver as a pandas DataFrame.
all_tables = spark.sql("show tables").toPandas()

# Keep permanent tables only, dropping any whose name starts with
# "archived" or "trash" (str.match anchors at the start of the name).
tables = all_tables.query(
    'isTemporary == False and not(tableName.str.match("archived") or tableName.str.match("trash"))'
)
tables

Unnamed: 0,database,tableName,isTemporary
18,ryanmax,externallinks,False
19,ryanmax,externallinks_from_dump_20190401,False
20,ryanmax,externallinks_from_dump_20190420,False
21,ryanmax,free_id_types,False
22,ryanmax,infobox_count,False
23,ryanmax,page_lengths_w_date,False
24,ryanmax,pages_w_with_extlinks,False
25,ryanmax,pages_wpm,False
26,ryanmax,pages_wpm_with_extlinks,False
27,ryanmax,pcor_pmids,False


In [3]:
# Destination HDFS directory; each table is written to <EXPORT_PATH><table>.parquet.
EXPORT_PATH = "/user/piccardi/ryan/extracts/"

# Tables whose row count after the round trip differs from the source.
mismatched = []

# Export every table listed in `tables`, read the result back, and report
# schema and row count before and after so the copy can be checked.
for table in tables['tableName']:
    target = EXPORT_PATH + table + ".parquet"

    # Resolve the table through the catalog instead of concatenating SQL text.
    data = spark.table(table)
    # Overwrite an existing export so this cell can be re-run on a fresh
    # kernel; the default save mode raises when the target already exists.
    data.write.mode("overwrite").parquet(target)
    post = spark.read.parquet(target)

    print("\n------------------------------------------------------------------")
    print("writing ", table, " to parquet")
    print("schemas: ")
    print(" -before")
    data.printSchema()
    print(" -after")
    post.printSchema()
    preCn = data.count()
    postCn = post.count()
    print(" -- row count before: ", preCn, ", after: ", postCn)
    print(" -- row counts match: ", preCn == postCn)
    if preCn != postCn:
        mismatched.append(table)

# Fail loudly only after every table has been attempted, so one bad export
# neither hides in the log nor blocks the remaining exports.
if mismatched:
    raise RuntimeError("row count mismatch after export for: " + ", ".join(mismatched))



------------------------------------------------------------------
writing  externallinks  to parquet
schemas: 
 -before
root
 |-- dump_date: string (nullable = true)
 |-- el_id: long (nullable = true)
 |-- el_from: long (nullable = true)
 |-- el_to: string (nullable = true)

 -after
root
 |-- dump_date: string (nullable = true)
 |-- el_id: long (nullable = true)
 |-- el_from: long (nullable = true)
 |-- el_to: string (nullable = true)

 -- row count before:  257503582 , after:  257503582
 -- row counts match:  True

------------------------------------------------------------------
writing  externallinks_from_dump_20190401  to parquet
schemas: 
 -before
root
 |-- el_id: long (nullable = true)
 |-- el_from: long (nullable = true)
 |-- el_to: string (nullable = true)

 -after
root
 |-- el_id: long (nullable = true)
 |-- el_from: long (nullable = true)
 |-- el_to: string (nullable = true)

 -- row count before:  128425048 , after:  128425048
 -- row counts match:  True

----------------

 -- row count before:  649987 , after:  649987
 -- row counts match:  True

------------------------------------------------------------------
writing  wpm_sections  to parquet
schemas: 
 -before
root
 |-- page_id: long (nullable = true)
 |-- section_h2: string (nullable = true)
 |-- section_id: string (nullable = true)
 |-- dt: string (nullable = true)

 -after
root
 |-- page_id: long (nullable = true)
 |-- section_h2: string (nullable = true)
 |-- section_id: string (nullable = true)
 |-- dt: string (nullable = true)

 -- row count before:  249574 , after:  249574
 -- row counts match:  True


In [5]:
# Show the exported files: run `hdfs dfs -du` (disk usage) with `-h`
# (human-readable sizes) for every path matching /user/piccardi/ryan/* and
# for /user/piccardi/ryan/extracts/pcor.
# NOTE(review): the second size column in the output is presumably the space
# consumed including replication -- confirm against the HDFS shell docs.
!hdfs dfs -du -h /user/piccardi/ryan/* /user/piccardi/ryan/extracts/pcor

18.7 G   56.0 G   /user/piccardi/ryan/anonymous/anonymous_citationusage_april.parquet
207.6 G  622.8 G  /user/piccardi/ryan/anonymous/anonymous_pageloads_april.parquet
23.4 G   70.2 G   /user/piccardi/ryan/anonymous/session_ids.parquet
15.8 G  47.4 G  /user/piccardi/ryan/dumps/enwiki-20190401-pages-articles-multistream.xml.bz2
15.8 G  47.5 G  /user/piccardi/ryan/dumps/enwiki-20190420-pages-articles-multistream.xml.bz2
7.8 G    23.3 G   /user/piccardi/ryan/extracts/externallinks.parquet
3.9 G    11.6 G   /user/piccardi/ryan/extracts/externallinks_from_dump_20190401.parquet
3.9 G    11.7 G   /user/piccardi/ryan/extracts/externallinks_from_dump_20190420.parquet
1.9 M    5.7 M    /user/piccardi/ryan/extracts/free_id_types.parquet
48.9 M   146.7 M  /user/piccardi/ryan/extracts/infobox_count.parquet
76.7 M   230.2 M  /user/piccardi/ryan/extracts/page_lengths_w_date.parquet
110.6 M  331.9 M  /user/piccardi/ryan/extracts/pages_w_with_extlinks.parquet
2.1 M    6.3 M    /user/piccardi/ryan/extra