In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [2]:
# Every field is declared as a nullable string, so the struct is generated
# from an ordered tuple of field names rather than listing each StructField.
# NOTE(review): no cell visible in this notebook passes `schema` to Spark, and
# the inferred schema printed further down shows `versions` as an array of
# maps — confirm the types before applying this schema.
_STRING_FIELDS = (
    "id",
    "submitter",
    "authors",
    "title",
    "comments",
    "journal-ref",
    "doi",
    "abstract",
    "categories",
    "versions",
)
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in _STRING_FIELDS]
)

In [2]:
# Application name handed to the SparkSession builder.
APP_NAME: str = "JSON_RDD"

In [3]:
# Create (or reuse) a Spark session running locally on 8 threads, with the
# default parallelism also set to 8.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .config("spark.default.parallelism", "8")
    .master("local[8]")
    .getOrCreate()
)

24/10/04 09:23:13 WARN Utils: Your hostname, guilherme-linux resolves to a loopback address: 127.0.1.1; using 192.168.2.154 instead (on interface enp5s0)
24/10/04 09:23:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/04 09:23:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [32]:
# Stop the Spark session.
# NOTE(review): this cell sits above the cells that use `spark`; running the
# notebook top to bottom would stop the session before they execute. Its
# execution count suggests it was run last — confirm and move it to the end.
spark.stop()

In [4]:
# Tag the jobs started from here on with a group id and a description.
spark.sparkContext.setJobGroup(
    groupId="JSON_RDD",
    description="Read Json with .textFile with 100 partitions",
)

In [5]:
# Read arxivData.json as an RDD of text lines, requesting at least 100 partitions.
rdd_json = spark.sparkContext.textFile(
    name="/home/guilhermefmk/Documentos/labs_spark/data/arxivData.json",
    minPartitions=100,
)

In [6]:
# Show the active SparkSession as the cell output.
spark

In [7]:
import json

In [8]:
# Tag the jobs for the JSON-parsing step. The description names json.loads,
# the function the following map applies to each line of the file.
spark.sparkContext.setJobGroup("JSON_RDD", "MAP RDD WITH json.loads")

In [9]:
# Parse every text line into a Python object by mapping json.loads over it.
rdd = rdd_json.map(json.loads)

24/10/04 09:23:34 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [10]:
# Tag the jobs started from here on as belonging to the persist step.
spark.sparkContext.setJobGroup(
    groupId="JSON_RDD",
    description="PERSIST RDD",
)

In [40]:
# Mark the parsed RDD to be kept at the default storage level once computed,
# so later actions do not re-read and re-parse the file.
rdd.cache()

PythonRDD[2] at RDD at PythonRDD.scala:53

In [11]:
# Tag the jobs started from here on as belonging to the take(2) step.
spark.sparkContext.setJobGroup(
    groupId="JSON_RDD",
    description="RDD TAKE 2",
)

In [12]:
# Fetch the first two parsed records to inspect their structure.
rdd.take(num=2)

                                                                                

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [13]:
# First line printed: default number of partitions for an RDD when none is set.
# Second line printed: number of partitions this RDD actually has.
print(
    spark.sparkContext.defaultParallelism,
    rdd.getNumPartitions(),
    sep="\n",
)

8
100


In [14]:
# Tag the jobs started from here on as belonging to the count step.
spark.sparkContext.setJobGroup(
    groupId="JSON_RDD",
    description="RDD COUNT",
)

In [15]:
# Count the records in the RDD; the result is shown as the cell output.
rdd.count()

                                                                                

2011231

In [20]:
# Convert the RDD to a DataFrame (no schema is passed, so Spark infers it)
# and print the resulting schema tree.
records_df = rdd.toDF()
records_df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [19]:
# Flatten every record into its keys, de-duplicate them, and bring the
# resulting list of field names back to the driver.
distinct_keys = rdd.flatMap(lambda record: record.keys()).distinct()
distinct_keys.collect()

                                                                                

['comments',
 'id',
 'title',
 'abstract',
 'update_date',
 'doi',
 'categories',
 'journal-ref',
 'versions',
 'authors',
 'license',
 'authors_parsed',
 'report-no',
 'submitter']

In [18]:
# Preview the first 5 rows of the DataFrame built from the RDD.
rdd.toDF().show(n=5)

24/10/04 09:47:06 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 4 (TID 103): Attempting to kill Python Worker
                                                                                

+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+---------+--------------------+--------------------+----------------+------------------+--------------------+-----------+--------------------+
|            abstract|             authors|      authors_parsed|     categories|            comments|                 doi|       id|         journal-ref|             license|       report-no|         submitter|               title|update_date|            versions|
+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+---------+--------------------+--------------------+----------------+------------------+--------------------+-----------+--------------------+
|  A fully differe...|C. Bal\'azs, E. L...|[[Balázs, C., ], ...|         hep-ph|37 pages, 15 figu...|10.1103/PhysRevD....|0704.0001|Phys.Rev.D76:0130...|                NULL|ANL-HEP-PR-07-12|    Pavel Nado

In [23]:
# Tag the jobs started from here on as belonging to the license-extraction step.
spark.sparkContext.setJobGroup(
    groupId="JSON_RDD",
    description="MAP FOR GET LICENSES",
)

In [24]:
# Extract the "license" value of every record, de-duplicate, and collect
# the distinct values on the driver.
licenses = rdd.map(lambda record: record["license"]).distinct()
licenses.collect()

                                                                                

[None,
 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/',
 'http://creativecommons.org/licenses/by-nc-sa/3.0/',
 'http://creativecommons.org/licenses/by-nc-sa/4.0/',
 'http://creativecommons.org/licenses/by-nc-nd/4.0/',
 'http://creativecommons.org/publicdomain/zero/1.0/',
 'http://creativecommons.org/licenses/by-sa/4.0/',
 'http://creativecommons.org/licenses/publicdomain/',
 'http://creativecommons.org/licenses/by/3.0/',
 'http://creativecommons.org/licenses/by/4.0/']

In [25]:
def _pick_shorter_title(left, right):
    """Return the record whose title is strictly shorter; ties go to `right`."""
    if len(left["title"]) < len(right["title"]):
        return left
    return right


# Reduce the whole RDD down to a single record with the shortest title.
shortest_title = rdd.reduce(_pick_shorter_title)

                                                                                

In [27]:
def _pick_longer_title(left, right):
    """Return the record whose title is strictly longer; ties go to `right`."""
    if len(left["title"]) > len(right["title"]):
        return left
    return right


# Reduce the whole RDD down to a single record with the longest title.
largest_title = rdd.reduce(_pick_longer_title)

                                                                                

In [28]:
# Show the title of the record selected by the longest-title reduction.
largest_title["title"]

'Investigation of the 2-body system with a rotating central body (e. g.\n  earth-moon system) within the Projective Unified Field theory: the transfer\n  of rotational angular momentum and energy from the central body to the\n  orbital 2-body system, the tidal and the non-tidal influences (mechanical,\n  general-relativistic Lense-Thirring effect and cosmological\n  PUFT-contributions)'

In [29]:
# Show the title of the record selected by the shortest-title reduction.
shortest_title["title"]

'0'

In [34]:
from datetime import datetime
from operator import add

In [41]:
# Count records per calendar month of `update_date`: each record is mapped
# to (month, 1) and the ones are summed per month. Years are not
# distinguished, so every year contributes to the same 12 buckets.
(
    rdd
    .map(lambda record: (datetime.strptime(record["update_date"], "%Y-%m-%d").month, 1))
    .reduceByKey(add)
    .collect()
)

                                                                                

[(8, 138469),
 (1, 134247),
 (9, 138978),
 (10, 197755),
 (2, 116948),
 (11, 297963),
 (3, 126458),
 (12, 132305),
 (4, 117126),
 (5, 296587),
 (6, 191746),
 (7, 122649)]