# **Arxiv metadata Analytics with PySpark RDD: JSON case study**<a href="#Arxiv-metadata-Analytics-with-PySpark-RDD:-JSON-case-study" class="anchor-link">¶</a>

### Udemy Course: Best Hands-on Big Data Practices and Use Cases using PySpark<a href="#Udemy-Course:-Best-Hands-on-Big-Data-Practices-and-Use-Cases-using-PySpark" class="anchor-link">¶</a>

### Author: Amin Karami (PhD, FHEA)<a href="#Author:-Amin-Karami-(PhD,-FHEA)" class="anchor-link">¶</a>

In \[ \]:

    ########## ONLY in Colab ##########
    !pip3 install pyspark
    ########## ONLY in Colab ##########

In \[ \]:

    ########## ONLY in Ubuntu Machine ##########
    # Load Spark engine
    !pip3 install -q findspark
    import findspark
    findspark.init()
    ########## ONLY in Ubuntu Machine ##########

In \[ \]:

    from pyspark import SparkContext, SparkConf

    # Configure a local Spark application that uses every available core.
    conf = SparkConf().setAppName("Archive_PySpark").setMaster("local[*]")

    # getOrCreate() reuses an already-running context, so re-executing this
    # cell does not fail with "Cannot run multiple SparkContexts at once".
    sc = SparkContext.getOrCreate(conf=conf)
    print(sc)
    print("Ready to go!")

In \[ \]:

    # Read and Load Data to Spark
    # Data source: https://www.kaggle.com/Cornell-University/arxiv/version/62

    import json

    # Read the snapshot as lines of text, requesting at least 100 partitions.
    rdd_json = sc.textFile("data/arxiv-metadata-oai-snapshot.json", minPartitions=100)

    # Every line is parsed as one JSON document.
    rdd = rdd_json.map(json.loads)

    # Mark the parsed RDD for persistence so later actions can reuse it.
    rdd.persist()

In \[ \]:

    # Check the number of parallelism and partitions:

    # First line: the context's default parallelism; second line: the number
    # of partitions of the parsed RDD.
    print(sc.defaultParallelism, rdd.getNumPartitions(), sep="\n")

## Question 1: Count elements<a href="#Question-1:-Count-elements" class="anchor-link">¶</a>

In \[ \]:

    # Count the records in the RDD.
    # Teaching note: presumably the job is meant to be followed in the Spark web UI.
    #   Web UI documentation: https://spark.apache.org/docs/3.0.0-preview/web-ui.html
    #   Local web UI address: http://localhost:4040/

    rdd.count()

## Question 2: Get the first two records<a href="#Question-2:-Get-the-first-two-records" class="anchor-link">¶</a>

In \[ \]:

    # Fetch the first two parsed records from the RDD.
    rdd.take(2)

## Question 3: Get all attributes<a href="#Question-3:-Get-all-attributes" class="anchor-link">¶</a>

In \[ \]:

    # Flatten the keys of every record into one RDD, then de-duplicate them.
    attribute_names = rdd.flatMap(lambda record: record.keys()).distinct()
    attribute_names.collect()

## Question 4: Get the name of the licenses<a href="#Question-4:-Get-the-name-of-the-licenses" class="anchor-link">¶</a>

In \[ \]:

    # Project every record onto its "license" value and keep the unique ones.
    license_values = rdd.map(lambda record: record["license"]).distinct()
    license_values.collect()

## Question 5: Get the shortest and the longest titles<a href="#Question-5:-Get-the-shortest-and-the-longest-titles" class="anchor-link">¶</a>

In \[ \]:

    # Project the titles once and reuse the projection for both reductions.
    titles_rdd = rdd.map(lambda x: x["title"])

    # Compare titles by their length. Comparing the strings directly (x < y)
    # would pick the lexicographically first/last title, not the
    # shortest/longest one.
    shortest_title_rdd = titles_rdd.reduce(lambda x, y: x if len(x) <= len(y) else y)
    longest_title_rdd = titles_rdd.reduce(lambda x, y: x if len(x) >= len(y) else y)

    print("shortest title: ", shortest_title_rdd)
    print("longest title: ", longest_title_rdd)

## Question 6: Find abbreviations with 5 or more letters in the abstract<a href="#Question-6:-Find-abbreviations-with-5-or-more-letters-in-the-abstract" class="anchor-link">¶</a>

In \[ \]:

    import re

    def get_abbrivations(line):
        """Return the first parenthesised abbreviation of 5+ characters in *line*.

        An abbreviation is text enclosed in parentheses that starts with an
        ASCII letter and continues with at least four more characters, none of
        which is an underscore, space, slash, backslash, ``<`` or ``>``.

        Args:
            line: The text to search (e.g. a paper abstract).

        Returns:
            The abbreviation without the surrounding parentheses, or ``None``
            when the text contains no match.
        """
        # One leading letter + {4,} further characters = 5 or more in total.
        match = re.search(r"\(([A-Za-z][^_ /\\<>]{4,})\)", line)
        if match is None:
            return None
        # group(1) is the text inside the parentheses; group(0) would be the
        # whole match, parentheses included.
        return match.group(1)

In \[ \]:

    # Keep the records whose abstract yields a match from get_abbrivations
    # (a non-empty string is truthy, None is falsy), then count them.
    with_abbreviation = rdd.filter(lambda record: get_abbrivations(record['abstract']))
    with_abbreviation.count()

## Question 7: Get the number of archive records per month ('update_date' attribute)<a href="#Question-7:-Get-the-number-of-archive-records-per-month-(&#39;update_date&#39;-attribute)" class="anchor-link">¶</a>

In \[ \]:

    import datetime

    def extract_date(DateIn):
        """Return the month number of a date given as a 'YYYY-MM-DD' string.

        Args:
            DateIn: Date text in the ``%Y-%m-%d`` format.

        Returns:
            The month component of the parsed date as an integer.

        Raises:
            ValueError: If *DateIn* does not match the ``%Y-%m-%d`` format.
        """
        return datetime.datetime.strptime(DateIn, "%Y-%m-%d").month

    # check the function:
    extract_date('2008-12-13')

In \[ \]:

    # Emit a (month, 1) pair per record, then sum the ones for each month.
    month_pairs = rdd.map(lambda record: (extract_date(record["update_date"]), 1))
    month_pairs.reduceByKey(lambda left, right: left + right).collect()

    # Alternative: the same aggregation, sorted by the per-month count.
    # rdd.map(lambda x: (extract_date(x["update_date"]),1)).reduceByKey(lambda x,y: x+y).sortBy(lambda l: l[1]).collect()

## Question 8: Get the average number of pages<a href="#Question-8:-Get-the-average-number-of-pages" class="anchor-link">¶</a>

In \[ \]:

    import re

    def get_Page(line):
        """Return the first "<number> pages" count found in *line*.

        Args:
            line: Free text to search (e.g. a record's comments field). May be
                ``None`` or empty.

        Returns:
            The integer preceding the first occurrence of " pages", or 0 when
            the text is missing or contains no such phrase.
        """
        # Missing or empty text carries no page information.
        if not line:
            return 0
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        # Only the first occurrence matters, so search() replaces findall().
        match = re.search(r"(\d+) pages", line)
        if match is None:
            return 0
        return int(match.group(1))

In \[ \]:

    # Page count per record. A missing comment (None) is replaced by the
    # text "None", for which get_Page finds no page count and returns 0.
    rdd_average = rdd.map(lambda x: get_Page(x['comments'] if x['comments'] is not None else "None"))

    # Drop the records for which no page count was found (value 0).
    rdd_average = rdd_average.filter(lambda x: x != 0)

    average_counter = rdd_average.count()
    # get_Page already returns integers, so they can be added directly.
    avarage_summation = rdd_average.reduce(lambda x, y: x + y)

    print(average_counter)
    print(avarage_summation)
    print("the average of pages is ", avarage_summation/average_counter)