In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 39 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 53.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=6cdcc850a4108be7f4fd1aea1ba6c054b357ae32d154967eb2325548f0f09988
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [None]:
# Import the basic spark library
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, running on the "local" master
# under the application name "Structure".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Structure")
    .getOrCreate()
)

In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive, forcing a remount.
GDRIVE_ROOT = '/gdrive'
drive.mount(GDRIVE_ROOT, force_remount=True)

Mounted at /gdrive


In [None]:
# Work from the shared project folder on Drive so that relative paths used
# later (e.g. "dataset.json") resolve there.
%cd /gdrive/My Drive/SMBUD project/Third Delivery/Structures

/gdrive/.shortcut-targets-by-id/1q0fAfMYWojuW0WzsoMALMBcukJ3n_QVK/SMBUD project/Third Delivery/Structures


In [None]:
import pandas as pd
import json
import string
from numpy import nan
import random

We use `pd.json_normalize()` to extract the elements of the nested JSON arrays and store the results in pandas dataframes. The original dataset contains many attributes that are sometimes missing; we replace a missing attribute with the empty string if it is a String, or with 0 if it is an Integer. We use `DataFrame.drop_duplicates()` to remove duplicates from the keywords, fos, authors and urls dataframes.

In [None]:
# Load the raw publication records. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open("dataset.json", encoding="utf-8") as json_file:
  data = json.load(json_file)

# One row per distinct author id (first occurrence wins).
authors = pd.json_normalize(data, record_path=['authors'])
authors = authors.drop_duplicates('_id', keep='first')

# Collect, for every publication, its venue record and its scalar attributes.
# '_id', 'title' and 'year' are treated as mandatory; every other attribute may
# be missing and falls back to 0 (integers) or "" (strings). A missing venue
# becomes an empty record, i.e. an all-NaN row in the venue dataframe.
venues = list()
pubs = list()
for el in data:
  venues.append(el.get('venue', {}))
  pub = {
      '_id': el['_id'],
      'title': el['title'],
      'year': el['year'],
      'n_citation': el.get('n_citation', 0),
      'page_start': el.get('page_start', ""),
      'page_end': el.get('page_end', ""),
      'lang': el.get('lang', ""),
      'volume': el.get('volume', ""),
      'issue': el.get('issue', ""),
      'isbn': el.get('isbn', ""),
      'doi': el.get('doi', ""),
      'pdf': el.get('pdf', ""),
      'abstract': el.get('abstract', ""),
      'publisher': el.get('publisher', ""),
  }
  pubs.append(pub)

# `venue` keeps one row per publication (same order as `publications`);
# it is de-duplicated only after missing ids have been filled in.
publications = pd.DataFrame(pubs)
venue = pd.DataFrame(venues)

# keywords / fos / url are arrays of plain strings, so json_normalize yields a
# single column labelled 0; keep the first occurrence of each distinct value.
keywords = pd.json_normalize(data, record_path=['keywords'])
keywords = keywords.drop_duplicates(0, keep='first')
fos = pd.json_normalize(data, record_path=['fos'])
fos = fos.drop_duplicates(0, keep='first')
urls = pd.json_normalize(data, record_path=['url'])
urls = urls.drop_duplicates(0, keep='first')
venue

Unnamed: 0,_id,name_d,type,raw,publisher,publisher_id,sid,issn,t,raw_zh,online_issn,name,name_s
0,53a72a4920f7420be8bfa51b,International Conference on Document Analysis ...,conference,ICDAR-1,Unknown,1,,,,,,,
1,53a72e2020f7420be8c80142,International Symposium on Circuits and Systems,conference,ISCAS (3),Unknown,1,,,,,,,
2,53a72e9920f7420be8c93fac,Computer Software and Applications Conference,conference,COMPSAC,Unknown,1,,,,,,,
3,572de199d39c4f49934b3d5c,Frontiers of Computer Science in China,conference,Frontiers of Computer Science in China,Unknown,1,,,,,,,
4,,Data & Knowledge Engineering,journal,Data & Knowledge Engineering,North-Holland,2,data-and-knowledge-engineering,0169-023X,J,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,53a72aef20f7420be8c12039,COCOON,journal,COCOON,Unknown,1,conf/cocoon,,C,,,,
1110,53a72a4520f7420be8bf8d9b,ICALP,journal,ICALP,Unknown,1,conf/icalp,,C,,,,
1111,,Environmental Modelling & Software,journal,Environmental Modelling & Software,Elsevier,5,environmental-modelling-and-software,1364-8152,J,,,,
1112,53a7253820f7420be8b4823b,DAC,journal,DAC,Unknown,1,conf/dac,,C,,,,


In many cases the venue does not have an '_id', so we randomly generate ids and insert them into the venue dataframe. We also remove duplicates (venues are extracted from an array of publications, and many publications can share the same venue). We do this only after generating the new ids, because removing duplicates earlier would have collapsed all the venues with a null id into a single row.

In [None]:
# Missing names become empty strings so they can be compared safely.
venue['name_d'] = venue['name_d'].fillna('')
venue['raw'] = venue['raw'].fillna('')

letters = string.ascii_lowercase + string.digits
# Dedicated, seeded generator: the synthetic ids are the same on every
# "Restart & Run All" and the global `random` state is left untouched.
id_rng = random.Random(42)

# Pass 1: remember the real id of every *named* venue that has one (if a name
# carries several ids, the last one wins). Doing this up front makes the result
# independent of row order: an id-less row that appears before its id-bearing
# twin still receives the real id.
has_id = venue['_id'].notna()
is_named = venue['name_d'] != ''
tmp_map_id = dict(zip(venue.loc[has_id & is_named, 'name_d'],
                      venue.loc[has_id & is_named, '_id']))

# Pass 2: fill in the missing ids. A generated id is stored in the map so that
# every further id-less row with the same name reuses it (otherwise the same
# venue would get a different id per publication and never be de-duplicated).
# Nameless venues are never matched by name: each one gets its own fresh id.
filled_ids = []
for venue_id, name in zip(venue['_id'].tolist(), venue['name_d'].tolist()):
  if pd.isna(venue_id):
    venue_id = tmp_map_id.get(name)
    if venue_id is None:
      venue_id = '53a72' + ''.join(id_rng.choice(letters) for _ in range(19))
      if name:
        tmp_map_id[name] = venue_id
  filled_ids.append(venue_id)
venue['_id'] = filled_ids

# `venue` still has one row per publication here, so the ids line up with
# `publications` row by row; only afterwards collapse to one row per venue id.
publications['venue_id'] = venue['_id']
venue = venue.drop_duplicates('_id', keep='last')
venue

Unnamed: 0,_id,name_d,type,raw,publisher,publisher_id,sid,issn,t,raw_zh,online_issn,name,name_s
0,53a72a4920f7420be8bfa51b,International Conference on Document Analysis ...,conference,ICDAR-1,Unknown,1,,,,,,,
1,53a72e2020f7420be8c80142,International Symposium on Circuits and Systems,conference,ISCAS (3),Unknown,1,,,,,,,
2,53a72e9920f7420be8c93fac,Computer Software and Applications Conference,conference,COMPSAC,Unknown,1,,,,,,,
3,572de199d39c4f49934b3d5c,Frontiers of Computer Science in China,conference,Frontiers of Computer Science in China,Unknown,1,,,,,,,
4,53a72ynxodzg9j1giddd7ebg,Data & Knowledge Engineering,journal,Data & Knowledge Engineering,North-Holland,2,data-and-knowledge-engineering,0169-023X,J,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,53a72aef20f7420be8c12039,COCOON,journal,COCOON,Unknown,1,conf/cocoon,,C,,,,
1110,53a72a4520f7420be8bf8d9b,ICALP,journal,ICALP,Unknown,1,conf/icalp,,C,,,,
1111,53909e8120f70186a0e2ca18,Environmental Modelling & Software,journal,Environmental Modelling & Software,Elsevier,5,environmental-modelling-and-software,1364-8152,J,,,,
1112,53a7253820f7420be8b4823b,DAC,journal,DAC,Unknown,1,conf/dac,,C,,,,


We add indexes to the 'keywords', 'fos' and 'urls' dataframes.

In [None]:
# Name the single value column of each lookup table and expose the row label
# as an explicit integer index column next to it.
keywords = (
    keywords
    .rename(columns={keywords.columns[0]: 'key_name'})
    .assign(key_index=lambda frame: frame.index)
)
fos = (
    fos
    .rename(columns={fos.columns[0]: 'fos_name'})
    .assign(fos_index=lambda frame: frame.index)
)
urls = (
    urls
    .rename(columns={urls.columns[0]: 'url_name'})
    .assign(url_index=lambda frame: frame.index)
)
keywords

Unnamed: 0,key_name,key_index
0,handwriting recognition,0
1,shape,1
2,feature extraction,2
3,knowledge base,3
4,prototypes,4
...,...,...
11370,electronic voting,11370
11372,covert channel,11372
11374,information hiding,11374
11375,relative entropy,11375


We replace the publications' keywords, fos and url attributes with lists of the corresponding indexes, and the authors attribute with the list of the authors' ids.

In [None]:
# Name -> index lookup tables, built once. This replaces a full boolean-mask
# scan of the lookup dataframe for every single keyword / fos / url.
# `.tolist()` yields plain Python ints/strs rather than NumPy scalars.
key_index_of = dict(zip(keywords['key_name'].tolist(), keywords['key_index'].tolist()))
fos_index_of = dict(zip(fos['fos_name'].tolist(), fos['fos_index'].tolist()))
url_index_of = dict(zip(urls['url_name'].tolist(), urls['url_index'].tolist()))

# One record per publication: its id plus the lists that link it to the
# keyword / fos / url lookup tables, its authors' ids and its references
# (an empty list when the publication has no 'references' attribute).
link_rows = []
for el in data:
  link_rows.append({
      '_id': el['_id'],
      'keywords': [key_index_of[keyword] for keyword in el['keywords']],
      'fos': [fos_index_of[fos_] for fos_ in el['fos']],
      'url': [url_index_of[url] for url in el['url']],
      'authors': [author['_id'] for author in el['authors']],
      'references': el.get('references', []),
  })

# Build the dataframe in one go instead of appending row by row with .loc.
tmp_df = pd.DataFrame(link_rows, columns=['_id', 'keywords', 'fos', 'url', 'authors', 'references'])

# '_id' is the only shared column; naming it keeps the join key explicit.
publications = publications.merge(tmp_df, on='_id')
publications

Unnamed: 0,_id,title,year,n_citation,page_start,page_end,lang,volume,issue,isbn,doi,pdf,abstract,publisher,venue_id,keywords,fos,url,authors,references
0,53e99784b7602d9701f3e151,A solution to the problem of touching and brok...,1993,17,602,605,en,,,,10.1109/ICDAR.1993.395663,,,Unknown,53a72a4920f7420be8bfa51b,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",[0],"[53f46797dabfaeb22f542630, 54328883dabfaeb4c6a...","[53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990c..."
1,53e99784b7602d9701f3e15d,Timing yield estimation using statistical stat...,2005,28,2461,2464Vol.3,en,,,0-7803-8834-8,10.1109/ISCAS.2005.1465124,//static.aminer.org/pdf/PDF/000/423/329/timing...,As process variations become a significant pro...,Unknown,53a72e2020f7420be8c80142,"[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2...","[11, 12, 13, 14, 15, 16, 17, 6, 19, 20, 21]","[1, 2]","[53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ec...","[53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27..."
2,53e99784b7602d9701f3f411,Using XML to Integrate Existing Software Syste...,2002,28,167,172,en,,,0-7695-1727-7,10.1109/CMPSAC.2002.1044548,,The eXtensible Markup Language 驴 XML 驴 is not ...,Unknown,53a72e9920f7420be8c93fac,"[44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 5...","[22, 23, 24, 25, 26, 6, 28, 29, 30, 31, 32, 33]","[3, 4, 5]",[548a2e3ddabfae9b40134fbc],"[53e9adbdb7602d97037be8a2, 53e9bb53b7602d97047..."
3,53e99784b7602d9701f3f5fe,Research on resource allocation for multi-tier...,2011,2,506,512,en,5,4,,10.1007/s11704-011-0127-6,,Resource allocation for multi-tier web applica...,Unknown,572de199d39c4f49934b3d5c,"[63, 64, 65]","[34, 35, 36, 37, 6, 39, 40, 41, 42, 43, 44]","[6, 7, 8]",[53f46a22dabfaee0d9c3d5e5],"[53e9a073b7602d9702957efa, 53e9ad87b7602d97037..."
4,53e99784b7602d9701f3f95d,FCLOS,2009,0,192,220,en,68,2,,10.1016/j.datak.2008.09.003,,Mobile online analytical processing (mOLAP) en...,North-Holland,53a72ynxodzg9j1giddd7ebg,"[66, 67, 68]","[45, 46, 47, 35, 49, 6, 51, 52, 53, 54]","[9, 10, 8]","[53f43b64dabfaefedbaf97e4, 53f43354dabfaedd74d...","[53e99ee0b7602d97027ae130, 53e9aca7b7602d97036..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,599c7c65601a182cd27aa14d,Graph Separators: A Parameterized View.,2001,0,318,327,,,,,,,,Unknown,53a72aef20f7420be8c12039,[],"[3716, 15, 6183, 3571, 7702, 11079, 10030, 6, ...",[2814],"[53f43c31dabfaee2a1d1aafe, 5409158ddabfae450f4...","[53e9b1d1b7602d9703c6492f, 53e9ab07b7602d97034..."
1110,599c7b52601a182cd272714a,Parameterized Complexity: Exponential Speed-Up...,2001,0,261,272,,,,,,,,Unknown,53a72a4520f7420be8bf8d9b,[],"[11085, 511, 503, 3571, 101, 11090, 6189, 257,...",[2815],"[53f43c31dabfaee2a1d1aafe, 5409158ddabfae450f4...","[599c7c65601a182cd27aa14d, 53e9aac3b7602d97034..."
1111,599c7a31601a182cd2699edc,Vulnerability of water quality in intensively ...,2005,0,379,380,,20,4,,10.1016/j.envsoft.2004.05.002,,,Elsevier,53909e8120f70186a0e2ca18,[],"[11096, 11097, 11098, 11099, 8391, 11101, 1110...","[2816, 2817, 8]",[85],[53e9b923b7602d97045101f6]
1112,599c7b6a601a182cd2735703,Statistical timing for parametric yield predic...,2003,0,932,937,,,,,,,,Unknown,53a7253820f7420be8b4823b,[],"[11, 11107, 4444, 5252, 11110, 11111, 6, 348, ...",[2819],"[86, 53f4b6d1dabfaedce564bfc0, 87, 88, 53f4302...","[53e9a8a9b7602d97031f6bb9, 53e9aad9b7602d97034..."


We keep only the attributes we are interested in, since in many cases the other attributes are null.

In [None]:
# Project each dataframe onto the columns that are loaded into Spark below.
AUTHOR_COLUMNS = ['_id', 'firstname', 'lastname']
VENUE_COLUMNS = ['_id', 'name_d', 'type', 'raw', 'publisher']

authors = authors.loc[:, AUTHOR_COLUMNS]
venue = venue.loc[:, VENUE_COLUMNS]

We create the authors_spark dataframe

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, LongType
# Authors: three string columns, all declared non-nullable.
authors_schema = StructType([
    StructField(column, StringType(), nullable=False)
    for column in ("_id", "firstname", "lastname")
])

authors_spark = spark.createDataFrame(authors, schema=authors_schema)
authors_spark.printSchema()
authors_spark.show(truncate=False)

root
 |-- _id: string (nullable = false)
 |-- firstname: string (nullable = false)
 |-- lastname: string (nullable = false)

+------------------------+------------+----------------+
|_id                     |firstname   |lastname        |
+------------------------+------------+----------------+
|53f46797dabfaeb22f542630|Jairo       |Rocha           |
|54328883dabfaeb4c6a8a699|Theo        |Pavlidis        |
|53f43b03dabfaedce555bf2a|Min         |Pan             |
|53f45ee9dabfaee43ecda842|Chris       |C. N. Chu       |
|53f42e8cdabfaee1c0a4274e|Hai         |Zhou            |
|548a2e3ddabfae9b40134fbc|Harry       |M. Sneed        |
|53f46a22dabfaee0d9c3d5e5|Shuguo      |Yang            |
|53f43b64dabfaefedbaf97e4|Ilias       |Michalarias     |
|53f43354dabfaedd74d80e7b|Arkadiy     |Omelchenko      |
|53f443b6dabfaeecd69a25b7|Hans-Joachim|Lenz            |
|53f437b0dabfaedce553b065|Mario       |Zuehlke         |
|53f7ba9cdabfae9060ae1f26|Hartmut     |König           |
|53f43640dabfaedf435

We create the venue_spark dataframe

In [None]:
# Venues: string columns only; 'raw' and 'publisher' are the nullable ones.
venues_schema = StructType([
    StructField(column, StringType(), nullable=is_nullable)
    for column, is_nullable in (
        ("_id", False),
        ("name_d", False),
        ("type", False),
        ("raw", True),
        ("publisher", True),
    )
])

venue_spark = spark.createDataFrame(venue, schema=venues_schema)
venue_spark.printSchema()
venue_spark.show(truncate=False)

root
 |-- _id: string (nullable = false)
 |-- name_d: string (nullable = false)
 |-- type: string (nullable = false)
 |-- raw: string (nullable = true)
 |-- publisher: string (nullable = true)

+------------------------+-------------------------------------------------------------------------------------------------------+----------+-------------------------------------------------------------------------------------------------------+-------------+
|_id                     |name_d                                                                                                 |type      |raw                                                                                                    |publisher    |
+------------------------+-------------------------------------------------------------------------------------------------------+----------+-------------------------------------------------------------------------------------------------------+-------------+
|53a72a4920f7420be8bfa51b|

We create the fos_spark, keywords_spark and urls_spark dataframes

In [None]:
# Fos, keywords and urls
def _lookup_schema(name_column, index_column):
  """Schema of a lookup table: a non-nullable string name and a non-nullable long index."""
  return StructType([
      StructField(name_column, StringType(), nullable=False),
      StructField(index_column, LongType(), nullable=False),
  ])

fos_schema = _lookup_schema("fos_name", "fos_index")
fos_spark = spark.createDataFrame(fos, schema=fos_schema)
fos_spark.printSchema()
fos_spark.show(truncate=False)

keywords_schema = _lookup_schema("key_name", "key_index")
keywords_spark = spark.createDataFrame(keywords, schema=keywords_schema)
keywords_spark.printSchema()
keywords_spark.show(truncate=False)

urls_schema = _lookup_schema("url_name", "url_index")
urls_spark = spark.createDataFrame(urls, schema=urls_schema)
urls_spark.printSchema()
urls_spark.show(truncate=False)

root
 |-- fos_name: string (nullable = false)
 |-- fos_index: long (nullable = false)

+----------------------------------+---------+
|fos_name                          |fos_index|
+----------------------------------+---------+
|feature (computer vision)         |0        |
|handwriting recognition           |1        |
|feature extraction                |2        |
|artificial intelligence           |3        |
|feature (machine learning)        |4        |
|optical character recognition     |5        |
|computer science                  |6        |
|intelligent word recognition      |7        |
|document processing               |8        |
|intelligent character recognition |9        |
|pattern recognition               |10       |
|static timing analysis            |11       |
|statistics                        |12       |
|sequential logic                  |13       |
|statistical static timing analysis|14       |
|algorithm                         |15       |
|clock skew         

We create the publications_spark dataframe

In [None]:
# Publications: (column name, Spark type, nullable) in the same order as the
# columns of the pandas `publications` dataframe.
publication_fields = [
    ("_id", StringType(), False),
    ("title", StringType(), False),
    ("year", IntegerType(), False),
    ("n_citation", IntegerType(), True),
    ("page_start", StringType(), True),
    ("page_end", StringType(), True),
    ("lang", StringType(), True),
    ("volume", StringType(), True),
    ("issue", StringType(), True),
    ("isbn", StringType(), True),
    ("doi", StringType(), False),
    ("pdf", StringType(), True),
    ("abstract", StringType(), True),
    ("publisher", StringType(), True),
    ("venue_id", StringType(), True),
    # Links to the keywords / fos / urls lookup tables (lists of indexes).
    ("keywords", ArrayType(LongType()), True),
    ("fos", ArrayType(LongType()), True),
    ("url", ArrayType(LongType()), True),
    # Lists of string ids.
    ("authors", ArrayType(StringType()), True),
    ("references", ArrayType(StringType()), True),
]
publications_schema = StructType([
    StructField(column, spark_type, nullable=is_nullable)
    for column, spark_type, is_nullable in publication_fields
])

publications_spark = spark.createDataFrame(publications, schema=publications_schema)
publications_spark.printSchema()
publications_spark.show(truncate=False)

root
 |-- _id: string (nullable = false)
 |-- title: string (nullable = false)
 |-- year: integer (nullable = false)
 |-- n_citation: integer (nullable = true)
 |-- page_start: string (nullable = true)
 |-- page_end: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- issue: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- doi: string (nullable = false)
 |-- pdf: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- venue_id: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- url: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)