# Data Parsing

Here we will try to parse the Aminer V12 json file into CSV and seperate the year 2001

In [1]:
import numpy as np
import pandas as pd
import os, json, gc, re, random
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns
import pickle
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

Defining the data file path

In [4]:
data_file = 'drive/My Drive/Capstone/Data/dblp.v12.json'


Using `yield` to load the JSON file in a loop to prevent Python memory issues if JSON is loaded directly

In [5]:
def get_metadata():
    with open(data_file, 'r', encoding='utf-8') as f:
        next(f)                                         #skip the first line
        for line in f:
            yield line
metadata = get_metadata()

In [6]:
paper_list = []
counter = 0
for paper in metadata:   
    paper = paper.strip(',')       # takes care of the errors with ',' in begining of the line
    if paper == None or paper == '' or paper == '[' or paper == ']' or len(paper) == 0:
        print('I got a null or empty string value for paper in a file')
    else:       
#         print(paper_dict)
#         print("")
#         doc_type = paper_dict.get('doc_type')
        try:
            paper_dict = json.loads(paper)
            year = int(paper_dict.get('year'))
            to_pop = ['page_start', 'page_end','doc_type','publisher','volume','issue','doi','alias_ids','title','indexed_abstract','fos','venue','authors']
            for i in to_pop:
                try:
                    paper_dict.pop(i)
                except:
                    pass 
            if year <= 2010:
              paper_list.append(paper_dict)
              counter += 1
            if (counter % 100000 == 0):
                print(counter)
                print(paper_dict)
        except:
            pass 
#paper_list

0
{'id': 1091, 'year': 2013, 'n_citation': 1, 'references': [2005687710, 2018037215]}
100000
{'id': 128134186, 'year': 2007, 'n_citation': 0}
200000
{'id': 1486651247, 'year': 2005, 'n_citation': 8, 'references': [1535182298, 1550040727, 1571142685, 2142152227]}
300000
{'id': 1532236086, 'year': 2004, 'n_citation': 4, 'references': [2061753595, 2099105076, 2135123541]}
400000
{'id': 1578639176, 'year': 2010, 'n_citation': 27, 'references': [1984249915, 1997556709, 2022553223, 2023199012, 2023758372, 2040244898, 2060353874, 2093331871, 2099997444, 2105893525, 2106281550, 2113922176, 2117088188, 2119494621, 2125669135, 2131499463, 2140871574, 2144333758, 2144377734, 2149874318, 2152621157, 2161537925, 2165741325, 2167306162, 2168803272]}
500000
{'id': 1790952784, 'year': 1995, 'n_citation': 0}
500000
{'id': 1790952937, 'year': 2012, 'n_citation': 0, 'references': [194197919, 1677409904, 2059331361, 2075133364, 2111308925, 2118877769, 2119605622, 2124386111, 2141720053, 2163943109]}
60000

In [7]:
# checking the length of data found
len(paper_list)

2262458

In [8]:
# now add the list into 
papers_df = pd.DataFrame(paper_list)

In [None]:
papers_df = papers_df[['id','year']]

In [9]:
papers_df.info()
papers_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2262458 entries, 0 to 2262457
Data columns (total 4 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   id          int64 
 1   year        int64 
 2   n_citation  int64 
 3   references  object
dtypes: int64(3), object(1)
memory usage: 69.0+ MB


Unnamed: 0,id,year,n_citation,references
0,1388,2000,1,
1,1688,2009,6,"[1560724230, 1986968751, 2156909104]"
2,5411,2009,0,
3,5781,2004,2,
4,6762,2003,0,


In [None]:
papers_df = papers_df.drop('year', axis=1)

Let's Write the paper dataFrame into pickle for later use

In [10]:
path = 'drive/My Drive/Capstone/Data/papers_0_2010.'
papers_df.to_pickle(path + 'pkl')

In [None]:
papers_df.to_csv(path + 'csv')