## Preprocessing of data for cypher

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
sns.set()
import re
from itertools import combinations
from collections import defaultdict
import json
import csv
  

### Reading the json files

In [2]:
file_0 = []
for line in open('dblp-ref-0.json', encoding='utf-8-sig'):
    file_0.append(json.loads(line))

file_1 = []
for line in open('dblp-ref-1.json', encoding='utf-8-sig'):
    file_1.append(json.loads(line))

file_2 = []
for line in open('dblp-ref-2.json', encoding='utf-8-sig'):
    file_2.append(json.loads(line))

file_3 = []
for line in open('dblp-ref-3.json', encoding='utf-8-sig'):
    file_3.append(json.loads(line))

### Merging the 4 files into one list and convert into a data frame

In [3]:
# merginf the 4 lists into 1
inout_data_list = file_0 + file_1 + file_2 + file_3

# saving the dataset into a csv - table format
with open('output.csv', 'w',encoding='utf-8-sig', newline='') as f:
    fieldnames = ['abstract', 'authors', 'n_citation', 'references', 'title', 'venue', 'year', 'id']
    writer = csv.DictWriter(f, fieldnames=fieldnames)

    writer.writeheader()
    for row in inout_data_list:
        writer.writerow(row)

# reading the csv files to create a dataframe for our preprocess
data_for_neo_4j=pd.read_csv('output.csv', encoding='utf-8')
data_for_neo_4j.head()

Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,,"['Tegegne Marew', 'Doo-Hwan Bae']",1,"['2134bf3b-fd89-4724-90ce-5993b4fa3218', '906c...",Using Classpects for Integrating Non-Functiona...,international conference on software engineering,2006,01f1d231-80ae-4cce-b56c-9d821e0924d0
1,,"['Lei Zhang', 'Xuan Zhang', 'Meiping Chai', 'Y...",2,"['3e3b524c-70c5-4008-b349-fd7ae950e655', '4929...",Solution Proposals for Japan-Oriented Offshore...,international conference on software engineering,2009,0e6ce7a9-6456-437b-9f3f-4bda192a6fae
2,,"['Dongyun Liu', 'Hong Mei']",39,"['4b837f17-7e38-4175-82bc-daa37f162933', '65ac...",Mapping Requirements to Software Architecture ...,international conference on software engineering,2003,10c7185a-f2b7-4810-b1d6-1340c2949922
3,IEEE 802.11e Medium Access Control (MAC) is an...,"['N. Sai Shankar', 'Sunghyun Choi']",50,,QoS Signaling for Parameterized Traffic in IEE...,Lecture Notes in Computer Science,2002,11f0bd37-ae5a-43e6-b14a-a59bc00fdd90
4,The aim of this paper is to develop an executa...,"['C. Graciani Díaz', 'Francisco-Jesús Martín-M...",50,['c17481ca-9511-4793-8dad-a2486e0b2713'],Specification of Adleman's Restricted Model Us...,Lecture Notes in Computer Science,2002,155dec16-36d6-44f4-976b-1afb5d1924af


### Removing lists' notation like square brackets from specific columns

In [4]:
data_for_neo_4j['authors'] = data_for_neo_4j['authors'].str.strip('[]')
data_for_neo_4j['references'] = data_for_neo_4j['references'].str.strip('[]')
data_for_neo_4j=data_for_neo_4j.drop('n_citation', axis=1)
data_for_neo_4j['references'] = data_for_neo_4j['references'].astype(str).str.replace("[']", "", regex=True)
data_for_neo_4j['authors'] = data_for_neo_4j['authors'].astype(str).str.replace("[']", "", regex=True)
data_for_neo_4j['authors'] = data_for_neo_4j['authors'].astype(str).str.replace('["]', "", regex=True)
data_for_neo_4j.to_csv('C:/Users/cob_n/.Neo4jDesktop/relate-data/dbmss/dbms-1fc8a5b8-6b7e-428c-b356-e52027d827bf/import/data_for_neo_4j.csv',header=False, index=False, encoding='utf-8' )


In [5]:
data_for_neo_4j.head()

Unnamed: 0,abstract,authors,references,title,venue,year,id
0,,"Tegegne Marew, Doo-Hwan Bae","2134bf3b-fd89-4724-90ce-5993b4fa3218, 906c17e0...",Using Classpects for Integrating Non-Functiona...,international conference on software engineering,2006,01f1d231-80ae-4cce-b56c-9d821e0924d0
1,,"Lei Zhang, Xuan Zhang, Meiping Chai, Yibing Ta...","3e3b524c-70c5-4008-b349-fd7ae950e655, 4929a7b3...",Solution Proposals for Japan-Oriented Offshore...,international conference on software engineering,2009,0e6ce7a9-6456-437b-9f3f-4bda192a6fae
2,,"Dongyun Liu, Hong Mei","4b837f17-7e38-4175-82bc-daa37f162933, 65acba26...",Mapping Requirements to Software Architecture ...,international conference on software engineering,2003,10c7185a-f2b7-4810-b1d6-1340c2949922
3,IEEE 802.11e Medium Access Control (MAC) is an...,"N. Sai Shankar, Sunghyun Choi",,QoS Signaling for Parameterized Traffic in IEE...,Lecture Notes in Computer Science,2002,11f0bd37-ae5a-43e6-b14a-a59bc00fdd90
4,The aim of this paper is to develop an executa...,"C. Graciani Díaz, Francisco-Jesús Martín-Mateo...",c17481ca-9511-4793-8dad-a2486e0b2713,Specification of Adleman's Restricted Model Us...,Lecture Notes in Computer Science,2002,155dec16-36d6-44f4-976b-1afb5d1924af


### Last step of preprocessing

In [6]:
#pre-processing phase - remove square brackets, quotes, double quotes etc ([,],',')
data_for_neo_4j['authors'] = data_for_neo_4j['authors'].str.strip('[]')
data_for_neo_4j['references'] = data_for_neo_4j['references'].str.strip('[]')

#pre-processing phase - drop n_citation as it's not needed
data_for_neo_4j=data_for_neo_4j.drop('n_citation', axis=1)
data_for_neo_4j['references'] = data_for_neo_4j['references'].astype(str).str.replace("[']", "", regex=True)
data_for_neo_4j['authors'] = data_for_neo_4j['authors'].astype(str).str.replace("[']", "", regex=True)
data_for_neo_4j['authors'] = data_for_neo_4j['authors'].astype(str).str.replace('["]', "", regex=True)

#save to csv
data_for_neo_4j.to_csv('C:/Users/cob_n/.Neo4jDesktop/relate-data/dbmss/dbms-1fc8a5b8-6b7e-428c-b356-e52027d827bf/import/data_for_neo_4j.csv',header=False, index=False, encoding='utf-8' )

#pre-processing phase - split data to 3 different datasets and continue
articles_data=data_for_neo_4j[['id', 'title', 'year', 'venue', 'abstract']]
authors_data=data_for_neo_4j[['id','authors']]
references=data_for_neo_4j[['id','references']]

#pre-processing phase - split based on deimeter and creating rows with authors names
authors_data = authors_data.assign(author=authors_data['authors'].str.split(',')).explode('author').reset_index(drop=True)
authors_data=authors_data.drop(columns='authors')

#pre-processing phase - split based on deimeter and creating rows with refernces
references = references.assign(reference=references['references'].str.split(',')).explode('reference').reset_index(drop=True)
references=references.drop(columns='references')
references["reference"] = references["reference"].fillna('unknown')

#pre-processing phase - remove whitespaces
articles_data['id']=articles_data['id'].str.strip()
articles_data['title']=articles_data['title'].str.strip()
# articles_data['year']=articles_data['year'].str.strip()
articles_data['venue']=articles_data['venue'].str.strip()

references['id']=references['id'].str.strip()
references['reference']=references['reference'].str.strip()

authors_data['id']=authors_data['id'].str.strip()
authors_data['author']=authors_data['author'].str.strip()

#pre-processing phase - save to csv - there are the input data for Neo4j
articles_data.to_csv('C:/Users/cob_n/.Neo4jDesktop/relate-data/dbmss/dbms-1fc8a5b8-6b7e-428c-b356-e52027d827bf/import/articles_data.csv',header=False, index=False )
authors_data.to_csv('C:/Users/cob_n/.Neo4jDesktop/relate-data/dbmss/dbms-1fc8a5b8-6b7e-428c-b356-e52027d827bf/import/authors_data.csv',header=False, index=False )
references.to_csv('C:/Users/cob_n/.Neo4jDesktop/relate-data/dbmss/dbms-1fc8a5b8-6b7e-428c-b356-e52027d827bf/import/references.csv' , header=False, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_data['id']=articles_data['id'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_data['title']=articles_data['title'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_data['venue']=articles_data['venue'].str.strip()


### Appendix an attempt to run cypher queries through python

In [2]:
!pip3 install py2neo

Collecting py2neo
  Downloading py2neo-2021.2.3-py2.py3-none-any.whl (177 kB)
     -------------------------------------- 177.0/177.0 kB 1.1 MB/s eta 0:00:00
Collecting interchange~=2021.0.4
  Downloading interchange-2021.0.4-py2.py3-none-any.whl (28 kB)
Collecting pansi>=2020.7.3
  Downloading pansi-2020.7.3-py2.py3-none-any.whl (10 kB)
Collecting monotonic
  Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Installing collected packages: monotonic, pansi, interchange, py2neo
Successfully installed interchange-2021.0.4 monotonic-1.6 pansi-2020.7.3 py2neo-2021.2.3



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from py2neo import Graph
import pandas as pd

import matplotlib 
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import pandas as pd
from collections import Counter
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

In [4]:
query = """
MATCH (article:Articles) WHERE article.year IS NOT NULL
WITH article.year AS year, count(*) AS count
ORDER BY year
RETURN toString(year) AS year, count
"""
by_year = graph.run(query).to_data_frame()

ax = by_year.plot(kind='bar', x='year', y='count', legend=None, figsize=(15,8))
ax.xaxis.set_label_text("")
plt.tight_layout()
plt.show()

NameError: name 'graph' is not defined