# Data Engineering Project 
## ETL

**Authors**: 
- Dmitri Rozgonjuk
- Eerik Sven Puudist
- Lisanne Siniväli
- Cheng-Han Chung


The aim of this script is to clean the main raw data frame and write a new, clean data frame for further use. In this notebook, the comparisons of different read- and write-methods are demonstrated.

First, we install and import the necessary libraries from one cell (to avoid having libraries in some individual cells below). The packages and their versions to be installed will later be added to the `requirements.txt` file.

We also use this section to set global environment parameters.

In [None]:
!conda install psycopg2 -y
!pip install -r requirements.txt

In [None]:
## NB!! run the installs from terminal
########### Library Installations ##############

################### Imports ####################
### Data wrangling
import pandas as pd # working with dataframes
import numpy as np # vector operationsõ


### Specific-purpose libraries
# NB! Most configure with an API key
#from pybliometrics.scopus import AbstractRetrieval
from habanero import Crossref # CrossRef API
from genderize import Genderize # Gender API

### Misc
from math import floor
import time
import requests
import warnings # suppress warnings
import os # accessing directories
from tqdm import tqdm # track loop runtime
from unidecode import unidecode # international encoding fo names

### Custom Scripts (ETL, augmentations, SQL)
from scripts.raw_to_tables import *
from scripts.augmentations import *
from scripts.final_tables import *
from scripts.sql_queries import *
#from scripts.neo4j_queries import *

### Database drivers
import psycopg2
#from neo4j import GraphDatabase

########## SETTING ENV PARAMETERS ################
warnings.filterwarnings('ignore') # suppress warnings

## Pipeline start

In [None]:
# Mark the start of the pipeline; the runtime report at the end of the
# notebook reads `start_pipe`.
start_pipe = time.time()

# First check if the tables are already in the system
## If tables exist, import from .csv
# NOTE(review): the check expects exactly 8 entries in './tables', while only
# six CSV files are read below -- confirm what the remaining entries are.
if os.path.exists('./tables') and len(os.listdir('./tables')) == 8: # directory + 7 tables
    print('Tables exist...')
    author = pd.read_csv('./tables/author.csv')
    # Bound to `authorship` (the original cell used the misspelled name
    # `authorshiphip`, so `authorship` was never defined by this branch).
    authorship = pd.read_csv('./tables/authorship.csv')
    article = pd.read_csv('./tables/article.csv')
    article_category = pd.read_csv('./tables/article_category.csv')
    category = pd.read_csv('./tables/category.csv')
    journal = pd.read_csv('./tables/journal.csv')
    print('Tables are in the working directory!')

## If tables do not exist, pull from kaggle (or local machine), preprocess to tables
else:
    print('Preparing tables...')
    print()
    # The return value (if any) is not used here.
    ingest_and_prepare()
    print('Tables are in the working directory!')

# 2. Loading Clean Data or Data Augmentation

In [None]:
# Build the article and journal tables first
article = article_ready()
journal = journal_ready()

# Keep only the articles whose journal ISSN is present in the journal table
has_known_journal = article['journal_issn'].isin(journal['journal_issn'])
article = article[has_known_journal].reset_index(drop = True)
# Persist the filtered articles, overwriting 'article.csv' in 'data_ready'
article.to_csv('./data_ready/article.csv', index = False)

# The remaining tables are derived from the filtered article table
authorship = authorship_ready(article)
author = author_ready(article, authorship)
article_category = article_category_ready(article)
category = category_ready(article_category)

### Author update and augments
In order to query the 'gender' of a given author, we first extract all valid first names (length > 3). We acknowledge that some first names are shorter than four characters, but given that the number of queries is limited, we opt for a more robust approach that extracts as many names as possible.

### Journal
In order to get the journal information, we need the journal ISSN list from the `article` table. Although the journal Impact Factor is a more common metric, it is trademarked and, hence, retrieving it is not open-source. The alternative is to use SNIP (source-normalized impact per publication). This is the average number of citations per publication, corrected for differences in citation practice between research domains. Fortunately, the list of journals and their SNIP is available from the CWTS website (https://www.journalindicators.com/).

# 3. From Pandas to PostgreSQL

In [3]:
# Read the prepared tables back from the 'data_ready' directory
article = pd.read_csv('data_ready/article.csv')
author = pd.read_csv('data_ready/author.csv')
authorship = pd.read_csv('data_ready/authorship.csv')
category = pd.read_csv('data_ready/category.csv')
article_category = pd.read_csv('data_ready/article_category.csv')
journal = pd.read_csv('data_ready/journal.csv')

# Pair every frame with its table name; the name is attached as a plain
# attribute so that it can be printed while the table is inserted
labelled_frames = (
    (article, 'article'),
    (author, 'author'),
    (authorship, 'authorship'),
    (category, 'category'),
    (article_category, 'article_category'),
    (journal, 'journal'),
)
for frame, label in labelled_frames:
    frame.name = label

# Tables in the order in which they are inserted later on
tables = [frame for frame, _ in labelled_frames]

In [4]:
# Insert into tables (helper function)
def insert_to_tables(table, query, cursor=None):
    ''' Helper function for inserting values to PostgreSQL tables.

    Every row of 'table' is passed, as a list of values, to one execution of
    'query'. A failure is reported together with the underlying error but is
    not re-raised, so a loop over several tables keeps going. Rows executed
    before the failure are not undone by this function.

    Args:
        table (pd.DataFrame): pandas table; its 'name' attribute (when set) is
            used in the progress messages
        query (SQL query): corresponding SQL query for 'table' for data insertion in DB
        cursor (optional): object with an 'execute(query, params)' method used
            to run the query; defaults to the notebook-level 'cur'
    '''
    # Fall back to the global cursor so existing two-argument calls still work
    db_cursor = cur if cursor is None else cursor
    table_name = getattr(table, 'name', '<unnamed>')

    print(f'Inserting table -- {table_name} -- ...')

    try:
        for _, row in table.iterrows():
            db_cursor.execute(query, list(row))
    except Exception as err:
        # Report what went wrong instead of silently swallowing the error
        print(f'Error with table -- {table_name} -- ({type(err).__name__}: {err})')
    else:
        print(f'Table -- {table_name} -- successfully inserted!')
    print()

In [9]:
    # Connect to the database
db_settings = {
    "host": "localhost",
    "user": "postgres",
    "password": "postgres",
    "database": "postgres",
}
conn = psycopg2.connect(**db_settings)
conn.set_session(autocommit=True)  # every statement is committed immediately
cur = conn.cursor()

# Recreate the research_db database from scratch with UTF8 encoding
# NOTE(review): the session stays connected to "postgres"; nothing in the
# visible cells reconnects to research_db -- confirm where the tables are
# meant to live.
for statement in (
    "DROP DATABASE IF EXISTS research_db",
    "CREATE DATABASE research_db WITH ENCODING 'utf8' TEMPLATE template0",
):
    cur.execute(statement)

In [6]:
# Run the queries of `drop_tables` first, then those of `create_tables`,
# committing after every single statement
for statements in (drop_tables, create_tables):
    for query in statements:
        cur.execute(query)
        conn.commit()

In [7]:
# Insert every table with the insert query found at the same position in
# `insert_tables`, showing a progress bar over the tables
for position, frame in enumerate(tqdm(tables)):
    insert_to_tables(frame, insert_tables[position])

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Inserting table -- article -- ...


 17%|██████████████                                                                      | 1/6 [00:32<02:41, 32.39s/it]

Table -- article -- successfully inserted!

Inserting table -- author -- ...


 33%|████████████████████████████                                                        | 2/6 [01:28<03:05, 46.43s/it]

Table -- author -- successfully inserted!

Inserting table -- authorship -- ...


 67%|████████████████████████████████████████████████████████                            | 4/6 [03:11<01:27, 43.70s/it]

Table -- authorship -- successfully inserted!

Inserting table -- category -- ...
Table -- category -- successfully inserted!

Inserting table -- article_category -- ...


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [04:13<00:50, 50.39s/it]

Table -- article_category -- successfully inserted!

Inserting table -- journal -- ...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [04:15<00:00, 42.62s/it]

Table -- journal -- successfully inserted!






# Database Connection

In [16]:
#%load_ext sql
%sql postgresql://postgres:postgres@localhost/postgres

## Load the possibility to run magic functions

# Test Queries

In [17]:
%sql SELECT * FROM authorship LIMIT 10;

 * postgresql://postgres:***@localhost/postgres
10 rows affected.


article_id,author_id
1001.0001,KrotovD
1001.0001,HedenO
1001.0041,IndykP
1001.0041,SzarekS
1001.0361,GargouriY
1001.0361,HajjemC
1001.0361,LariviereV
1001.0361,GingrasY
1001.0361,CarrL
1001.0361,BrodyT


In [18]:
%sql SELECT * FROM article_category LIMIT 10;

 * postgresql://postgres:***@localhost/postgres
10 rows affected.


article_id,category_id
1001.0001,cs.IT
1001.0001,math.IT
1001.0041,math.MG
1001.0041,cs.CC
1001.0041,math.FA
1001.0361,cs.CY
1001.0361,cs.DL
1001.0639,cs.DS
1001.0641,cs.LO
1001.0641,cs.GT


In [19]:
%sql SELECT * FROM article LIMIT 10;

 * postgresql://postgres:***@localhost/postgres
10 rows affected.


article_id,title,doi,n_authors,journal_issn,type,n_cites,year
704.0046,A limit relation for entropy and channel capacity per unit cost,10.1063/1.2779138,3,0022-2488,journal-article,6.0,2009
704.0062,On-line Viterbi Algorithm and Its Relationship to Random Walks,10.1007/978-3-540-74126-8_23,3,0302-9743,book-chapter,5.0,2010
704.0098,Sparsely-spread CDMA - a statistical mechanics based analysis,10.1088/1751-8113/40/41/004,2,1751-8113,journal-article,26.0,2009
704.0217,Capacity of a Multiple-Antenna Fading Channel with a Quantized Precoding  Matrix,10.1109/TIT.2008.2011437,2,0018-9448,journal-article,113.0,2010
704.0301,Differential Recursion and Differentially Algebraic Functions,10.1145/1507244.1507252,1,1529-3785,journal-article,4.0,2009
704.0954,Sensor Networks with Random Links: Topology Design for Distributed  Consensus,10.1109/TSP.2008.920143,2,1053-587X,journal-article,186.0,2009
704.1267,Text Line Segmentation of Historical Documents: a Survey,10.1007/s10032-006-0023-z,3,1433-2833,journal-article,253.0,2007
704.1308,Antenna Combining for the MIMO Downlink Channel,10.1109/T-WC.2008.070383,1,1536-1276,journal-article,111.0,2016
704.1524,GLRT-Optimal Noncoherent Lattice Decoding,10.1109/TSP.2007.894237,3,1053-587X,journal-article,36.0,2009
704.1751,Information Theoretic Proofs of Entropy Power Inequalities,10.1109/TIT.2010.2090193,1,0018-9448,journal-article,106.0,2016


In [22]:
%sql SELECT * FROM author LIMIT 10;

 * postgresql://postgres:***@localhost/postgres
10 rows affected.


author_id,last_name,first_name,middle_name,gender,total_pubs,total_cites,avg_cites,med_coauthors,n_unique_coauthors,hindex,rank_total_pubs,rank_total_cites,rank_avg_cites,rank_hindex
WangX,Wang,Xingbo,,,167,6958,41.665,3.0,349,39,2,10,6632,1
PoorH,Poor,H,Vincent,,78,6876,88.154,3.0,147,39,20,11,2802,1
ZhangJ,Zhang,Jiawei,,F,156,6449,41.34,3.0,350,37,4,14,6651,3
AbramoG,Abramo,Giovanni,,M,96,2941,30.635,2.0,20,33,9,90,9238,5
LiuY,Liu,Ying,,,159,5562,34.981,3.0,398,33,3,27,8057,5
ZhangR,Zhang,Ruimao,,,70,7612,108.743,2.0,121,33,26,7,2124,5
DAngeloC,DAngelo,Ciriaco,Andrea,,97,2926,30.165,2.0,23,32,8,93,9303,7
WangY,Wang,Yongqing,,,174,4627,26.592,3.0,445,30,1,56,10550,8
ZhangY,Zhang,Yichen,,M,133,3952,29.714,3.0,318,30,5,64,9540,8
DasS,Das,Sourav,,M,72,3024,42.0,2.0,100,29,22,83,6551,10


In [None]:
%sql SELECT * FROM journal LIMIT 10;

In [52]:
# Author whose co-authorships are inspected. Kept in its own variable so that
# the `author` DataFrame is not overwritten by a string (it is still needed
# as a table, e.g. by `add_author(conn, author)`).
focal_author = 'WangX'
# Get the articles
papers = authorship[authorship['author_id'] == focal_author]['article_id'].values

# Get all authors
co_authors = authorship[authorship['article_id'].isin(papers)]

# N pubs with unique co-authors
npubs_coauthors = co_authors[co_authors['author_id'] != focal_author].groupby(['author_id']).size()

# TODO: n cites with unique co-authors (not implemented yet)


In [54]:
npubs_coauthors

author_id
AbrahamssonP    12
AggarwalV        7
AshraphijuoM     5
BaiX             1
BajwaS           1
                ..
ZieglerV         1
ZouL             1
ZouY             3
ZuoZ             1
deVisserC        1
Length: 349, dtype: int64

In [41]:
%%sql query_one <<
-- Authors ordered by rank_total_pubs, limited to 3 * (number of authors) / 100
-- rows, i.e. roughly the top 3 percent by publication count.
-- The result set is stored in the local variable query_one.
-- NOTE(review) ORDER BY has no tie-breaker, so which of several equally
-- ranked authors falls inside the LIMIT is not guaranteed.
SELECT author_id, rank_total_pubs as rank, total_pubs as publications
FROM author 
ORDER BY rank_total_pubs 
LIMIT  3 * (SELECT COUNT(*) FROM author) / 100;

 * postgresql://postgres:***@localhost/postgres
1686 rows affected.
Returning data to local variable query_one


In [42]:
query_one

author_id,rank,publications
WangY,1,174
WangX,2,167
LiuY,3,159
ZhangJ,4,156
ZhangY,5,133
WangZ,6,107
WangH,7,99
DAngeloC,8,97
WangJ,9,96
AbramoG,9,96


In [47]:
%%sql query_two <<
-- For the top 3 percent of authors (by rank_total_pubs) report the journal(s)
-- in which each author has the most articles, plus that count as a share of
-- the author's total_pubs.
-- "final" and "final1" are the same per-author, per-journal article count.
-- The LEFT JOIN on final.number < final1.number combined with
-- WHERE final1.author_id IS NULL keeps only the rows with the highest count,
-- so an author with tied journals appears once per tied journal.
-- NOTE(review) number * 100 / publications looks like integer division (if
-- total_pubs is an integer column), which truncates the percentage - confirm.
-- NOTE(review) journal_title is also a GROUP BY column, so mode() over each
-- group presumably equals that title and the HAVING clause filters nothing
-- (apart from NULL titles) - verify.
SELECT final.author_id, final.rank, final.publications, final.journal_title as top_journal,  TO_CHAR((final.number * 100 / final.publications), 'fm99%') as percentage_of_all_publications
FROM (select a.author_id, rank, publications, mode() within group (order by j.journal_title) AS journal_title, COUNT(j.journal_title) as number
      from (SELECT author_id, rank_total_pubs as rank, total_pubs as publications
      FROM author 
      ORDER BY rank_total_pubs 
      LIMIT  3 * (SELECT COUNT(*) FROM author) / 100) AS a
      INNER JOIN authorship au ON a.author_id = au.author_id
      INNER JOIN article ar ON au.article_id = ar.article_id
      INNER JOIN journal j ON ar.journal_issn = j.journal_issn
      group by a.author_id, rank, publications,j.journal_title
      having j.journal_title = mode() within group (order by j.journal_title)) as final
LEFT JOIN (select a.author_id, rank, publications, mode() within group (order by j.journal_title) AS journal_title, COUNT(j.journal_title) as number
      from (SELECT author_id, rank_total_pubs as rank, total_pubs as publications
      FROM author 
      ORDER BY rank_total_pubs 
      LIMIT  3 * (SELECT COUNT(*) FROM author) / 100) AS a
      INNER JOIN authorship au ON a.author_id = au.author_id
      INNER JOIN article ar ON au.article_id = ar.article_id
      INNER JOIN journal j ON ar.journal_issn = j.journal_issn
      group by a.author_id, rank, publications,j.journal_title
      having j.journal_title = mode() within group (order by j.journal_title)) as final1 ON 
    final.author_id = final1.author_id AND final.number < final1.number
WHERE final1.author_id IS NULL
ORDER BY final.rank 
LIMIT  3 * (SELECT COUNT(*) FROM author) / 100;


 * postgresql://postgres:***@localhost/postgres
1686 rows affected.
Returning data to local variable query_two


In [48]:
query_two

author_id,rank,publications,top_journal,percentage_of_all_publications
WangY,1,174,IEEE Transactions on Image Processing,7%
WangX,2,167,IEEE Transactions on Signal Processing,8%
LiuY,3,159,IEEE Transactions on Signal Processing,7%
LiuY,3,159,IEEE Transactions on Image Processing,7%
LiuY,3,159,Lecture Notes in Computer Science,7%
ZhangJ,4,156,IEEE Transactions on Wireless Communications,7%
ZhangY,5,133,IEEE Transactions on Image Processing,5%
ZhangY,5,133,Lecture Notes in Computer Science,5%
WangZ,6,107,IEEE Transactions on Image Processing,7%
WangH,7,99,Pattern Recognition,6%


In [55]:
%%sql query_three <<

-- For the top 3 percent of authors (by rank_total_pubs) report the year(s)
-- with the most articles and the number of articles in that year.
-- "final" and "final1" are the same per-author, per-year article count.
-- The LEFT JOIN on final.count_of_pub < final1.count_of_pub combined with
-- WHERE final1.author_id IS NULL keeps only the years with the highest count,
-- so an author with tied years appears once per tied year.
-- The result set is stored in the local variable query_three.
SELECT final.author_id, final.rank, final.publications, final.most_productive_year, final.count_of_pub
FROM (SELECT a.author_id, rank, publications, ar.year AS most_productive_year, count(ar.year) as count_of_pub
    FROM (SELECT author_id, rank_total_pubs as rank, total_pubs as publications
    FROM author 
    ORDER BY rank_total_pubs 
    LIMIT  3 * (SELECT COUNT(*) FROM author) / 100) AS a
    INNER JOIN authorship au ON a.author_id = au.author_id
    INNER JOIN article ar ON au.article_id = ar.article_id
    GROUP BY a.author_id, rank, publications, ar.year) as final
LEFT JOIN (SELECT a.author_id, rank, publications, ar.year AS most_productive_year, count(ar.year) as count_of_pub
    FROM (SELECT author_id, rank_total_pubs as rank, total_pubs as publications
    FROM author 
    ORDER BY rank_total_pubs 
    LIMIT  3 * (SELECT COUNT(*) FROM author) / 100) AS a
    INNER JOIN authorship au ON a.author_id = au.author_id
    INNER JOIN article ar ON au.article_id = ar.article_id
    GROUP BY a.author_id, rank, publications, ar.year) as final1 ON 
    final.author_id = final1.author_id AND final.count_of_pub < final1.count_of_pub
WHERE final1.author_id IS NULL
ORDER BY final.rank 
LIMIT  3 * (SELECT COUNT(*) FROM author) / 100;

 * postgresql://postgres:***@localhost/postgres
1686 rows affected.
Returning data to local variable query_three


In [56]:
query_three

author_id,rank,publications,most_productive_year,count_of_pub
WangY,1,174,2021,43
WangX,2,167,2020,27
LiuY,3,159,2020,33
LiuY,3,159,2022,33
LiuY,3,159,2021,33
ZhangJ,4,156,2022,35
ZhangY,5,133,2022,32
WangZ,6,107,2022,24
WangH,7,99,2022,25
DAngeloC,8,97,2018,83


In [61]:
%%sql query_four <<

-- For the top 3 percent of authors (by rank_total_pubs) report the year(s)
-- with the highest average citations per article, the number of articles in
-- that year and the average itself.
-- "final" and "final1" are the same per-author, per-year aggregate. The
-- LEFT JOIN on final.avg_cites < final1.avg_cites combined with
-- WHERE final1.author_id IS NULL keeps only the years with the highest average.
-- NOTE(review) avg_cites is the citation sum cast to int and divided by
-- count(ar.year), i.e. an integer division that truncates the average -
-- confirm this is intended.
-- The result set is stored in the local variable query_four.
SELECT final.author_id, final.rank, final.year AS most_influential_year, final.pub AS count_of_pub, final.avg_cites
FROM (SELECT a.author_id, rank, count(ar.year) as pub, ar.year, (sum(ar.n_cites::DECIMAL)::int) / count(ar.year) as avg_cites
    FROM (SELECT author_id, rank_total_pubs as rank
    FROM author
    ORDER BY rank_total_pubs 
    LIMIT  3 * (SELECT COUNT(*) FROM author) / 100) AS a
    INNER JOIN authorship au ON a.author_id = au.author_id
    INNER JOIN article ar ON au.article_id = ar.article_id
    GROUP BY a.author_id, rank, ar.year) as final
LEFT JOIN (SELECT a.author_id, rank, count(ar.year) as pub, ar.year, (sum(ar.n_cites::DECIMAL)::int) / count(ar.year) as avg_cites
    FROM (SELECT author_id, rank_total_pubs as rank
    FROM author 
    ORDER BY rank_total_pubs 
    LIMIT  3 * (SELECT COUNT(*) FROM author) / 100) AS a
    INNER JOIN authorship au ON a.author_id = au.author_id
    INNER JOIN article ar ON au.article_id = ar.article_id
    GROUP BY a.author_id, rank, ar.year) as final1 ON 
    final.author_id = final1.author_id AND final.avg_cites < final1.avg_cites
WHERE final1.author_id IS NULL
ORDER BY final.rank 
LIMIT  3 * (SELECT COUNT(*) FROM author) / 100;



 * postgresql://postgres:***@localhost/postgres
1686 rows affected.
Returning data to local variable query_four


In [62]:
query_four

author_id,rank,most_influential_year,count_of_pub,avg_cites
WangY,1,2018,10,180
WangX,2,2016,21,108
LiuY,3,2016,11,186
ZhangJ,4,2015,5,188
ZhangY,5,2015,3,236
WangZ,6,2012,1,84
WangH,7,2014,2,182
DAngeloC,8,2018,83,32
AbramoG,9,2018,84,32
WangJ,9,2014,2,198


# 4. Preparing Graph DB Data

- about network analysis with these data in Neo4J: https://medium.com/swlh/network-analysis-of-arxiv-dataset-to-create-a-search-and-recommendation-engine-of-articles-cd18b36a185e

- link prediction: https://towardsdatascience.com/link-prediction-with-neo4j-part-2-predicting-co-authors-using-scikit-learn-78b42356b44c

The Graph Database Schema is pictured below:
<img src="images/graph_db_schema.png"/>

Tutorial: https://www.youtube.com/watch?v=PfySvVqHAWo&t=33s

In [9]:
# Open the graph-database connection at 'bolt://neo:7687' with empty user/password.
# NOTE(review): this rebinds `conn`, which earlier held the psycopg2 connection.
# NOTE(review): Neo4jConnection is not imported explicitly in the visible cells
# (the `scripts.neo4j_queries` import is commented out) -- confirm where it comes from.
conn = Neo4jConnection(uri='bolt://neo:7687', user='', pwd='')

In [10]:
# Print how many Article nodes the graph currently holds
result = conn.query('MATCH (n:Article) RETURN COUNT(n) AS ct')
first_record = result[0]
print(first_record['ct'])

40255


In [43]:
# Delete all nodes
# conn.query('MATCH (a) DELETE a')

[]

### Add constraints to ID variables

In [11]:
# Add ID uniqueness constraint to optimize queries
# One constraint per node label (Category, Journal, Author, Article), each on
# the `id` property.
# NOTE(review): `CREATE CONSTRAINT ON ... ASSERT` is the older Cypher form;
# newer Neo4j releases use `CREATE CONSTRAINT FOR ... REQUIRE` -- confirm
# against the server version in use.
conn.query('CREATE CONSTRAINT ON(n:Category) ASSERT n.id IS UNIQUE')
conn.query('CREATE CONSTRAINT ON(j:Journal) ASSERT j.id IS UNIQUE')
conn.query('CREATE CONSTRAINT ON(au:Author) ASSERT au.id IS UNIQUE')
conn.query('CREATE CONSTRAINT ON(ar:Article) ASSERT ar.id IS UNIQUE')

[]

### Ingest the data

In [None]:
# Pass each table to its add_* helper together with the graph connection.
# The four entity tables go first, then the two linking tables
# (article_category, authorship).
# NOTE(review): presumably the helpers expect the pandas tables loaded earlier
# in the notebook -- make sure none of these names has been rebound since.
add_category(conn, category)
add_journal(conn, journal)
add_author(conn, author)
add_article(conn, article)
add_article_category(conn, article_category)
add_authorship(conn, authorship)

{'total': 73959, 'batches': 74, 'time': 27.207443952560425}

{'total': 116083, 'batches': 117, 'time': 28.746549606323242}

In [18]:
# Print how many Author nodes the graph currently holds
result = conn.query('MATCH (n:Author) RETURN COUNT(n) AS ct')
first_record = result[0]
print(first_record['ct'])

56202


# 5. Example Queries

## 5.1. Data Warehouse

## 5.2. Graph Database

In [None]:
// Ego-network WITH the author
// Collects the articles of one author, then returns every article together
// with all of its authors (the selected author included).
MATCH (author:Author)-[:AUTHORED]->(article:Article) 
WHERE author.id = "GousiosG" // add specific name
WITH author, COUNT(article) AS number_of_articles, collect(article) AS articles
ORDER BY number_of_articles DESC 
LIMIT 1
UNWIND articles AS article
MATCH (coauthor:Author)-[:AUTHORED]->(article)
RETURN article, collect(coauthor), COUNT(article)

In [None]:
// Ego-network WITHOUT the author
// Picks the author with the most articles, then returns each of those
// articles together with its other authors.
// https://stackoverflow.com/questions/28816222/finding-a-list-of-neo4j-nodes-which-have-the-most-relationships-back-to-another
MATCH (author:Author)-[:AUTHORED]->(article:Article) 
WITH author, COUNT(article) AS number_of_articles, collect(article) AS articles
ORDER BY number_of_articles DESC 
LIMIT 1
UNWIND articles AS article
MATCH (coauthor:Author)-[:AUTHORED]->(article)
WHERE coauthor <> author
RETURN article, collect(coauthor)

## Total Pipeline Runtime

In [None]:
# Report when the pipeline started and ended, and how long it took.
# NOTE(review): `start_pipe` must have been set with time.time() when the
# pipeline started; it is not assigned in this cell.
end_pipe = time.time()

# The start line reports the start timestamp (it previously printed the end
# timestamp under the "start" label).
print(f'Time of pipeline start: {time.ctime(start_pipe)}')
print(f'Time of pipeline end: {time.ctime(end_pipe)}')
print(f'Total pipeline runtime: {(end_pipe - start_pipe)/60} min.')