In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import sqlite3
import numpy as np
from Selector import Selector
from Tablecreator import TableCreator
from Inserter import Inserter

# DB CREATION AND INSERT

In [2]:
def _exec(DB, statement):
    DB.c.execute(statement)


def createDB(DB_name):
    testDB = TableCreator(DB_name)
    testDB.createArticle()
    testDB.createAuthor()
    testDB.createHas()
    testDB.createCitation()
    testDB.closeConnect()
    return

name = 'test.db'
createDB(name)

In [3]:
def insert_paper(DB_name, idd, article, authors, year, citations = '', citated = '', keywords = '', pages = '', volume = '', issue = '', abstract = ''):
    testDB = Inserter(DB_name)
    if testDB.insertArticle(idd, article, keywords, pages, year, volume, issue, abstract):
        for author in authors:
            testDB.insertAuthor(author)
            testDB.insertHas(idd, author)

        for cit in citations:
            testDB.insertCitations(idd, cit)
        for cit in citated:
            testDB.insertCitations(cit, idd)
        testDB.closeConnect()
    return

# Parsing DATA

In [4]:
# pip install elsapy

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
import requests

## Debug imports
from time import time
    
## Load configuration
con_file = open("config.json")
config = json.load(con_file)
con_file.close()

## Initialize client
client = ElsClient(config['apikey'])

In [5]:
def search(name, numres):
    res = []
    if numres != -1:
        return defsearch(numres, name)
    else:
        body = {
            "authors": name,
            "loadedAfter": "2010-01-01T00:00:00Z",
            "display": {
                "show": 1
            }
        }
        r = requests.put(base, data=json.dumps(body), headers=headers).json()
        return defsearch(r['resultsFound'], name)



def defsearch(numres, name):
    print('Found ' + str(numres) + ' results')
    base = 'https://api.elsevier.com/content/search/sciencedirect'
    headers = {
        'x-els-apikey': config['apikey'],
        'Content-Type': 'application/json'
    }
    res = []
    for offset in range(0, numres, 100):
            body = {
                "authors": name,
                "loadedAfter": "2010-01-01T00:00:00Z",
                "display": {
                    "offset": offset,
                    "show": min(100, numres - offset)
                }
            }
            r = requests.put(base, data=json.dumps(
                body), headers=headers).json()
            res += r['results']
    return res

# Insert ifo to DB

In [6]:
def push_info_to_db(searchres, name = 'test.db'):
    for elem in searchres:
        idd = elem['pii']
        article = elem['sourceTitle']
        auths = [author['name'] for author in elem['authors']]
        year = int(elem['publicationDate'][:4])
        insert_paper(name, idd, article, auths, year)

## Algorithm

In [7]:
from random import randint
from queue import Queue

q = Queue()

# get random search result
s = search('Markov', 1)
push_info_to_db(s)

# put first authors in the queue
for auth in s[0]['authors']:
    q.put(auth['name'])
    
# search co-authors in queue
threshold = 150
i = 0
while not q.empty() and i <= threshold:
    searchres = search(q.get_nowait(), randint(5, 10))
    push_info_to_db(searchres)
    for res in searchres:
        for author in res['authors']:
            q.put(author['name'])
    i += 1

Found 1 results
Found 9 results
Found 7 results
Found 6 results
Found 8 results
Found 5 results
Found 9 results
Found 5 results
Found 7 results
Found 7 results
Found 7 results
Found 5 results
Found 9 results
Found 5 results
Found 7 results
Found 10 results
Found 5 results
Found 5 results
Found 10 results
Found 8 results
Found 10 results
Found 6 results
Found 8 results
Found 9 results
Found 10 results
Found 7 results
Found 6 results
Found 7 results
Found 6 results
Found 7 results
Found 8 results
Found 9 results
Found 5 results
Found 5 results
Found 6 results
Found 10 results
Found 6 results
Found 6 results
Found 7 results
Found 6 results
Found 10 results
Found 8 results
Found 5 results
Found 9 results
Found 10 results
Found 9 results
Found 9 results
Found 10 results
Found 7 results
Found 10 results
Found 7 results
Found 6 results
Found 7 results
Found 8 results
Found 9 results
Found 5 results
Found 5 results
Found 6 results
Found 9 results
Found 5 results
Found 7 results
Found 7 results

# Make Authors Graph

In [8]:
def create_df_authors(DB_name):
    testDB = Selector(DB_name)
    df = testDB.make_df_authors()
    testDB.closeConnect()
    return df

In [9]:
def create_df_authors_for_year(DB_name, year):
    testDB = Selector(DB_name)
    df = testDB.make_df_for_year(year)
    testDB.closeConnect()
    return df

In [10]:
def create_graph_from_pandas_df(df):
    """ Takes pandas dataframe and create networkx graph. We suggest every row in df
        is an article with next columns: 'list of authors' (list of strings)
    """
    G = nx.Graph()
    
    for num, row in df.iterrows():
        authors_list = row['authors_list']
        # connect every one and update edges
        for i in range(len(authors_list)):
            for j in range(i + 1, len(authors_list)):
                from_, to_ = authors_list[i], authors_list[j]
                new_weight = (G[from_][to_]['weight'] if G.has_edge(from_, to_) else 0) + 1
                G.add_edge(from_, to_, weight=new_weight)
        
    return G    

I split graph into 5 independent pieces consisting of 5 years each. Also i use data for 2019 year as test for target.

In [222]:
name = 'test.db'
df = create_df_authors_for_year(name, 1994)
dffs = []
grapfs = []
for i in range(1990, 2020, 5):
    dffs.append(create_df_authors_for_year(name, i))
    dffs[-1] = dffs[-1].append(create_df_authors_for_year(name, i + 1))
    dffs[-1] = dffs[-1].append(create_df_authors_for_year(name, i + 2))
    dffs[-1] = dffs[-1].append(create_df_authors_for_year(name, i + 3))
    dffs[-1] = dffs[-1].append(create_df_authors_for_year(name, i + 4))
    grapfs.append(create_graph_from_pandas_df(dffs[-1]))
test = create_df_authors_for_year(name, 2019)
testgraph = create_graph_from_pandas_df(test)
G = create_graph_from_pandas_df(df)

In [223]:
len(list(G.adjacency()))
for i in grapfs:
    print(len(list(i.adjacency())))
print(len(list(testgraph.adjacency())))


13
29
31
30
2438
4859
941


Here i make features. I make:
common neighbours
Jaccard’s coefficient
Shortest path:
Katz score
node2vec embending of size 10
density of the graph


common neighbours
Jaccard’s coefficient
Shortest path:
Katz score
for same nodes in previous interval of years


Also i use only 10000 first node-pairs for train set and 1000 first node-pairs for test set as there are a lot of nodes and taking them all would take a lot of time

In [301]:
def make_features(grapha, graphb, emembs, prev):
    beta = 0.1
    subs = (grapha.subgraph(c) for c in nx.connected_components(grapha))
    nodes = list(grapha.nodes)
    features = []
    target = []
    wow = 0
    fl = False
    for i in range(len(nodes) - 1):
        if(fl):
            break
        for j in range(i + 1, len(nodes)):
            wow += 1
            if(wow % 1000 == 0):
                print(wow)
                fl = True
                break
            if not grapha.has_edge(nodes[i], nodes[j]):
                features.append([nodes[i], nodes[j]])
                if(graphb.has_edge(nodes[i], nodes[j])):
                #    print('daticho')
                    target.append(1)
                else:
                    target.append(0)
                features[-1].append(len(list(nx.common_neighbors(grapha, nodes[i], nodes[j]))))
                features[-1].append(len(list(nx.common_neighbors(grapha, nodes[i], nodes[j]))) / (grapha.degree(nodes[i]) + grapha.degree(nodes[j]) - len(list(nx.common_neighbors(grapha, nodes[i], nodes[j])))))
                if(nx.has_path(grapha, nodes[i], nodes[j])):
                    features[-1].append(nx.shortest_path_length(grapha, nodes[i], nodes[j]))
                    paths = nx.all_simple_paths(grapha, nodes[i], nodes[j], cutoff=5)
                    res = 0
                    for i11 in paths:
                        res += beta ** (len(i11))
                    features[-1].append(res)
                    #mcmf = nx.max_flow_min_cost(grapha, nodes[i], nodes[j], capacity='weight', weight='None')
                    #features[-1].append(mcmf)
                    #res = 0
                   # for i1 in mcmf[nodes[i]]:
                  #      res += mcmf[nodes[i]][i1]
                 #   features[-1].append(res)
                #    features[-1].append(nx.cost_of_flow(grapha, mcmf))
        #            print(features[-1][-1], features[-1][-2])
                else:
                    features[-1].append(10000)
                    features[-1].append(0)
                    #features[-1].append(0)
                for k in emembs[nodes[i], nodes[j]]:
                    features[-1].append(k)
                features[-1].append(nx.density(grapha))
                flflfl = True
                for k in prev:
                    if nodes[i] == k[0] and nodes[j] == k[1]:
                        features[-1].append(k[2])
                        features[-1].append(k[3])
                        features[-1].append(k[4])
                        features[-1].append(k[5])
                        flflfl = False
                        break
                    elif nodes[i] < k[0] or (nodes[i] == k[0] and nodes[j] < k[1]):
                        features[-1].append(0)
                        features[-1].append(0)
                        features[-1].append(0)
                        features[-1].append(0)
                        flflfl = False
                        break
                if(prev == [] or flflfl):
                    features[-1].append(0)
                    features[-1].append(0)
                    features[-1].append(0)
                    features[-1].append(0)
    return (features, target)

Here i pre-count embendings

In [224]:
ememembes = []
for i in grapfs:
    model = Node2Vec(i, dimensions=10, walk_length=5, num_walks=5)
    model = model.fit()
    ememembes.append(HadamardEmbedder(keyed_vectors=model.wv))






Computing transition probabilities:   0%|          | 0/13 [00:00<?, ?it/s]




Computing transition probabilities: 100%|██████████| 13/13 [00:00<00:00, 1446.12it/s]




Generating walks (CPU: 1):   0%|          | 0/5 [00:00<?, ?it/s]




Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 57.56it/s]




Computing transition probabilities:   0%|          | 0/29 [00:00<?, ?it/s]




Computing transition probabilities: 100%|██████████| 29/29 [00:00<00:00, 1320.29it/s]




Generating walks (CPU: 1):   0%|          | 0/5 [00:00<?, ?it/s]




Generating walks (CPU: 1):  80%|████████  | 4/5 [00:00<00:00, 34.25it/s]




Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 23.30it/s]




Computing transition probabilities:   0%|          | 0/31 [00:00<?, ?it/s]




Computing transition probabilities: 100%|██████████| 31/31 [00:00<00:00, 1552.74it/s]




Generating walks (CPU: 1):   0%|          | 0/5 [00:00<?, ?it/s]




Generating walks (CPU: 1):  80%|████████  | 4/5 [0

Computing transition probabilities:  46%|████▌     | 2219/4859 [00:04<00:05, 526.19it/s]




Computing transition probabilities:  48%|████▊     | 2312/4859 [00:04<00:04, 604.00it/s]




Computing transition probabilities:  49%|████▉     | 2383/4859 [00:04<00:03, 630.92it/s]




Computing transition probabilities:  50%|█████     | 2451/4859 [00:04<00:03, 615.43it/s]




Computing transition probabilities:  52%|█████▏    | 2516/4859 [00:04<00:04, 541.39it/s]




Computing transition probabilities:  53%|█████▎    | 2595/4859 [00:05<00:03, 596.70it/s]




Computing transition probabilities:  55%|█████▍    | 2660/4859 [00:05<00:04, 540.29it/s]




Computing transition probabilities:  56%|█████▌    | 2719/4859 [00:05<00:03, 551.44it/s]




Computing transition probabilities:  57%|█████▋    | 2783/4859 [00:05<00:03, 574.03it/s]




Computing transition probabilities:  59%|█████▊    | 2843/4859 [00:05<00:03, 580.15it/s]




Computing transition probabilities:  60%|██████    | 2931/4859 [00:05<

In [244]:
len(grapfs)

6

Here i make train set

In [286]:
feats1, tars1 = [], []
for i in range(5):
    if(i == 0):
        res = make_features(grapfs[i], grapfs[i + 1], ememembes[i], [])
    else:
        res = make_features(grapfs[i], grapfs[i + 1], ememembes[i], feats1[-1])
    feats1.append(res[0])
    tars1.append(res[1])
    print('ok')

ok
ok
ok
ok
10000
ok


In [308]:
trainF, trainT = [], []
for i in range(5):
    trainF += feats1[i]
    trainT += tars1[i]

In [309]:
maxxxxx = 21
for i in trainF:
    maxxxxx = min(len(i), maxxxxx)
print(maxxxxx, len(trainF[0]))
for i in range(len(trainF)):
    trainF[i] = trainF[i][2:]
    trainF[i] = np.array(trainF[i])
trainF = np.array(trainF)
trainF = (trainF - trainF.mean()) / trainF.std()

21 21


Here i train logreg model

In [289]:
from sklearn.linear_model import LogisticRegression

In [310]:
modelll = LogisticRegression()
modelll.fit(trainF, trainT)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Here i make test set

In [311]:
testF, testT = make_features(grapfs[5], testgraph, ememembes[5], feats1[-1])


1000


In [312]:
for i in range(len(testF)):
    if(i % 1000 == 999):
        print(i)
    testF[i] = testF[i][2:]
    testF[i] = np.array(testF[i])
testF = np.array(testF)
testF = (testF - trainF.mean()) / trainF.std()

In [313]:
predlr = modelll.predict(testF)

Here i train knn model

In [314]:
from sklearn.neighbors import KNeighborsClassifier
modelll2 = KNeighborsClassifier(3)

In [315]:
modelll2.fit(trainF, trainT)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [316]:
predknn = modelll2.predict(testF)

Here i train MLPclassifier

In [317]:
from sklearn.neural_network import MLPClassifier
modelll3 = MLPClassifier(alpha=1)

In [318]:
modelll3.fit(trainF, trainT)

MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [319]:
predmlp = modelll3.predict(testF)

In [320]:
reses = [[predknn, 'knn'], [predlr, 'logistic regression'], [predmlp, 'mlp']]

In [326]:
sum(testT)

0

In [327]:
sum(predknn)

0

In [328]:
sum(predlr)

0

In [329]:
sum(predmlp)

0

It seems, that there were no ones in test target(in train target there were less than 1% of ones). All classifiers did their work well with accuracy equals to 1.