In [10]:
import sqlite3
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from utils.dummy import dummy_data
from utils.utils import get_citation_doi
from utils.utils import get_paper_citation_pairs
from acm_api.acm import acm_meta
from evaluation.evaluate import Train_Test_Split_Plain
from cf.cf import CF
from cf.cf_unit import CFU
from cf.cf_rank import CFUR

<h2>Processing the input file of paper DOIs</h2>

In [11]:
# Build `data`: a dict mapping each input DOI to whatever get_citation_doi()
# returns for that paper (the recorded cell output shows lists of cited DOIs).
# Metadata fetched through acm_meta() is cached in the PAPERS table of
# papers.db so that re-running this cell does not fetch the same DOI twice.
data = {}

# Read the input file, one DOI per line. Blank lines are skipped so that an
# empty string is never looked up or fetched as if it were a DOI.
with open('doi/doi.txt', 'r') as doi_file:
    doi_list = [doi for doi in (line.strip() for line in doi_file) if doi]

con = sqlite3.connect('papers.db')
try:
    # Create the cache table on first use; this is a no-op when it exists.
    con.execute("CREATE TABLE IF NOT EXISTS PAPERS (doi TEXT, data TEXT);")

    print('processing doi....')
    for doi in tqdm(doi_list):
        print(doi)
        fetch_data = con.execute(
            "SELECT * FROM PAPERS WHERE doi=?;", (doi,)).fetchone()

        if fetch_data is None:
            # Cache miss: fetch the metadata and store its string form.
            temp = acm_meta(doi)
            con.execute(
                'INSERT INTO PAPERS(doi, data) VALUES(?, ?)', (doi, str(temp)))
            # Commit per paper so that a failure on a later DOI does not
            # throw away the metadata that has already been fetched.
            con.commit()
            data[doi] = get_citation_doi(temp)
        else:
            # Cache hit: column 1 of the row is the stored `data` text.
            # NOTE(review): this passes the str(...) form saved above, while
            # the cache-miss branch passes acm_meta()'s raw return value;
            # confirm get_citation_doi() treats both the same way.
            data[doi] = get_citation_doi(fetch_data[1])
finally:
    # Always release the database handle, even if a fetch or parse fails.
    con.close()

processing doi....


  0%|          | 0/18 [00:00<?, ?it/s]

10.1145/3467477
extracting citation's doi....


  0%|          | 0/192 [00:00<?, ?it/s]

10.1145/3457607
extracting citation's doi....


  0%|          | 0/166 [00:00<?, ?it/s]

10.1145/3388792
extracting citation's doi....


  0%|          | 0/79 [00:00<?, ?it/s]

10.1145/3318299.3318343
extracting citation's doi....


  0%|          | 0/40 [00:00<?, ?it/s]

10.1145/3234150
extracting citation's doi....


  0%|          | 0/181 [00:00<?, ?it/s]

10.1145/3210548
extracting citation's doi....


  0%|          | 0/83 [00:00<?, ?it/s]

10.1145/3368640.3368642
extracting citation's doi....


  0%|          | 0/18 [00:00<?, ?it/s]

10.14778/3007263.3007318
extracting citation's doi....


  0%|          | 0/6 [00:00<?, ?it/s]

10.1145/3453444
extracting citation's doi....


  0%|          | 0/193 [00:00<?, ?it/s]

10.1145/3309074.3309092
extracting citation's doi....


  0%|          | 0/44 [00:00<?, ?it/s]

10.1145/3477140
extracting citation's doi....


  0%|          | 0/92 [00:00<?, ?it/s]

10.1145/2567574.2567633
10.1145/3484824.3484898
extracting citation's doi....


  0%|          | 0/14 [00:00<?, ?it/s]

10.1145/3315508.3329976
extracting citation's doi....


  0%|          | 0/32 [00:00<?, ?it/s]

10.1145/3432291.3432308
extracting citation's doi....


  0%|          | 0/16 [00:00<?, ?it/s]

10.1145/3054912
extracting citation's doi....


  0%|          | 0/146 [00:00<?, ?it/s]

10.1145/2493525.2493530
extracting citation's doi....


  0%|          | 0/116 [00:00<?, ?it/s]

10.1145/1128817.1128824
extracting citation's doi....


  0%|          | 0/42 [00:00<?, ?it/s]

In [12]:
# Display the mapping from each paper's DOI to the DOIs of its citations
data

{'10.1145/3467477': ['10.1145/170036.170072',
  '10.5555/645920.672836',
  '10.1016/j.neucom.2017.01.067',
  '10.1016/j.asoc.2011.05.036',
  '10.1016/j.knosys.2015.07.016',
  '10.1016/j.asoc.2009.11.003',
  '10.1109/TSMCC.2011.2157494',
  '10.1109/TSMCB.2011.2167144',
  '10.5555/1162264',
  '10.1016/j.eswa.2010.11.028',
  '10.1023/A%3A1018054314350',
  '10.1016/j.patrec.2004.09.043',
  '10.1007/11844297_46',
  '10.1109/ICMLA.2011.73',
  '10.1109/TIT.1967.1053964',
  '10.1016/j.dss.2011.01.015',
  '10.1016/j.asoc.2007.12.008',
  '10.1016/j.knosys.2014.08.013',
  '10.1016/j.eswa.2009.04.031',
  '10.5555/3322706.3361996',
  '10.1287/ijoc.15.1.3.15152',
  '10.1016/j.eswa.2012.01.065',
  '10.1016/j.patcog.2013.05.006',
  '10.1016/j.knosys.2011.01.012',
  '10.5555/2789272.2886795',
  '10.1016/j.asoc.2009.04.004',
  '10.1162/evco.2009.17.3.275',
  '10.1016/0305-0548%2886%2990048-1',
  '10.1016/j.neucom.2011.07.005',
  '10.5555/3086952',
  '10.1177/105971239400300202',
  '10.1016/j.neucom.2008

<h2>Code for creating the rating matrix</h2>

In [13]:
# Build the paper x citation rating matrix: one row per input paper DOI, one
# column per distinct cited DOI, with 1 where the paper cites it and 0
# elsewhere. The matrix is then written to CSV.
citation_set = set()
for cited_dois in data.values():
    citation_set.update(cited_dois)

# Use ordered lists for the axes: a set has no defined order, so sorting the
# columns makes the matrix (and the CSV written below) reproducible.
index_ = list(data.keys())
columns_ = sorted(citation_set)
df = pd.DataFrame(np.zeros((len(index_), len(columns_))), index=index_, columns=columns_)

for i in index_:
    # De-duplicate so each column label appears once in the indexer.
    cited_dois = list(set(data[i]))
    if cited_dois:
        # A single .loc[row, cols] assignment writes into `df` itself; the
        # chained form df.loc[i][j] = 1 may write to a temporary copy and
        # leave the matrix unchanged.
        df.loc[i, cited_dois] = 1

df.to_csv('citation-web/matrix-way/citation-matrix.csv')

<h2>Creating the citation pairs</h2>

In [14]:
# Hand the rating matrix to get_paper_citation_pairs() and keep its result.
# NOTE(review): the variable name suggests the return value is the path of
# the written pairs file; confirm against utils.utils.
citation_pair_path = get_paper_citation_pairs(df)

writing citation pairs....


  0%|          | 0/18 [00:00<?, ?it/s]

In [15]:
########################################
	# Experiment 1
	# Plain CF
	
	
	########################################


	########################################
	# Experiment 1
	# Plain CF
	
	# TODO
	########################################

In [16]:
# Evaluate the CFU recommender with Train_Test_Split_Plain on `data`.
# NOTE(review): min_ref_limit=5 appears to drop papers with fewer than 5
# references (the recorded output lists ignored DOIs with 0, 2 and 4), and
# train_size=0.5 presumably sets the train fraction; confirm both in
# evaluation.evaluate.
# NOTE(review): the recorded result of this run is 0.0; verify that this is
# the expected score and not a symptom of an upstream problem.
evaluate = Train_Test_Split_Plain(CFU, min_ref_limit=5, train_size=0.5)
evaluate.fit(data)

The following doi's are ignored because of low reference :
{'10.1145/2567574.2567633': 0,
 '10.1145/3368640.3368642': 2,
 '10.1145/3484824.3484898': 0,
 '10.14778/3007263.3007318': 4}
writing citation pairs....


  0%|          | 0/14 [00:00<?, ?it/s]

removing the indexes with no citations....
normalizing dataframe....


100%|█████████████████████████████████████████| 14/14 [00:00<00:00, 2320.96it/s]


creating similarity matrix....


100%|█████████████████████████████████████████| 453/453 [00:26<00:00, 17.08it/s]


computing recommendations for : 10.1145/3315508.3329976


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 891.52it/s]


computing recommendations for : 10.1145/1128817.1128824


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 613.94it/s]


computing recommendations for : 10.1145/3388792


100%|██████████████████████████████████████████| 16/16 [00:00<00:00, 471.81it/s]


computing recommendations for : 10.1145/3453444


100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 410.45it/s]


computing recommendations for : 10.1145/3457607


0it [00:00, ?it/s]


computing recommendations for : 10.1145/3309074.3309092


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 919.70it/s]


computing recommendations for : 10.1145/3467477


100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 190.57it/s]


finishing evaluation....


  0%|          | 0/7 [00:00<?, ?it/s]

0.0

In [17]:
# Evaluate the plain CF recommender with Train_Test_Split_Plain on `data`.
# NOTE(review): min_ref_limit=5 appears to drop papers with fewer than 5
# references (the recorded output lists ignored DOIs with 0, 2 and 4), and
# train_size=0.5 presumably sets the train fraction; confirm both in
# evaluation.evaluate.
# NOTE(review): the recorded result of this run is 0.0; verify that this is
# the expected score and not a symptom of an upstream problem.
evaluate = Train_Test_Split_Plain(CF, min_ref_limit=5, train_size=0.5)
evaluate.fit(data)

The following doi's are ignored because of low reference :
{'10.1145/2567574.2567633': 0,
 '10.1145/3368640.3368642': 2,
 '10.1145/3484824.3484898': 0,
 '10.14778/3007263.3007318': 4}
writing citation pairs....


  0%|          | 0/14 [00:00<?, ?it/s]

Creating similarity matrix....


100%|█████████████████████████████████████████| 453/453 [00:26<00:00, 17.05it/s]


Computing recommendations for : 10.1145/3315508.3329976


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 385.01it/s]


Computing recommendations for : 10.1145/1128817.1128824


100%|██████████████████████████████████████████| 15/15 [00:00<00:00, 924.22it/s]


Computing recommendations for : 10.1145/3388792


100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 466.69it/s]


Computing recommendations for : 10.1145/3453444


100%|██████████████████████████████████████████| 30/30 [00:00<00:00, 400.36it/s]


Computing recommendations for : 10.1145/3457607


100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 581.67it/s]


Computing recommendations for : 10.1145/3309074.3309092


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 798.61it/s]


Computing recommendations for : 10.1145/3467477


100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 189.31it/s]


finishing evaluation....


  0%|          | 0/7 [00:00<?, ?it/s]

0.0

In [18]:
# Evaluate the CFUR recommender with Train_Test_Split_Plain on `data`.
# NOTE(review): min_ref_limit=5 appears to drop papers with fewer than 5
# references (the recorded output lists ignored DOIs with 0, 2 and 4), and
# train_size=0.5 presumably sets the train fraction; confirm both in
# evaluation.evaluate.
# NOTE(review): in the recorded run every "Computing recommendations" progress
# bar shows 0 iterations and the result is 0.0; verify that CFUR is actually
# producing recommendations here.
evaluate = Train_Test_Split_Plain(CFUR, min_ref_limit=5, train_size=0.5)
evaluate.fit(data)

The following doi's are ignored because of low reference :
{'10.1145/2567574.2567633': 0,
 '10.1145/3368640.3368642': 2,
 '10.1145/3484824.3484898': 0,
 '10.14778/3007263.3007318': 4}
writing citation pairs....


  0%|          | 0/14 [00:00<?, ?it/s]

removing the indexes with no citations....
page rank normalization on dataframe....


100%|█████████████████████████████████████████| 14/14 [00:00<00:00, 4377.86it/s]


creating similarity matrix....


100%|█████████████████████████████████████████| 453/453 [00:26<00:00, 16.80it/s]


normalizing the similarity matrix....


100%|███████████████████████████████████████| 453/453 [00:00<00:00, 3806.27it/s]


Computing recommendations for : 10.1145/3315508.3329976


0it [00:00, ?it/s]


Computing recommendations for : 10.1145/1128817.1128824


0it [00:00, ?it/s]


Computing recommendations for : 10.1145/3388792


0it [00:00, ?it/s]


Computing recommendations for : 10.1145/3453444


0it [00:00, ?it/s]


Computing recommendations for : 10.1145/3457607


0it [00:00, ?it/s]


Computing recommendations for : 10.1145/3309074.3309092


0it [00:00, ?it/s]


Computing recommendations for : 10.1145/3467477


0it [00:00, ?it/s]

finishing evaluation....





  0%|          | 0/7 [00:00<?, ?it/s]

0.0