In [13]:
%load_ext autoreload
%autoreload 2

import sqlite3
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from utils.dummy import dummy_data
from utils.utils import get_citation_doi
from utils.utils import get_paper_citation_pairs
from acm_api.acm import acm_meta
from evaluation.evaluate import Train_Test_Split_Plain
from cf.cf import CF
from cf.cf_unit import CFU
from cf.cf_pr import CFUPR
from cf.cf_hits import CFUH

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<h2>Processing the input file with doi of papers</h2>

In [2]:
doi_list = []
data = {}

# reading the input file with doi of papers
with open('doi/doi.txt', 'r') as doi_file:
    lines = doi_file.readlines()
    
for line in lines:
    doi_list.append(line.strip())
    
con = sqlite3.connect('data.db')
cur = con.cursor()

is_table_exists = cur.execute(
    """
    SELECT name FROM sqlite_master WHERE type='table'
    AND name='PAPERS'; """).fetchall()

if is_table_exists == []:
    cur.execute(""" 
        CREATE TABLE PAPERS (doi TEXT, data TEXT);
    """)
    
print('processing doi....')
for doi in tqdm(doi_list):
    print(doi)
    cursor = con.execute("SELECT * FROM PAPERS WHERE doi=?;", (doi,))
    fetch_data = cursor.fetchone()
    
    if fetch_data is None:
        temp = acm_meta(doi)
        insert_statement = 'INSERT INTO PAPERS(doi, data) VALUES(?, ?)'
        
        con.execute(insert_statement, (doi, str(temp)))
        data[doi] = get_citation_doi(temp)
        
    else:
        data[doi] = get_citation_doi(fetch_data[1])

con.commit()
con.close()

processing doi....


  0%|          | 0/7 [00:00<?, ?it/s]

10.1145/3439723
extracting citation's doi....


  0%|          | 0/155 [00:00<?, ?it/s]

10.1145/3446374
extracting citation's doi....


  0%|          | 0/172 [00:00<?, ?it/s]

10.1145/3459992
extracting citation's doi....


  0%|          | 0/162 [00:00<?, ?it/s]

10.1145/3459992
extracting citation's doi....


  0%|          | 0/162 [00:00<?, ?it/s]

10.1145/3301282
extracting citation's doi....


  0%|          | 0/146 [00:00<?, ?it/s]

10.1145/3322645.3322656
extracting citation's doi....


  0%|          | 0/18 [00:00<?, ?it/s]

10.1145/3487891
extracting citation's doi....


  0%|          | 0/136 [00:00<?, ?it/s]

In [3]:
# Paper's doi and it's citations
# data

<h2>Code for creating the rating matrix</h2>

In [4]:
citation_set = set()
for key in data.keys():
    citation_set.update(data[key])

index_ = data.keys()
df = pd.DataFrame(np.zeros((len(index_), len(citation_set))), index=index_, columns=citation_set)

for i in index_:
    for j in citation_set:
        if j in data[i]:
            df.loc[i][j] = 1

df.to_csv('citation-web/matrix-way/citation-matrix.csv')

<h2>Creating the citation pair</h2>

In [5]:
citation_pair_path = get_paper_citation_pairs(df)

writing citation pairs....


  0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
########################################
	# Experiment 1
	# Plain CF
	
	
	########################################


	########################################
	# Experiment 1
	# Plain CF
	
	# TO DO
	########################################

In [7]:
evaluate = Train_Test_Split_Plain(CFU, min_ref_limit=5, train_size=0.5)
evaluate.fit(data)

The following doi's are ignored because of low reference :
{'10.1145/3322645.3322656': 3}
writing citation pairs....


  0%|          | 0/5 [00:00<?, ?it/s]

removing the indexes with no citations....
normalizing dataframe....


100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 1012.14it/s]


creating similarity matrix....


100%|███████████████████████████████████████████| 85/85 [00:00<00:00, 94.04it/s]


computing recommendations for : 10.1145/3487891


100%|███████████████████████████████████████████| 8/8 [00:00<00:00, 1016.12it/s]


computing recommendations for : 10.1145/3301282


100%|██████████████████████████████████████████| 22/22 [00:00<00:00, 678.26it/s]


computing recommendations for : 10.1145/3439723


100%|█████████████████████████████████████████| 26/26 [00:00<00:00, 1589.05it/s]

finishing evaluation....





  0%|          | 0/3 [00:00<?, ?it/s]

0.0

In [8]:
evaluate = Train_Test_Split_Plain(CF, min_ref_limit=5, train_size=0.5)
evaluate.fit(data)

The following doi's are ignored because of low reference :
{'10.1145/3322645.3322656': 3}
writing citation pairs....


  0%|          | 0/5 [00:00<?, ?it/s]

Creating similarity matrix....


100%|███████████████████████████████████████████| 85/85 [00:00<00:00, 94.04it/s]


Computing recommendations for : 10.1145/3487891


100%|██████████████████████████████████████████| 12/12 [00:00<00:00, 981.58it/s]


Computing recommendations for : 10.1145/3301282


100%|██████████████████████████████████████████| 23/23 [00:00<00:00, 681.38it/s]


Computing recommendations for : 10.1145/3439723


100%|█████████████████████████████████████████| 33/33 [00:00<00:00, 1677.23it/s]

finishing evaluation....





  0%|          | 0/3 [00:00<?, ?it/s]

0.0

In [9]:
evaluate = Train_Test_Split_Plain(CFUPR, min_ref_limit=5, train_size=0.5, normalize_similarity=True)
evaluate.fit(data)

The following doi's are ignored because of low reference :
{'10.1145/3322645.3322656': 3}
writing citation pairs....


  0%|          | 0/5 [00:00<?, ?it/s]

removing the indexes with no citations....
page rank normalization on dataframe....


100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 3362.98it/s]


creating similarity matrix....


100%|███████████████████████████████████████████| 85/85 [00:00<00:00, 96.07it/s]


normalizing the similarity matrix....


100%|█████████████████████████████████████████| 85/85 [00:00<00:00, 3660.78it/s]


Computing recommendations for : 10.1145/3487891


100%|█████████████████████████████████████████| 16/16 [00:00<00:00, 1048.43it/s]


Computing recommendations for : 10.1145/3301282


100%|██████████████████████████████████████████| 21/21 [00:00<00:00, 694.77it/s]


Computing recommendations for : 10.1145/3439723


100%|█████████████████████████████████████████| 29/29 [00:00<00:00, 1672.74it/s]

finishing evaluation....





  0%|          | 0/3 [00:00<?, ?it/s]

0.022878059940517045

In [12]:
evaluate = Train_Test_Split_Plain(CFUH, min_ref_limit=5, train_size=0.5, normalize_similarity=True)
evaluate.fit(data)

The following doi's are ignored because of low reference :
{'10.1145/3322645.3322656': 3}
writing citation pairs....


  0%|          | 0/5 [00:00<?, ?it/s]

removing the indexes with no citations....
page rank normalization on dataframe....


FileExistsError: [Errno 17] File exists: 'temp/pair-way/temp-hits-base-node.txt'