# Fatcat Queries

In [1]:
import numpy as np
import pandas as pd



### Example with fatcat search url
- Example:
    - https://search.fatcat.wiki/fatcat_release/_search?q=container_name:%22angewandte%20mathematik%22 
    - https://search.fatcat.wiki/fatcat_release/_search?q=(title:%22new%20york%20city%22)%20OR%20(title:%22big%20apple%22) 


In [2]:
import requests
import json
import re
import urllib.parse

In [3]:
def search_fatcat_journal_name(journal):
    url_header = "https://search.fatcat.wiki/fatcat_release/_search?q=container_name:"
    url_content = re.sub(" ", "%20", journal)
    url = url_header + url_content
    
    print(url)
    
    # make http request
    request = requests.get(url, timeout = 10)
    response = request.text
    response_json = json.loads(response)
    
    
    search_results = response_json["hits"]["hits"]
    result = []
    for search_res in search_results:
        search_res_id = search_res["_id"]
        search_res_score = search_res["_score"]
        release = search_res["_source"]
        search_res_info = {"id": search_res_id, "score": search_res_score, "title": release["title"], 
                           "volume": release["title"], "issue": release["issue"], "pages": release["pages"],
                           "author": release["contrib_names"], "url": release["best_pdf_url"]}
        result.append(search_res_info)
    
    
    return result

In [4]:
search_fatcat_journal_name("angewandte%20mathematik")[0:2]

https://search.fatcat.wiki/fatcat_release/_search?q=container_name:angewandte%20mathematik


[{'id': '2ed52tb6ojf4xc33mxfhrxr7mq',
  'score': 22.10781,
  'title': 'Fluorescent and water dispersible single‐chain nanoparticles: core‐shell structured compartmentation',
  'volume': 'Fluorescent and water dispersible single‐chain nanoparticles: core‐shell structured compartmentation',
  'issue': None,
  'pages': None,
  'author': ['Justus F. Hoffmann',
   'Andreas H. Roos',
   'Franz-Josef Schmitt',
   'Dariush Hinderberger',
   'Wolfgang H Binder'],
  'url': 'https://web.archive.org/web/20210717152749/https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/ange.202015179'},
 {'id': 't7jmqk7uuvdixnqry2trlwx6i4',
  'score': 21.988827,
  'title': 'Formation of Binuclear Zigzag Hexapentaene Titanium Complexes via a Titanacumulene [Ti=C=C=CH2\n] Intermediate',
  'volume': 'Formation of Binuclear Zigzag Hexapentaene Titanium Complexes via a Titanacumulene [Ti=C=C=CH2\n] Intermediate',
  'issue': '40',
  'pages': '12465-12469',
  'author': ['Tim Oswald',
   'Tina Gelert',
   'Christian Las

In [5]:
def search_fatcat_article_title(title):
    url_header = "https://search.fatcat.wiki/fatcat_release/_search?q=title:"
    url_content = re.sub(" ", "%20", title)
    url = url_header + url_content
    
    print(url)
    
    # make http request
    request = requests.get(url, timeout = 20)
    response = request.text
    response_json = json.loads(response)
    
    
    search_results = response_json["hits"]["hits"]
    result = []
    for search_res in search_results:
        search_res_id = search_res["_id"]
        search_res_score = search_res["_score"]
        release = search_res["_source"]
        search_res_info = {"id": search_res_id, "score": search_res_score, "title": release["title"], 
                           "volume": release["title"], "issue": release["issue"], "pages": release["pages"],
                           "author": release["contrib_names"], "url": release["best_pdf_url"]}
        result.append(search_res_info)

    return result

In [6]:
search_fatcat_article_title("new york city")[0:2]

https://search.fatcat.wiki/fatcat_release/_search?q=title:new%20york%20city


[{'id': 'elslp4gi35eavfa777gej2jb7m',
  'score': 29.655378,
  'title': 'Vision Zero Action Plan. City of New York (2014). New York City, New York',
  'volume': 'Vision Zero Action Plan. City of New York (2014). New York City, New York',
  'issue': None,
  'pages': None,
  'author': ['City Of New York'],
  'url': None},
 {'id': 'ndh34lzng5dyxhsenfortjkjcy',
  'score': 27.668268,
  'title': "Fashioning Piety: Women's Dress, Money, and Faith among Senegalese Muslims in New",
  'volume': "Fashioning Piety: Women's Dress, Money, and Faith among Senegalese Muslims in New",
  'issue': None,
  'pages': None,
  'author': ['York City', 'Beth Buggenhagen'],
  'url': 'https://web.archive.org/web/20180413170944/https://scholarworks.iu.edu/dspace/bitstream/handle/2022/20949/Buggenhagen%20C&S%2024.1%202012%20postprint.pdf;jsessionid=00D46E727DA0B290508A7EE22F3C881B?sequence=1'}]

In [7]:
def search_fatcat_journal_name_article_title(journal, title):
    url_header = "https://search.fatcat.wiki/fatcat_release/_search?q="
    url_content_journal = "container_name:" + re.sub(" ", "%20", journal)
    url_content_title = "title:" + re.sub(" ", "%20", title)
    url_content = "(" + url_content_journal + ")AND(" + url_content_title + ")"
    url = url_header + url_content
    
    print(url)
    
    # make http request
    request = requests.get(url, timeout = 20)
    response = request.text
    response_json = json.loads(response)
    
#     print(response_json)
    search_results = response_json["hits"]["hits"]
    result = []
    for search_res in search_results:
        search_res_id = search_res["_id"]
        search_res_score = search_res["_score"]
        release = search_res["_source"]
        search_res_info = {"id": search_res_id, "score": search_res_score, "title": release["title"], 
                           "volume": release["title"], "issue": release["issue"], "pages": release["pages"],
                           "author": release["contrib_names"], "url": release["best_pdf_url"]}
        result.append(search_res_info)

    return result

In [8]:
search_fatcat_journal_name_article_title("Scientific American", "Our Solar System Is Overflowing with Liquid Water")[:2]

https://search.fatcat.wiki/fatcat_release/_search?q=(container_name:Scientific%20American)AND(title:Our%20Solar%20System%20Is%20Overflowing%20with%20Liquid%20Water)


[{'id': 'krkousbgvbdm5bjbousk6bzdb4',
  'score': 35.348415,
  'title': 'Our Solar System',
  'volume': 'Our Solar System',
  'issue': '1006supp',
  'pages': '16081-16082',
  'author': ['Alfred Bicknell'],
  'url': 'https://archive.org/download/crossref-pre-1909-scholarly-works/10.1038%252Fscientificamerican04131889-226c.zip/10.1038%252Fscientificamerican04131895-16081bsupp.pdf'},
 {'id': 'v26gkqtuo5gspkjjuktwkkkgbe',
  'score': 35.278923,
  'title': 'Our Solar System',
  'volume': 'Our Solar System',
  'issue': '1522supp',
  'pages': '24386-24387',
  'author': ['Agnes M. Clerke'],
  'url': None}]

### Lucene Query String Syntax
- https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
- https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html 

In [9]:
query_string = {"query": 
                {"query_string": 
                 {
                    "query": "(new york city) OR (big apple)",
                    "default_field": "content"
                 }
                }
               }

### Example with python wrapper
- https://gist.github.com/miku/cdf38c7cd7ea3d163539ca398ccf607b

In [10]:
#!/usr/bin/env python

"""
You may need to "pip install elasticsearch"
"""

from elasticsearch import Elasticsearch
es = Elasticsearch(["https://search.fatcat.wiki"])

resp = es.search(index="fatcat_release", body={"query": {"match_all": {}}}, track_total_hits=True)

for k, v in resp.items():
    print(k)
    # took
    # timed_out
    # _shards
    # hits

print(resp["hits"]["total"]["value"])
# 167456483

for doc in resp["hits"]["hits"]:
    release = doc["_source"]
    print(doc["_id"], release["title"])

# lbfov5kze5h4xl4gbuv5v57n4m Vorwort
# h2wpwflsgngfzeidqpdu77tz5y Employment status and the frequency and causes of burn injuries in New England
# cltiujpt2bghtfmawezhxh5sge On Kunama ukunkula 'elbow' and its proposed cognates in Nilo-Saharan languages
# 2ejozpx555bc3lhhvnumqagbqa SGLT2 Inhibitors as a Therapeutic Option for Diabetic Nephropathy
# 36fcbnsk4jf3vbez4kiqew52jq Efficient Generation of Electricity from Methane using High Temperature Fuel Cells - Status, Challenges and Prospects
# mg3zwqfrkran5i3bv5vg2neoie Generation of Cocyclic Hadamard Matrices
# tnuk2tn43zayldjaimz5aymb4u Dedication
# ympv6467nrcrtkljvppj4pocfu About Our Contributors
# 6rwhhwpfi5ag7fmedaglkebrky News in brief
# u7ehcognt5g2dpdzapmn6ouaqm The Palazzo Rucellai and Questions of Typology in the Development of Renaissance Buildings

took
timed_out
_shards
hits
168937642
lbfov5kze5h4xl4gbuv5v57n4m Vorwort
h2wpwflsgngfzeidqpdu77tz5y Employment status and the frequency and causes of burn injuries in New England
cltiujpt2bghtfmawezhxh5sge On Kunama ukunkula 'elbow' and its proposed cognates in Nilo-Saharan languages
2ejozpx555bc3lhhvnumqagbqa SGLT2 Inhibitors as a Therapeutic Option for Diabetic Nephropathy
36fcbnsk4jf3vbez4kiqew52jq Efficient Generation of Electricity from Methane using High Temperature Fuel Cells - Status, Challenges and Prospects
mg3zwqfrkran5i3bv5vg2neoie Generation of Cocyclic Hadamard Matrices
tnuk2tn43zayldjaimz5aymb4u Dedication
ympv6467nrcrtkljvppj4pocfu About Our Contributors
6rwhhwpfi5ag7fmedaglkebrky News in brief
u7ehcognt5g2dpdzapmn6ouaqm The Palazzo Rucellai and Questions of Typology in the Development of Renaissance Buildings


- https://elasticsearch-py.readthedocs.io/en/v7.13.2/

In [11]:
#!/usr/bin/env python

"""
You may need to "pip install elasticsearch"
"""

es = Elasticsearch(["https://search.fatcat.wiki"], timeout = 25)

query_string = {
  "query": {
    "query_string": {
      "query": "(container_name:Canadian%20Journal%20of%20Public%20Health)AND(title:Lifetime%20probability%20of%20developing%20lung%20cancer,%20by%20smoking%20status,%20Canada)AND(volume:85)AND(release_year:1994)"
    }
  }
}
query_string = {
  "query": {
    "query_string": {
      "query": "(container_name:Toxins)AND(title:The Discodermia calyx Toxin Calyculin A)AND(release_year:2011)"
    }
  }
}
query_string = {
    'query': {
        'query_string': {
            'query': '(container_name:Toxins)AND(title:The%20Discodermia%20calyx%20Toxin%20Calyculin%20A)AND(release_year:2011)'
        }
    }
}
query_string = {
    'query': {
        'query_string': {
            'query': "(container_name:Fieldiana Geology)AND(title:Revision of the sauropterygian reptile genus ''Cymatosaurus'' v. Fritsch, 1894, and the relationships of Germanosaurus Nopcsa, 1928, from the Middle Triassic of Europe)AND(release_year:1997)"}}}
# try:
resp = es.search(index="fatcat_release", body = query_string, track_total_hits=True)

for k, v in resp.items():
    print(k)

print(resp["hits"]["total"]["value"])
# 167456483

for doc in resp["hits"]["hits"]:
    release = doc["_source"]
    print(doc["_id"], release["title"])
# except:
#     print("err")


took
timed_out
_shards
hits
3085
6alerabq4vao3e6yd2tun74ouy 7 火山防災と火山地質学
xgoqdedhn5euhlxfi62e3zf6ya P15 焼岳円頂丘溶岩の地質と岩石(その 1) : 石基組織の多様性から見た円頂丘の定置史
lpugazojqbhyzdsvq6eez75vxa Stratigraphic relationships of the Lower Tertiary of the Faeroe Basalt Plateau and the Faeroe–Shetland Basin
x5ll6i5fxvh5dg3273a4aoqx6a 233 オマーンオフィオライト北部斑れい岩層の地質
gnzwl2gperdppdzlhuxk4mfxey 368 焼岳火山群白谷山・アカンダナ火山の地質と活動年代
vo7kyjrxmrbk5i75pybl7uldja 277 セントラルベースンフォルトの地形と地質 : 西フィリピン海盆発達史の鍵
lczkta4rurhu5mrfudbb7mtbvm 41 九州東部秩父累帯南帯の地質 : 「熊田」及びその周辺地域
nki43bx4kvhzhnn46yeunjmefy Acidic and sulfate-rich hydrothermal fluids from the Manus back-arc basin, Papua New Guinea
rnod7mfmr5bvvbpsxs7f3b5e3a 418 湯殿山巨大地すべり冠頭部付近の地質
nfe54y7hojdcph5dmmizoiyjra Early Mesozoic subduction in the Eastern Mediterranean: Evidence from Triassic eclogite in northwest Turkey


In [12]:
#!/usr/bin/env python

"""
You may need to "pip install elasticsearch"
"""

es = Elasticsearch(["https://search.fatcat.wiki"])

query_string = {
  "query": {
    "query_string": {
      "query": "(work_id:dsz4e6hn5zcfrntj6sp5rjes5m)"
    }
  }
}
resp = es.search(index="fatcat_release", body = query_string, track_total_hits=True)

for k, v in resp.items():
    print(k)

print(resp["hits"]["total"]["value"])
# 167456483
print(resp["hits"]["hits"])
# for doc in resp["hits"]["hits"]:
#     release = doc["_source"]
#     print(doc["_id"], release["title"])


took
timed_out
_shards
hits
1
[{'_index': 'fatcat_release_v03c', '_type': '_doc', '_id': 'uvh54wbys5ehleliwtsps6k5cq', '_score': 16.779491, '_source': {'doc_index_ts': '2021-04-09T00:35:26.741925Z', 'ident': 'uvh54wbys5ehleliwtsps6k5cq', 'state': 'active', 'revision': '401acd39-b212-4eb2-8526-89e064c12c6f', 'work_id': 'dsz4e6hn5zcfrntj6sp5rjes5m', 'title': 'Surgical technique of one-stage bilateral lung reimplantation in the dog', 'subtitle': None, 'original_title': None, 'release_type': 'article-journal', 'release_stage': 'published', 'withdrawn_status': None, 'language': 'en', 'volume': '61', 'issue': '6', 'pages': '847-56', 'number': None, 'license': None, 'doi': None, 'pmid': '4932558', 'pmcid': None, 'isbn13': None, 'wikidata_qid': None, 'core_id': None, 'arxiv_id': None, 'jstor_id': None, 'ark_id': None, 'mag_id': None, 'dblp_id': None, 'doaj_id': None, 'is_oa': False, 'is_longtail_oa': False, 'is_preserved': True, 'in_web': False, 'in_dweb': False, 'in_ia': False, 'in_ia_sim': T

# Fatcat Searches 

## Citation Preprocessing

In [13]:
from ipynb.fs.full.Citation_Preprocessing import *

{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}
{'journal': 'Canadian Journal of Public Health', 'date': '1994', 'year': 1994, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'author': [], 'page': '385', 'url': '', 'external_ids': {'pmid': '7895211'}}
{'journal': 'Canadian Journal of Public Health', 'date': '1994', 'year': 1994, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'author': [], 'page': '385', 'url': '', 'external_ids': {'pmid': '7895211'}}
There is already an existing doi link.
False
yay
(56136, 3)
(37333, 3)
{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt

In [14]:
test_cite7 = "{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}"
print(test_cite7)

cite_info7 = parse_citation_data(test_cite7)
print(cite_info7)

{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}
{'journal': 'Canadian Journal of Public Health', 'date': '1994', 'year': 1994, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'author': [], 'page': '385', 'url': '', 'external_ids': {'pmid': '7895211'}}


In [15]:
test_cite6 = "{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}"
print(test_cite6)

cite_info6 = parse_citation_data(test_cite6)
print(cite_info6)

{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}
{'journal': 'Toxins', 'date': '2011-01', 'year': 2011, 'volume': '3', 'issue': '1', 'title': 'The Discodermia calyx Toxin Calyculin A', 'author': ['Jessica R. Edelson', 'David L. Brautigan'], 'page': '105', 'url': '', 'external_ids': {'doi': '10.3390/toxins3010105', 'pmid': '22069692', 'pmc': '3210456'}}


## Generate Query String

### Method 1 - Use as much info as possible

In [16]:
def generate_url_content(cite_info):
    url_fields = []
    if cite_info["journal"] != "":
        url_fields.append("(container_name:" + re.sub(" ", "%20", cite_info["journal"]) + ")")
    if cite_info["title"] != "":
        url_fields.append("(title:" + re.sub(" ", "%20", cite_info["title"]) + ")")
    if cite_info["volume"] != "":
        url_fields.append("(volume:" + re.sub(" ", "%20", cite_info["volume"]) + ")")
    if cite_info["year"] != 0:
        url_fields.append("(release_year:" + str(cite_info["year"]) + ")")
    if cite_info["author"] != []:
        for aut in cite_info["author"]:
            url_fields.append("(author:" + re.sub(" ", "%20", aut) + ")")
    url_content = "AND".join(url_fields)
    return url_content

### Method 2 - Use minimum info

In [17]:
def generate_url_content2(cite_info):
    url_fields = []
    
    if cite_info["journal"] != "":
#         url_fields.append("(container_name:" + re.sub(" ", "%20", cite_info["journal"]) + ")")
        url_fields.append("(container_name:" + re.sub(":", "", cite_info["journal"]) + ")")
    if cite_info["title"] != "":
#         url_fields.append("(title:" + re.sub(" ", "%20", cite_info["title"]) + ")")
        url_fields.append("(title:" + re.sub(":", "", cite_info["title"]) + ")")
    if cite_info["year"] != 0:
        url_fields.append("(release_year:" + str(cite_info["year"]) + ")")

    url_content = "AND".join(url_fields)
    url_content = re.sub("\[", "", url_content)
    url_content = re.sub("\]", "", url_content)
    
    return url_content

## Making Searches 
### Method 1 (use url)

In [18]:
def search_fatcat_cite_info(url_content, verbose = False):
    
    url_header = "https://search.fatcat.wiki/fatcat_release/_search?q="

    url = url_header + urllib.parse.quote(url_content, safe = "")
    
    if verbose: print(url)
    
    # make http request
    request = requests.get(url, timeout = 30)
    
    if request.status_code == 200:
        response = request.text
        try: 
            response_json = json.loads(response)

            search_results = response_json["hits"]["hits"]
            result = []
            for search_res in search_results:
                search_res_id = search_res["_id"]
                search_res_score = search_res["_score"]
                release = search_res["_source"]
                search_res_info = {"work_id": release["work_id"], "score": search_res_score, 
                                   "title": release["title"], 
                                   "year": release["release_year"], "journal":release["container_name"],
                                   "volume": release["volume"], "issue": release["issue"], "page": release["pages"],
                                   "author": release["contrib_names"], "url": release["best_pdf_url"]}
                result.append(search_res_info)
        except:
            return ""
    else:
        return ""

    return result

In [19]:
print(cite_info7)

{'journal': 'Canadian Journal of Public Health', 'date': '1994', 'year': 1994, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'author': [], 'page': '385', 'url': '', 'external_ids': {'pmid': '7895211'}}


In [20]:
url_content7_1 = generate_url_content(cite_info7)
url_content7_2 = generate_url_content2(cite_info7)
search_res7_1 = search_fatcat_cite_info(url_content7_1, True)
search_res7_2 = search_fatcat_cite_info(url_content7_2, True)
print("query method 1: ")
print(url_content7_1)
print("query method 2: ")
print(url_content7_2)

print("search res method 1: ")
print(search_res7_1)
print("search res method 2: ")
print(search_res7_2)

https://search.fatcat.wiki/fatcat_release/_search?q=%28container_name%3ACanadian%2520Journal%2520of%2520Public%2520Health%29AND%28title%3ALifetime%2520probability%2520of%2520developing%2520lung%2520cancer%2C%2520by%2520smoking%2520status%2C%2520Canada%29AND%28volume%3A85%29AND%28release_year%3A1994%29
https://search.fatcat.wiki/fatcat_release/_search?q=%28container_name%3ACanadian%20Journal%20of%20Public%20Health%29AND%28title%3ALifetime%20probability%20of%20developing%20lung%20cancer%2C%20by%20smoking%20status%2C%20Canada%29AND%28release_year%3A1994%29
query method 1: 
(container_name:Canadian%20Journal%20of%20Public%20Health)AND(title:Lifetime%20probability%20of%20developing%20lung%20cancer,%20by%20smoking%20status,%20Canada)AND(volume:85)AND(release_year:1994)
query method 2: 
(container_name:Canadian Journal of Public Health)AND(title:Lifetime probability of developing lung cancer, by smoking status, Canada)AND(release_year:1994)
search res method 1: 
[]
search res method 2: 
[{'wo

In [21]:
print(test_cite6)
print(cite_info6)

url_content6_1 = generate_url_content(cite_info6)
url_content6_2 = generate_url_content2(cite_info6)
search_res6_1 = search_fatcat_cite_info(url_content6_1, True)
search_res6_2 = search_fatcat_cite_info(url_content6_2, True)
print("query method 1: ")
print(url_content6_1)
print("query method 2: ")
print(url_content6_2)

print("search res method 1: ")
print(search_res6_1)
print("search res method 2: ")
print(search_res6_2)

{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}
{'journal': 'Toxins', 'date': '2011-01', 'year': 2011, 'volume': '3', 'issue': '1', 'title': 'The Discodermia calyx Toxin Calyculin A', 'author': ['Jessica R. Edelson', 'David L. Brautigan'], 'page': '105', 'url': '', 'external_ids': {'doi': '10.3390/toxins3010105', 'pmid': '22069692', 'pmc': '3210456'}}
https://search.fatcat.wiki/fatcat_release/_search?q=%28container_name%3AToxins%29AND%28title%3AThe%2520Discodermia%2520calyx%2520Toxin%2520Calyculin%2520A%29AND%28volume%3A3%29AND%28release_year%3A2011%29AND%28author%3AJessica%2520R.%2520Edelson%29AND%28author%3ADavid%2520L.%2520Brautigan%29
https://search.fatcat.wiki/fatcat_release/_search?q=%28container_name%3AToxins%29AND%28title%3AThe%20Discodermia%20calyx%20To

### Method 2 - use python library

In [22]:
def elastic_search_query_string(url_content):

    query_string = {
      "query": {
        "query_string": {
          "query": url_content
        }
      }
    }
    
    return query_string


In [23]:
def elastic_search_cite_info(url_content, verbose = False):
    
    es = Elasticsearch(["https://search.fatcat.wiki"], timeout = 20)
    
    query_string = elastic_search_query_string(url_content)
    
    if verbose: print(query_string)

    try: 
        response_json = es.search(index="fatcat_release", body = query_string, track_total_hits=True)

        search_results = response_json["hits"]["hits"]
        result = []
        for search_res in search_results:
            search_res_id = search_res["_id"]
            search_res_score = search_res["_score"]
            release = search_res["_source"]
            search_res_info = {"work_id": release["work_id"], "score": search_res_score, 
                                "title": release["title"], 
                                "year": release["release_year"], "journal":release["container_name"],
                                "volume": release["volume"], "issue": release["issue"], "page": release["pages"],
                                "author": release["contrib_names"], "url": release["best_pdf_url"]}
            result.append(search_res_info)

        return result
    except: 
        return ""


In [24]:
print(cite_info7)

{'journal': 'Canadian Journal of Public Health', 'date': '1994', 'year': 1994, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'author': [], 'page': '385', 'url': '', 'external_ids': {'pmid': '7895211'}}


In [25]:
url_content7_1 = generate_url_content(cite_info7)
url_content7_2 = generate_url_content2(cite_info7)
print("query method 1: ")
print(url_content7_1)
print("query method 2: ")
print(url_content7_2)

elastic_search_7_1 = elastic_search_cite_info(url_content7_1, True)
elastic_search_7_2 = elastic_search_cite_info(url_content7_2, True)
print("search res method 1: ")
print(elastic_search_7_1)
print("search res method 2: ")
print(elastic_search_7_2[0])

query method 1: 
(container_name:Canadian%20Journal%20of%20Public%20Health)AND(title:Lifetime%20probability%20of%20developing%20lung%20cancer,%20by%20smoking%20status,%20Canada)AND(volume:85)AND(release_year:1994)
query method 2: 
(container_name:Canadian Journal of Public Health)AND(title:Lifetime probability of developing lung cancer, by smoking status, Canada)AND(release_year:1994)
{'query': {'query_string': {'query': '(container_name:Canadian%20Journal%20of%20Public%20Health)AND(title:Lifetime%20probability%20of%20developing%20lung%20cancer,%20by%20smoking%20status,%20Canada)AND(volume:85)AND(release_year:1994)'}}}
{'query': {'query_string': {'query': '(container_name:Canadian Journal of Public Health)AND(title:Lifetime probability of developing lung cancer, by smoking status, Canada)AND(release_year:1994)'}}}
search res method 1: 
[]
search res method 2: 
{'work_id': 'qwmp2lzwcnhhvo3wqohwm5u3g4', 'score': 31.122288, 'title': 'Positive Correlation between Normal Serum Gastrin Conce

In [26]:
print(cite_info6)

{'journal': 'Toxins', 'date': '2011-01', 'year': 2011, 'volume': '3', 'issue': '1', 'title': 'The Discodermia calyx Toxin Calyculin A', 'author': ['Jessica R. Edelson', 'David L. Brautigan'], 'page': '105', 'url': '', 'external_ids': {'doi': '10.3390/toxins3010105', 'pmid': '22069692', 'pmc': '3210456'}}


In [27]:
print(url_content6_2)

(container_name:Toxins)AND(title:The Discodermia calyx Toxin Calyculin A)AND(release_year:2011)


In [28]:
print(test_cite6)
print(cite_info6)

url_content6_1 = generate_url_content(cite_info6)
url_content6_2 = generate_url_content2(cite_info6)
print("query method 1: ")
print(url_content6_1)
print("query method 2: ")
print(url_content6_2)

elastic_search_6_1 = elastic_search_cite_info(url_content6_1, True)
elastic_search_6_2 = elastic_search_cite_info(url_content6_2, True)
print("search res method 1: ")
print(elastic_search_6_1)
print("search res method 2: ")
print(elastic_search_6_2[0:5])

{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}
{'journal': 'Toxins', 'date': '2011-01', 'year': 2011, 'volume': '3', 'issue': '1', 'title': 'The Discodermia calyx Toxin Calyculin A', 'author': ['Jessica R. Edelson', 'David L. Brautigan'], 'page': '105', 'url': '', 'external_ids': {'doi': '10.3390/toxins3010105', 'pmid': '22069692', 'pmc': '3210456'}}
query method 1: 
(container_name:Toxins)AND(title:The%20Discodermia%20calyx%20Toxin%20Calyculin%20A)AND(volume:3)AND(release_year:2011)AND(author:Jessica%20R.%20Edelson)AND(author:David%20L.%20Brautigan)
query method 2: 
(container_name:Toxins)AND(title:The Discodermia calyx Toxin Calyculin A)AND(release_year:2011)
{'query': {'query_string': {'query': '(container_name:Toxins)AND(title:The%20Discodermia%20calyx%20To

## Evaluation Metric and Finding a Close Match

### Method 1 - Define my own metrics 

#### Jaccard - doesn't care about order

In [29]:
def jaccard_similarity(str1, str2):
    
    list1 = str1.split()
    list2 = str2.split()
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [30]:
jaccard_similarity("I like food", "I don't like food")

0.75

#### Levenshtein - edit distance
- https://www.datacamp.com/community/tutorials/fuzzy-string-python


In [31]:
from fuzzywuzzy import fuzz



In [32]:
Str1 = "The supreme court case of Nixon vs The United States"
Str2 = "Nixon v. United States"
Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)
print(Ratio)
print(Partial_Ratio)
print(Token_Sort_Ratio)
print(Token_Set_Ratio)

57
77
58
95


In [33]:
def levenshtein_similarity(str1, str2):
    return fuzz.ratio(str1, str2) / 100

In [34]:
fuzz.ratio("I like food", "I don't like food")

79

In [35]:
fuzz.partial_ratio("The Discodermia calyx Toxin Calyculin A", 
                    "The Discodermia calyx Toxin Calyculin A Enhances Cyclin D1 Phosphorylation and Degradation, and Arrests Cell Cycle Progression in Human Breast Cancer Cells")

100

In [36]:
def fatcat_match_metric(cite_info, result, verbose = False):
    
    score = 0
    
    journal_match = False
    volume_match = False
    issue_match = False
    year_match = False
    page_match = False
    
    if cite_info["journal"] != "" and cite_info["journal"] == result["journal"]:
        journal_match = True
    
    # volume 
    if cite_info["volume"] != "" and cite_info["volume"] == result["volume"]:
        volume_match = True
    
    # issue 
    if cite_info["issue"] != "" and cite_info["issue"] == result["issue"]:
        issue_match = True
    
    # year 
    if cite_info["year"] != "" and cite_info["year"] == result["year"]:
        year_match = True
        
    # page 
    if cite_info["page"] != "" and result["page"] != None: 
        if "-" in result["page"]:
            page_start = result["page"].split("-")[0]
            page_end = result["page"].split("-")[0]
            if cite_info["page"] in result["page"]:
                page_match = True
            try: 
                int(page_start)
                int(page_end)
                
                if cite_info["page"] > page_start and cite_info["page"] < page_end:
                    page_match = True
            except:
                page_match = False
        else: 
            if cite_info["page"] in result["page"]:
                page_match = True
        
    score += int(volume_match) * 10 + int(volume_match)*10 + int(issue_match)*10 + int(year_match)*10 + int(page_match)*10
    
    if verbose:
        print("journal match: " + str(journal_match))
        print("volume match: " + str(volume_match))
        print("issue match: " + str(issue_match))
        print("page match: " + str(page_match))
        print("year match: " + str(year_match))
        print("score so far: " + str(score))
        
    # title
    if cite_info["title"] != "" and result["title"] != "":
        # exact match 
        if cite_info["title"] == result["title"]:
            score += 10
        # partial match    
        elif cite_info["title"] in result["title"]:
            score += 5
        # partial match
        elif result["title"] in cite_info["title"]:
            score += 5
        # jaccard similarity
        else:
            score += 10 * levenshtein_similarity(cite_info["title"], result["title"])
    
    if verbose:
        print("score after title: " + str(score))
    # author 
    if cite_info["author"] != "":
        # exact match 
        cite_author_lst = cite_info["author"]
        result_author_lst = result["author"]
        
        
        if cite_author_lst == result_author_lst:
            score += 10
            
            if verbose: print("author has exact match")
            
        # partial match    
        else:
            if len(cite_author_lst) > len(result_author_lst):
                shorter_lst = result_author_lst
                longer_lst = cite_author_lst
            else:
                shorter_lst = cite_author_lst
                longer_lst = result_author_lst
                
            if len(shorter_lst) > 0:
                author_matches = 0
                for i in range(len(shorter_lst)):
                    value1 = re.sub("[^a-zA-Z\s]", "", shorter_lst[i])
                    value2 = re.sub("[^a-zA-Z\s]", "", longer_lst[i])
                    aut_score = jaccard_similarity(value1, value2)

                    if verbose:
                        print(value1)
                        print(value2)
                        print("jaccard_score " + str(aut_score))

    #                 if ((value1 != "" and value2 != "") 
    #                     and (value1 in value2 or value2 in value1)):
    #                     author_matches += 1
    #                     if verbose: print("current author has partial match ")
    #                 elif aut_score >= 0.5:
    #                     author_matches += 1
    #                 else:

                    author_matches += aut_score

                score += 10 * author_matches/len(shorter_lst)
        
        
    if verbose:
        print("score after author: " + str(score))
        
    return score

In [37]:
def find_best_cite_info_fatcat_search(cite_info, search_result, verbose = False):
    
    close_matches = []
    
    for curr_res in search_result:
        curr_score = fatcat_match_metric(cite_info, curr_res, verbose = verbose)
        if curr_score > 45:
            close_matches.append(curr_res)
            if verbose: 
                print(curr_score)
                print(curr_res)
            
    if verbose:
        print(close_matches)
            
    best_res = ""
    best_score = 0
    for match in close_matches:
        if match["score"] > best_score:
            best_res = match
            best_score = match["score"]
    
    return best_res

In [38]:
print(find_best_cite_info_fatcat_search(cite_info6, search_res6_1))
print(find_best_cite_info_fatcat_search(cite_info6, search_res6_2))
print(find_best_cite_info_fatcat_search(cite_info7, search_res7_1))
print(find_best_cite_info_fatcat_search(cite_info7, search_res7_2))







In [40]:
print(find_best_cite_info_fatcat_search(cite_info7, elastic_search_6_1))
print(find_best_cite_info_fatcat_search(cite_info7, elastic_search_6_2))
print(find_best_cite_info_fatcat_search(cite_info7, elastic_search_7_1))
print(find_best_cite_info_fatcat_search(cite_info7, elastic_search_7_2))







In [41]:
def abbreviate_journal_name(journal, verbose = False):
    url = "https://abbreviso.toolforge.org/a/" + journal
    request = requests.get(url)
    if request.status_code != 200:
        return journal
    
    return request.text

In [42]:
abbreviate_journal_name("Proceedings of the National Academy of Sciences of the United States of America")

'Proc. Natl. Acad. Sci. U. S. A.'

In [43]:
# abbreviate_journal_name("ACS Appl. Mater. Interfaces")

In [44]:
def fatcat_check_match(cite_info, result, verbose = False):
    
    score = 0
    
    journal_exact_match = False
    journal_partial_match = False
    journal_match_score = 0

    if cite_info["journal"] != "" and result["journal"] != None:
        cite_info["journal"] = cite_info["journal"].strip().lower()
        result["journal"] = result["journal"].strip().lower()
        if cite_info["journal"] == result["journal"] or fuzz.ratio(cite_info["journal"], result["journal"]) == 100:
            journal_exact_match = True
        elif (cite_info["journal"] in result["journal"] or result["journal"] in cite_info["journal"] 
              or fuzz.partial_ratio(cite_info["journal"], result["journal"]) == 100):
            journal_match_partial = False
            
        elif abbreviate_journal_name(cite_info["journal"]) == abbreviate_journal_name(result["journal"]):
            journal_exact_match = True
        elif fuzz.ratio(abbreviate_journal_name(cite_info["journal"]), abbreviate_journal_name(result["journal"])) > 95:
            journal_exact_match = True
        else:
            journal_match_score += fuzz.ratio(cite_info["journal"], result["journal"])/100
    
    # volume 
    volume_match = False
    if result["volume"] != "None" and cite_info["volume"] == result["volume"]:
        volume_match = True
    
    # issue 
    issue_match = False
    if result["issue"] != "None" and cite_info["issue"] == result["issue"]:
        issue_match = True
    
    # year 
    year_match = False
    if result["year"] != "None" and cite_info["year"] == result["year"]:
        year_match = True
        
    # page 
    page_match = False
    if result["page"] != None: 
        if "-" in result["page"]:
            page_start = result["page"].split("-")[0]
            page_end = result["page"].split("-")[0]
            if cite_info["page"] in result["page"]:
                page_match = True
            try: 
                int(page_start)
                int(page_end)
                
                if cite_info["page"] > page_start and cite_info["page"] < page_end:
                    page_match = True
            except:
                page_match = False
        else: 
            if cite_info["page"] in result["page"]:
                page_match = True
        
    # title
    title_exact_match = False
    title_partial_match = False
    title_match_score = 0
    if cite_info["title"] != "" and result["title"] != None:
        cite_info["title"] = cite_info["title"].strip().lower()
        result["title"] = result["title"].strip().lower()
#         if verbose: 
#             print(cite_info["title"])
#             print(result["title"])
#             print(cite_info["title"] == result["title"])
#             print(fuzz.ratio(cite_info["title"], result["title"]))
            
        # exact match 
        if cite_info["title"] == result["title"] or fuzz.ratio(cite_info["title"], result["title"]) == 100:
            title_exact_match = True
        # partial match    
        elif (cite_info["title"] in result["title"] or result["title"] in cite_info["title"] 
              or fuzz.partial_ratio(cite_info["title"], result["title"]) == 100):
            title_match_partial = False
        # jaccard similarity
        else:
            title_match_score += fuzz.ratio(cite_info["title"], result["title"])/100
    
    # author
    author_exact_match = False
    author_match_score = 0
    if cite_info["author"] != "":
        # exact match 
        cite_author_lst = cite_info["author"]
        result_author_lst = result["author"]
        
        
        if cite_author_lst == result_author_lst:
            author_exact_match = True
            
#             if verbose: print("author has exact match")
            
        # partial match    
        else:
            if len(cite_author_lst) > len(result_author_lst):
                shorter_lst = result_author_lst
                longer_lst = cite_author_lst
            else:
                shorter_lst = cite_author_lst
                longer_lst = result_author_lst
                
            if len(shorter_lst) > 0:
                author_matches = 0
                for i in range(len(shorter_lst)):
                    value1 = re.sub("[^a-zA-Z\s]", "", shorter_lst[i])
                    value2 = re.sub("[^a-zA-Z\s]", "", longer_lst[i])
                    aut_score = jaccard_similarity(value1, value2)

#                     if verbose:
#                         print(value1)
#                         print(value2)
#                         print("jaccard_score " + str(aut_score))

                    author_matches += aut_score
                author_match_score += author_matches/len(shorter_lst)
        
    
        
    if verbose:
        print("journal exact match: " + str(journal_exact_match))
        print("journal partial match: " + str(journal_partial_match))
        print("volume match: " + str(volume_match))
        print("issue match: " + str(issue_match))
        print("page match: " + str(page_match))
        print("year match: " + str(year_match))
        print("title exact match: " + str(title_exact_match))
        print("title partial match: " + str(title_partial_match))
        print("author exact match: " + str(author_exact_match))
        print("scores:")
        print("journal match score: " + str(journal_match_score))
        print("title match score: " + str(title_match_score))
        print("author match score: " + str(author_match_score))
        
    
    if journal_exact_match:
        temp_score = int(year_match) + int(volume_match) + int(issue_match) + int(page_match) 
        if title_exact_match:
            if temp_score >= 1:
                if verbose: print("journal exact, title exact, 1 other")
                return True
            if int(author_exact_match) >= 1:
                if verbose: print("journal exact, title exact, author exact")
                return True
            if temp_score + author_match_score > 1:
                if verbose: print("journal exact, title exact, 1 other and author score")
                return True
        elif title_partial_match:
            if temp_score >= 2:
                if verbose: print("journal exact, title partial, 2 other")
                return True
            if temp_score + int(author_exact_match) >= 2:
                if verbose: print("journal exact, title exact, 1 other and author exact")
                return True
            if temp_score + author_match_score >= 2:
                if verbose: print("journal exact, title exact, 2 other and author score")
                return True
        else:
            temp_score += int(title_match_score)
            if temp_score + int(author_exact_match) >= 3:
                if verbose: print("journal exact, title score, author exact or 1 other")
                return True
            if temp_score + author_match_score >= 3:
                if verbose: print("journal exact, title score, author score, or 2 other >=3")
                return True
    elif journal_partial_match:
        temp_score = int(year_match) + int(volume_match) + int(issue_match) + int(page_match) + int(author_match_score)
        if title_exact_match:
            if temp_score >= 2:
                if verbose: print("journal partial, title exact, 2 others")
                return True
        elif title_partial_match:
            if temp_score >= 2:
                if verbose: print("journal partial, title partial, 2 others")
                return True
        else:
            temp_score += title_match_score
            if temp_score >= 3:
                if verbose: print("journal partial, title exact, 2 others")
                return True
    else:
        temp_score = int(year_match) + int(volume_match) + int(issue_match) + int(page_match) 
        temp_score += int(author_match_score) + int(journal_match_score) + int(title_match_score)
        if temp_score > 4:
            if verbose: print("5 fields")
            return True
        
    return False

In [46]:
def find_best_cite_info_fatcat_search2(cite_info, search_result, verbose = False):
    
    for curr_res in search_result:
        curr_match = fatcat_check_match(cite_info, curr_res, verbose = verbose)
        if curr_match:
            return curr_res
            
    return ""

In [47]:
print(find_best_cite_info_fatcat_search2(cite_info6, search_res6_1))




In [48]:
print(cite_info6)
print(search_res6_2[0])
print(find_best_cite_info_fatcat_search2(cite_info6, search_res6_2, verbose = True))
print(find_best_cite_info_fatcat_search2(cite_info6, elastic_search_6_2))

{'journal': 'Toxins', 'date': '2011-01', 'year': 2011, 'volume': '3', 'issue': '1', 'title': 'The Discodermia calyx Toxin Calyculin A', 'author': ['Jessica R. Edelson', 'David L. Brautigan'], 'page': '105', 'url': '', 'external_ids': {'doi': '10.3390/toxins3010105', 'pmid': '22069692', 'pmc': '3210456'}}
{'work_id': '3iq73gpzjjaz5jr37x6mjn3zii', 'score': 61.302803, 'title': 'The Discodermia calyx Toxin Calyculin A Enhances Cyclin D1 Phosphorylation and Degradation, and Arrests Cell Cycle Progression in Human Breast Cancer Cells', 'year': 2011, 'journal': 'Toxins', 'volume': None, 'issue': None, 'page': '105-119', 'author': ['Jessica R. Edelson', 'David L. Brautigan'], 'url': 'https://web.archive.org/web/20180728074927/https://res.mdpi.com/def50200085a04acf69806274a694018e1a97cedb1a41827edad480ce169ecdb2edc9f9d8170cb248c739300e5851e3a4ea27ca8b7da35cbd74474603bfa07092c315d21af89a5b45f963c9490b78b43d4ce4cccc91e3c9b59351003fce4d8cb6231b4abfd170cf43219aebee39c890b73033cdede28c226d156e26de70

In [49]:
print(cite_info7)
print(search_res7_2[0])
print(elastic_search_7_2)

{'journal': 'Canadian Journal of Public Health', 'date': '1994', 'year': 1994, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'author': [], 'page': '385', 'url': '', 'external_ids': {'pmid': '7895211'}}
{'work_id': 'qwmp2lzwcnhhvo3wqohwm5u3g4', 'score': 31.104496, 'title': 'Positive Correlation between Normal Serum Gastrin Concentrations and Antral and Duodenal G Cells', 'year': 1994, 'journal': 'Canadian Journal of Gastroenterology', 'volume': None, 'issue': None, 'page': '235-238', 'author': ['WR Yacoub', 'ABR Thomson', 'RW Sherbaniuk', 'P Hooper', 'LD Jewell'], 'url': 'https://web.archive.org/web/201905030614/http://downloads.hindawi.com/journals/cjgh/1994/151096.pdf'}
[{'work_id': 'qwmp2lzwcnhhvo3wqohwm5u3g4', 'score': 31.122288, 'title': 'Positive Correlation between Normal Serum Gastrin Concentrations and Antral and Duodenal G Cells', 'year': 1994, 'journal': 'Canadian Journal of Gastroenterology', 'volume': Non

In [50]:
print(find_best_cite_info_fatcat_search2(cite_info6, elastic_search_6_1))
print(find_best_cite_info_fatcat_search2(cite_info7, elastic_search_7_1))
print(find_best_cite_info_fatcat_search2(cite_info7, elastic_search_7_2))






### Method 2 - Use Elastic Search's built-in 
- https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html
    - how to change/use functions for scores
    - explain different fields
    
- https://www.compose.com/articles/how-scoring-works-in-elasticsearch/
    - Tf-idf scoring: term frequency, inverse frequency 
    - normalization 
    - some explanations
    
- http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/package-summary.html#scoring
    - lucene 
    - can change query, similarity, weight, scorer
    - more theoretical 
    
- https://www.elastic.co/guide/en/elasticsearch/guide/current/controlling-relevance.html 
    - Boolean model of information retrieval to find matching
    - Use practical scoring (link 1) to calculate relevance
    - term frequency/inverse document frequency, the vector space model
    - more-modern features: coordination factor, field length normalization, and term or query clause boosting
    - Query-time boosting: tune relevance 
    - Function Score Query

#### Problem: Difficult to find uniform threshold 
- Example 1 score 65.9 = match 
    - Citation: {'journal': 'Nature', 'date': '2004-06', 'year': 2004, 'volume': '429', 'issue': '6994', 
       'title': 'Gene regulation and DNA damage in the ageing human brain', 'author': [], 
       'page': '883', 'url': '', 'external_ids': {}}
    - Search Result: {'work_id': 'hg2prp3rk5chtpvcs5cysmsq3y', 'score': 64.91316, 
        'title': 'Gene regulation and DNA damage in the ageing human brain', 'year': 2004, 
        'journal': 'Nature', 'volume': '429', 'issue': '6994', 'page': '883-891', 
        'author': ['Tao Lu', 'Ying Pan', 'Shyan-Yuan Kao', 'Cheng Li', 'Isaac Kohane', 'Jennifer Chan', 
        'Bruce A. Yankner'], 'url': 'https://web.archive.org/web/20100706102006/http://biosun1.harvard.edu/~cli/pdf/Lu_04.pdf'}
        
- Example 2 score 70.7 = no match 
    - Citation: {'journal': 'Proceedings of the National Academy of Sciences of the United States of America', 'date': '1998-02', 'year': 1998, 'volume': '95', 'issue': '3', 'title': 'The role of left prefrontal cortex in language and memory', 'author': [], 'page': '906', 'url': '', 'external_ids': {}}
    - Search Result (second value): {'work_id': 'rr5il275vnawfglg6zvvu6s64i', 'score': 70.74353, 'title': 'Memory fields of neurons in the primate prefrontal cortex', 'year': 1998, 'journal': 'Proceedings of the National Academy of Sciences of the United States of America', 'volume': '95', 'issue': '25', 'page': '15008-15013', 'author': ['G. Rainer', 'W. F. Asaad', 'E. K. Miller'], 'url': 'https://web.archive.org/web/20170814074904/http://www.cns.nyu.edu/~wendy/class/2006sp/reading10/Rainer_1998a.pdf'}
    
#### Solution: Script the Query String -- what would be the difference between that and my method one above?

## Main Function For Fatcat

### Method 1 

In [51]:
def process_citation_to_fatcat(citation, verbose = False):
    
    cite_info = parse_citation_data(citation)
    if verbose: print(cite_info)
    
    if cite_info["url"] != "":
        return "Has URL"
    
    if not cite_info["external_ids"]:
        return "Has external  DOI"
    
    search_result = search_fatcat_cite_info(cite_info)
    
    if search_result == []:
        return "No search result"
    
    if len(search_result) > 10:
        search_result = search_result[:10]
        
    closest_match = find_best_cite_info_fatcat_search(cite_info, search_result, verbose = verbose)
    
    if closest_match == "":
        return "No close match"
    
    url = closest_match["url"]
    if url == None:
        return "No URL in search result"
    if verbose: print(url)
        
    return url

#### We cannot add Wayback Urls to Wikipedia because it is not standard
#### So need to further process these generated urls or take another route

### Method #2 
#### link to scholar.archive.org pages! 
- old: (ex. https://scholar.archive.org/search?q=key:work_g23s7ktlbrhxhpcat34i4jnmme)
- new: https://scholar.archive.org/work/g23s7ktlbrhxhpcat34i4jnmme

In [52]:
def generate_scholars_archive_url(work_id):
    header = "https://scholar.archive.org/work/"
    content = work_id
    return header + content

### Verify with Web Scraping

In [89]:
from bs4 import BeautifulSoup

In [90]:
def has_link(url):
    res = requests.get(url, timeout = 20)
    soup = BeautifulSoup(res.content, "html.parser")
    dropdown = soup.find("div", attrs = {"class": "dropdown-menu"})
    h4 = dropdown.find("h4")
    
    
    
    if "Preserved Fulltext" not in h4.text:
        return "ehhh"
    p = dropdown.find("p")
    if "We don't yet know of a public preservation copy of this work." in p.text:
        return False
    if "has been preserved in the Wayback Machine." in p.text:
        return True
    
    return "maybe"

In [91]:
test_url1 = "https://scholar.archive.org/work/677annbiifaqbfyfaiqjxsjf5a"
has_link(test_url1)

False

In [92]:
test_url2 = "https://scholar.archive.org/work/smviomizcncc5cslifvbtuymfa"
has_link(test_url2)

True

In [93]:
test_url3 = "https://scholar.archive.org/work/oxvzp5qalre2thc6osx2v4eudy"
has_link(test_url3)

False

In [94]:
test_url4 = "https://scholar.archive.org/work/prjuj44mavbc5agm5bmgbpe5qy"
has_link(test_url4)

True

## Put it all together

In [53]:
def process_citation_to_scholars1(citation, verbose = False):
    
    cite_info = preprocessing_citation(citation)
    if type(cite_info) == str:
        return cite_info
    if verbose: print(cite_info)
    
    url_content = generate_url_content2(cite_info)
    search_result = search_fatcat_cite_info(url_content, verbose)
#     search_result = elastic_search_cite_info(url_content, verbose)
    
    if search_result == []:
        return "No search result"
    
    if len(search_result) > 10:
        search_result = search_result[:10]
        
    closest_match = find_best_cite_info_fatcat_search2(cite_info, search_result, verbose = verbose)
    
    if closest_match == "":
        return "No close match"
    
    url = closest_match["url"]
    if url == None:
        return "No URL in search result"
    
    work_id = closest_match["work_id"]
    if work_id == "":
        return "No work_id in search result"
    if verbose: print(work_id)
        
    return generate_scholars_archive_url(work_id)

In [95]:
def process_citation_to_scholars2(citation, verbose = False):
    
    cite_info = preprocessing_citation(citation)
    if type(cite_info) == str:
        return cite_info
    if verbose: print(cite_info)
    
    url_content = generate_url_content2(cite_info)
#     search_result = search_fatcat_cite_info(url_content, verbose)
    search_result = elastic_search_cite_info(url_content, verbose)
    
    if search_result == []:
        return "No search result"
    
    if len(search_result) > 10:
        search_result = search_result[:10]
        
    closest_match = find_best_cite_info_fatcat_search2(cite_info, search_result, verbose = verbose)
    
    if closest_match == "":
        return "No close match"
    
    url = closest_match["url"]
    if url == None:
        return "No URL in search result"
    
    work_id = closest_match["work_id"]
    if work_id == "":
        return "No work_id in search result"
    if verbose: print(work_id)
        
    url = generate_scholars_archive_url(work_id)
    if has_link(url):
        return url
    return "Search result has no url"

## Mass Experiment
### Data sample of 100

In [96]:
journal_dump = pd.read_json("tr.wikipedia.org.journal.20210621.json.gz", lines = True)
print(journal_dump.shape[0])
journal_dump.head(10)

14960


Unnamed: 0,a,c
0,1 + 2 + 3 + 4 + · · ·,{{Akademik dergi kaynağı\n| soyadı = Lepowsky ...
1,12 Victoria,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...
2,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...
3,141 Likya depremi,{{Akademik dergi kaynağı | url=http://blackmed...
4,141 Likya depremi,{{Akademik dergi kaynağı | url=http://www.nat-...
5,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=Earthquake sou...
6,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=Historical and...
7,1481 Rodos depremi,{{Akademik dergi kaynağı|url=http://jgs.geosci...
8,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=New approaches...
9,1481 Rodos depremi,{{Akademik dergi kaynağı|url=http://hal-insu.a...


In [97]:
journal_dump_100 = journal_dump.sample(n = 100)
journal_dump_100 = journal_dump_100.reset_index(drop = True)
journal_dump_100.head()

Unnamed: 0,a,c
0,Punk rap,{{Akademik dergi kaynağı|başlık=Move Over My C...
1,İnsan,{{Akademik dergi kaynağı | soyadı1 = Henshilwo...
2,Alonzo Church,"{{Akademik dergi kaynağı|yazar=Henkin, Leon|ba..."
3,Su mikrobiyolojisi,{{Akademik dergi kaynağı|url=http://www.annual...
4,Türkiye Cumhuriyeti'nin kadın bakanları listesi,{{Akademik dergi kaynağı|url=https://www.jstor...


In [58]:
# %%time
# journal_dump_100["scholars_result"] = journal_dump_100['c'].apply(lambda x: 
#                                                                   process_citation_to_scholars1(x, verbose = False))

In [98]:
%%time
journal_dump_100["scholars_result"] = journal_dump_100['c'].apply(lambda x: 
                                                                  process_citation_to_scholars2(x, verbose = False))

CPU times: user 3.84 s, sys: 876 ms, total: 4.71 s
Wall time: 1min 29s


In [99]:
journal_dump_100.head()

Unnamed: 0,a,c,scholars_result
0,Punk rap,{{Akademik dergi kaynağı|başlık=Move Over My C...,Err1
1,İnsan,{{Akademik dergi kaynağı | soyadı1 = Henshilwo...,Err1
2,Alonzo Church,"{{Akademik dergi kaynağı|yazar=Henkin, Leon|ba...",Err1
3,Su mikrobiyolojisi,{{Akademik dergi kaynağı|url=http://www.annual...,Err1
4,Türkiye Cumhuriyeti'nin kadın bakanları listesi,{{Akademik dergi kaynağı|url=https://www.jstor...,Err1


In [102]:
journal_dump_100_good = journal_dump_100[(journal_dump_100["scholars_result"] != "Err1") 
                                           & (journal_dump_100["scholars_result"] != "Err2")
                                         & (journal_dump_100["scholars_result"] != "No search result")
                                     & (journal_dump_100["scholars_result"] != "No close match") 
                                     & (journal_dump_100["scholars_result"] != "No URL in search result")]
journal_dump_100_good = journal_dump_100_good.reset_index(drop = True)
journal_dump_100_good

Unnamed: 0,a,c,scholars_result
0,Postojna Mağarası,{{Akademik dergi kaynağı|başlık=Speleotourism ...,https://scholar.archive.org/work/2fzrottcf5fgx...


In [107]:
print(journal_dump_100_good.loc[0, "a"])
test_cite = journal_dump_100_good.loc[0, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
# print(test_search_result[0])
# print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_100_good.loc[0, "scholars_result"])

Postojna Mağarası
{{Akademik dergi kaynağı|başlık=Speleotourism in Slovenia: balancing between mass tourism and geoheritage protection|yazarlar=Tičar|tarih=11 Ağustos 2018|sayı=1|dil=İngilizce|sayfalar=344-357|çalışma=Open Geosciences|cilt=10}}
{'journal': 'Open Geosciences', 'date': '2018-08', 'year': 2018, 'volume': '10', 'issue': '1', 'title': 'Speleotourism in Slovenia: balancing between mass tourism and geoheritage protection', 'author': ['Tičar'], 'page': '344', 'url': '', 'external_ids': {}}
{'work_id': '2fzrottcf5fgxkbftnyzryf5vi', 'score': 94.23294, 'title': 'speleotourism in slovenia: balancing between mass tourism and geoheritage protection', 'year': 2018, 'journal': 'open geosciences', 'volume': '10', 'issue': None, 'page': '344-357', 'author': ['J. Tičar', 'N. Tomić', 'M. Breg Valjavec', 'M. Zorn', 'S. B. Marković', 'M. B. Gavrilov'], 'url': 'https://web.archive.org/web/20200226210729/https://www.degruyter.com/downloadpdf/j/geo.2018.10.issue-1/geo-2018-0027/geo-2018-0027.p

### Data Sample 2000

In [145]:
journal_dump_2000 = journal_dump.sample(n = 2000)
journal_dump_2000 = journal_dump_2000.reset_index(drop = True)
journal_dump_2000.head()

Unnamed: 0,a,c
0,Kısa süreli bellek,{{Akademik dergi kaynağı|url=|başlık=The mind ...
1,Solar döngüler,{{Akademik dergi kaynağı|url=http://epic.awi.d...
2,Atom probu,{{Akademik dergi kaynağı|başlık=The Atom-Probe...
3,Latin Pop Albums,{{Akademik dergi kaynağı|başlık=1993: The Year...
4,Stroop etkisi,{{Akademik dergi kaynağı|url=|başlık=Stroop in...


In [146]:
# journal_dump_2000["fatcat_result"] = journal_dump_2000['c'].apply(lambda x: 
#                                                                   process_citation_to_fatcat(x, verbose = False))

In [147]:
# journal_dump_2000.head()

In [148]:
journal_dump_2000["scholars_result"] = journal_dump_2000['c'].apply(lambda x: 
                                                                  process_citation_to_scholars2(x, verbose = False))

In [149]:
journal_dump_2000.head()

Unnamed: 0,a,c,scholars_result
0,Kısa süreli bellek,{{Akademik dergi kaynağı|url=|başlık=The mind ...,https://scholar.archive.org/search?q=key:work_...
1,Solar döngüler,{{Akademik dergi kaynağı|url=http://epic.awi.d...,Err1
2,Atom probu,{{Akademik dergi kaynağı|başlık=The Atom-Probe...,Err1
3,Latin Pop Albums,{{Akademik dergi kaynağı|başlık=1993: The Year...,Err1
4,Stroop etkisi,{{Akademik dergi kaynağı|url=|başlık=Stroop in...,No close match


In [150]:
journals_failed_basic_filtering = journal_dump_2000[journal_dump_2000["scholars_result"] == "Err1"]
print(journals_failed_basic_filtering.shape)
journals_failed_basic_filtering.head()

(1779, 3)


Unnamed: 0,a,c,scholars_result
1,Solar döngüler,{{Akademik dergi kaynağı|url=http://epic.awi.d...,Err1
2,Atom probu,{{Akademik dergi kaynağı|başlık=The Atom-Probe...,Err1
3,Latin Pop Albums,{{Akademik dergi kaynağı|başlık=1993: The Year...,Err1
5,Türkistan hamam böceği,{{Akademik dergi kaynağı|başlık=Catalog and at...,Err1
6,Küçük Prens (2015 film),{{Akademik dergi kaynağı|url=http://dx.doi.org...,Err1


In [152]:
journals_failed_autourl_check = journal_dump_100[journal_dump_100["scholars_result"] == "Err2"]
print(journals_failed_autourl_check.shape)
journals_failed_autourl_check.head()

(1, 3)


Unnamed: 0,a,c,scholars_result
45,Büyük kütleli yıldızlar listesi,{{Akademik dergi kaynağı|başlık=High-mass star...,Err2


In [153]:
journal_dump_2000[journal_dump_2000["scholars_result"] == "No search result"]

Unnamed: 0,a,c,scholars_result
986,Dalton Transactions,{{Akademik dergi kaynağı|başlık=Acta Chemica S...,No search result
1169,Theoktisti,{{Akademik dergi kaynağı |soyadı=Guilland |ad=...,No search result
1376,Elizabeth Loftus,{{Akademik dergi kaynağı|başlık=Loftus Receive...,No search result
1378,Gvozdansko Kuşatması,{{Akademik dergi kaynağı|soyadı=Kekez|ad=Hrvoj...,No search result
1420,Ural-Yukagir dilleri,{{Akademik dergi kaynağı|başlık=Uralo-jukagiri...,No search result
1446,Melissa Rosenberg,{{Akademik dergi kaynağı|başlık=Dark Tales|yaz...,No search result


# TODO: look into how to generate more search results... 
# loosen search restrictions and do more filtering with what is returned?

In [154]:
# # print(journal_dump_2000.loc[12365, "a"])
# # test_cite = journal_dump_2000.loc[12365, "c"]
# # print(test_cite)
# # test_cite_info = parse_citation_data(test_cite)
# # print(test_cite_info)
# # test_search_result = search_fatcat_cite_info(test_cite_info, verbose = True)
# # print(test_search_result)

# test_cite_info2 = {'journal': 'American Journal of Physical Anthropology', 
#                    'date': '2003', 'year': 2003, 'volume': '122', 'issue': '', 
#                    'title': 'Exploring artificial cranial deformation using elliptic Fourier analysis of procrustes aligned outlines', 
#                    'author': ['Martin Frieß'], 'page': '11', 'url': '', 'doi': ''}
# test_search_result2 = search_fatcat_cite_info(test_cite_info2)
# # print(test_search_result2[0])
# test_closest_match2 = find_best_cite_info_fatcat_search(test_cite_info2, test_search_result2)
# print(test_closest_match2)


In [174]:
journal_dump_2000_no_url = journal_dump_2000[journal_dump_2000["scholars_result"] == "No URL in search result"]
print(journal_dump_2000_no_url.shape)
journal_dump_2000_no_url.head()

(54, 3)


Unnamed: 0,a,c,scholars_result
7,Afrika'da kölelik,{{Akademik dergi kaynağı|başlık=Contours of Sl...,No URL in search result
30,T Yardımcı Hücresi 17,{{Akademik dergi kaynağı|başlık=Preventative r...,No URL in search result
150,Afrika'da kölelik,{{Akademik dergi kaynağı|başlık=Trading in Sla...,No URL in search result
158,Gece yeme sendromu,{{Akademik dergi kaynağı|url=|başlık=Nighttime...,No URL in search result
162,Chi-Huey Wong,{{Akademik dergi kaynağı|başlık=Chemical-enzym...,No URL in search result


# TODO: look at pages of ones without urls in search result and figure out if they indeed have no valid links within them  

In [156]:
journal_dump_2000[journal_dump_2000["scholars_result"] == "No work_id in search result"]

Unnamed: 0,a,c,scholars_result


In [175]:
journal_dump_2000_no_match = journal_dump_2000[journal_dump_2000["scholars_result"] == "No close match"]
journal_dump_2000_no_match = journal_dump_2000_no_match.reset_index(drop = True)
print(journal_dump_2000_no_match.shape)
journal_dump_2000_no_match.head()

(107, 3)


Unnamed: 0,a,c,scholars_result
0,Stroop etkisi,{{Akademik dergi kaynağı|url=|başlık=Stroop in...,No close match
1,Görme,{{Akademik dergi kaynağı|başlık=Who Is the Fou...,No close match
2,Güneş Sistemi'ndeki kütleçekimsel yuvarlak nes...,"{{Akademik dergi kaynağı|başlık=The size, dens...",No close match
3,Chi-Huey Wong,{{Akademik dergi kaynağı|başlık=Toward Automat...,No close match
4,Solaklara karşı önyargı,{{Akademik dergi kaynağı|başlık=Sinistrality a...,No close match


In [160]:
print(journal_dump_2000_no_match.loc[0, "a"])
test_cite = journal_dump_2000_no_match.loc[0, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
# print(test_search_result[0])
# print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_no_match.loc[0, "scholars_result"])

Stroop etkisi
{{Akademik dergi kaynağı|url=|başlık=Stroop interference and attention-deficit/hyperactivity disorder: a review and meta-analysis|tarih=Mart 2007|sayı=2|sayfalar=251-62|çalışma=Neuropsychology|cilt=21}}
{'journal': 'Neuropsychology', 'date': '2007-03', 'year': 2007, 'volume': '21', 'issue': '2', 'title': 'Stroop interference and attention-deficit/hyperactivity disorder: a review and meta-analysis', 'author': [], 'page': '251', 'url': '', 'external_ids': {}}

No close match


In [162]:
journal_dump_2000_good = journal_dump_2000[(journal_dump_2000["scholars_result"] != "Err1") 
                                           & (journal_dump_2000["scholars_result"] != "Err2")
                                         & (journal_dump_2000["scholars_result"] != "No search result")
                                     & (journal_dump_2000["scholars_result"] != "No close match") 
                                     & (journal_dump_2000["scholars_result"] != "No URL in search result")]
journal_dump_2000_good = journal_dump_2000_good.reset_index(drop = True)
journal_dump_2000_good

Unnamed: 0,a,c,scholars_result
0,Kısa süreli bellek,{{Akademik dergi kaynağı|url=|başlık=The mind ...,https://scholar.archive.org/search?q=key:work_...
1,Prososyal davranış,{{Akademik dergi kaynağı|url=|başlık=Moral emo...,https://scholar.archive.org/search?q=key:work_...
2,Joshua Coon,{{Akademik dergi kaynağı|başlık=Focus in Honor...,https://scholar.archive.org/search?q=key:work_...
3,Nöroplastisite,{{Akademik dergi kaynağı|başlık=Long-term medi...,https://scholar.archive.org/search?q=key:work_...
4,Hinokitiol,{{Akademik dergi kaynağı|başlık=Acavenging act...,https://scholar.archive.org/search?q=key:work_...
5,Chi-Huey Wong,{{Akademik dergi kaynağı|başlık=A common glyca...,https://scholar.archive.org/search?q=key:work_...
6,Prosopagnozi,{{Akademik dergi kaynağı|başlık=The Cambridge ...,https://scholar.archive.org/search?q=key:work_...
7,Nöroplastisite,"{{Akademik dergi kaynağı|başlık=Acquiring ""the...",https://scholar.archive.org/search?q=key:work_...
8,Psikolojik adaptasyon,{{Akademik dergi kaynağı|url=|başlık=Universal...,https://scholar.archive.org/search?q=key:work_...
9,Fototaksi,{{Akademik dergi kaynağı|başlık=Mechanism of p...,https://scholar.archive.org/search?q=key:work_...


In [164]:
print(journal_dump_2000_good.loc[0, "a"])
test_cite = journal_dump_2000_good.loc[0, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = elastic_search_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[0, "scholars_result"])

Kısa süreli bellek
{{Akademik dergi kaynağı|url=|başlık=The mind and brain of short-term memory|yazarlar=Jonides|sayı=|sayfalar=193-224|çalışma=Annual Review of Psychology|yıl=2008|cilt=59}}
{'journal': 'Annual Review of Psychology', 'date': '2008', 'year': 2008, 'volume': '59', 'issue': '', 'title': 'The mind and brain of short-term memory', 'author': ['Jonides'], 'page': '193', 'url': '', 'external_ids': {}}
{'work_id': 'smviomizcncc5cslifvbtuymfa', 'score': 55.42295, 'title': 'The Mind and Brain of Short-Term Memory', 'year': 2008, 'journal': 'Annual Review of Psychology', 'volume': '59', 'issue': None, 'page': '193-224', 'author': ['John Jonides', 'Richard L. Lewis', 'Derek Evan Nee', 'Cindy A. Lustig', 'Marc G. Berman', 'Katherine Sledge Moore'], 'url': 'https://web.archive.org/web/20131230075012/http://ling.umd.edu//~ellenlau/courses/nacs642/Jonides_2007.pdf'}
journal exact match: True
journal partial match: False
volume match: True
issue match: False
page match: True
year match:

In [168]:
print(journal_dump_2000_good.loc[1, "a"])
test_cite = journal_dump_2000_good.loc[1, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = elastic_search_cite_info(test_url_content)
# print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[1, "scholars_result"])

Prososyal davranış
{{Akademik dergi kaynağı|url=|başlık=Moral emotions and moral behavior|yazarlar=Tangney|sayı=|sayfalar=345-372|çalışma=Annual Review of Psychology|yıl=2007|cilt=58}}
{'journal': 'Annual Review of Psychology', 'date': '2007', 'year': 2007, 'volume': '58', 'issue': '', 'title': 'Moral emotions and moral behavior', 'author': ['Tangney'], 'page': '345', 'url': '', 'external_ids': {}}
journal exact match: True
journal partial match: False
volume match: True
issue match: False
page match: True
year match: True
title exact match: True
title partial match: False
author exact match: False
scores:
journal match score: 0
title match score: 0
author match score: 0.3333333333333333
journal exact, title exact, 1 other
True
{'work_id': '36y3v43ffraxdljatk7qg4d4em', 'score': 60.384995, 'title': 'moral emotions and moral behavior', 'year': 2007, 'journal': 'annual review of psychology', 'volume': '58', 'issue': None, 'page': '345-372', 'author': ['June Price Tangney', 'Jeff Stuewig',

In [187]:
print(journal_dump_2000_good.loc[2, "a"])
test_cite = journal_dump_2000_good.loc[2, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[2, "scholars_result"])

Nöroplastisite
{{Akademik dergi kaynağı|başlık=Navigation-related structural change in the hippocampi of taxi drivers|tarih=Nisan 2000|sayı=8|sayfalar=4398-403|çalışma=Proceedings of the National Academy of Sciences of the United States of America|cilt=97}}
{'journal': 'Proceedings of the National Academy of Sciences of the United States of America', 'date': '2000-04', 'year': 2000, 'volume': '97', 'issue': '8', 'title': 'Navigation-related structural change in the hippocampi of taxi drivers', 'author': [], 'page': '4398', 'url': '', 'external_ids': {}}
{'work_id': 'suhffvny7rd3tfqcxtnvltzilm', 'score': 93.54502, 'title': 'Navigation-related structural change in the hippocampi of taxi drivers', 'year': 2000, 'journal': 'Proceedings of the National Academy of Sciences of the United States of America', 'volume': '97', 'issue': None, 'page': '4398-4403', 'author': ['E. A. Maguire', 'D. G. Gadian', 'I. S. Johnsrude', 'C. D. Good', 'J. Ashburner', 'R. S. J. Frackowiak', 'C. D. Frith'], 'url

In [188]:
print(journal_dump_2000_good.loc[3, "a"])
test_cite = journal_dump_2000_good.loc[3, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[3, "scholars_result"])

Nöroplastisite
{{Akademik dergi kaynağı|başlık=Somatosensory cortical plasticity in carpal tunnel syndrome--a cross-sectional fMRI evaluation|tarih=Haziran 2006|sayı=2|sayfalar=520-30|çalışma=NeuroImage|cilt=31}}
{'journal': 'NeuroImage', 'date': '2006-06', 'year': 2006, 'volume': '31', 'issue': '2', 'title': 'Somatosensory cortical plasticity in carpal tunnel syndrome--a cross-sectional fMRI evaluation', 'author': [], 'page': '520', 'url': '', 'external_ids': {}}
{'work_id': 'j3l2d6t7yve3nnovxzo5dtfz2a', 'score': 85.35768, 'title': 'Somatosensory cortical plasticity in carpal tunnel syndrome—a cross-sectional fMRI evaluation', 'year': 2006, 'journal': 'NeuroImage', 'volume': '31', 'issue': None, 'page': '520-530', 'author': ['Vitaly Napadow', 'Norman Kettner', 'Angela Ryan', 'Kenneth K. Kwong', 'Joseph Audette', 'Kathleen K.S. Hui'], 'url': 'https://web.archive.org/web/20080221200946/http://www.nmr.mgh.harvard.edu/~vitaly/PDF/napadow_NI_2006.pdf'}
journal exact match: True
journal par

In [190]:
print(journal_dump_2000_good.loc[4, "a"])
test_cite = journal_dump_2000_good.loc[4, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[4, "scholars_result"])

Prosopagnozi
{{Akademik dergi kaynağı|başlık=The Cambridge Face Memory Test: results for neurologically intact individuals and an investigation of its validity using inverted face stimuli and prosopagnosic participants|sayı=4|sayfalar=576-85|çalışma=Neuropsychologia|yıl=2006|cilt=44}}
{'journal': 'Neuropsychologia', 'date': '2006', 'year': 2006, 'volume': '44', 'issue': '4', 'title': 'The Cambridge Face Memory Test: results for neurologically intact individuals and an investigation of its validity using inverted face stimuli and prosopagnosic participants', 'author': [], 'page': '576', 'url': '', 'external_ids': {}}
{'work_id': 'n7j7eyfytbdyveqjocsuen7qva', 'score': 135.86821, 'title': 'The Cambridge Face Memory Test: Results for neurologically intact individuals and an investigation of its validity using inverted face stimuli and prosopagnosic participants', 'year': 2006, 'journal': 'Neuropsychologia', 'volume': '44', 'issue': None, 'page': '576-585', 'author': ['Brad Duchaine', 'Ken 

In [191]:
print(journal_dump_2000_good.loc[5, "a"])
test_cite = journal_dump_2000_good.loc[5, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[5, "scholars_result"])

Dağ baştankarası
{{Akademik dergi kaynağı|başlık=Delayed cost of reproduction and senescence in the willow tit ''Parus montanus''|yazarlar=Orell|tarih=2002|sayı=1|sayfalar=55-64|çalışma=Journal of Animal Ecology|cilt=71}}
{'journal': 'Journal of Animal Ecology', 'date': '2002', 'year': 2002, 'volume': '71', 'issue': '1', 'title': "Delayed cost of reproduction and senescence in the willow tit ''Parus montanus''", 'author': ['Orell'], 'page': '55', 'url': '', 'external_ids': {}}
{'work_id': 't3iguxcwvrcdhm262gzautkfzi', 'score': 93.76954, 'title': 'Delayed cost of reproduction and senescence in the willow tit Parus montanus', 'year': 2002, 'journal': 'Journal of Animal Ecology', 'volume': '71', 'issue': None, 'page': '55-64', 'author': ['Markku Orell', 'Eduardo J. Belda'], 'url': 'https://web.archive.org/web/20200320190823/http://personales.upv.es/~edbelpe/orell%20and%20belda%202002.pdf'}
journal exact match: True
journal partial match: False
volume match: True
issue match: False
page ma

In [192]:
print(journal_dump_2000_good.loc[6, "a"])
test_cite = journal_dump_2000_good.loc[6, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[6, "scholars_result"])

Nöroplastisite
{{Akademik dergi kaynağı|başlık=Reorganization of remote cortical regions after ischemic brain injury: a potential substrate for stroke recovery|tarih=Haziran 2003|sayı=6|sayfalar=3205-14|çalışma=Journal of Neurophysiology|cilt=89}}
{'journal': 'Journal of Neurophysiology', 'date': '2003-06', 'year': 2003, 'volume': '89', 'issue': '6', 'title': 'Reorganization of remote cortical regions after ischemic brain injury: a potential substrate for stroke recovery', 'author': [], 'page': '3205', 'url': '', 'external_ids': {}}
{'work_id': 'ufvbbbfrjfd3jlzeobnyd4b4qa', 'score': 78.644455, 'title': 'Reorganization of Remote Cortical Regions After Ischemic Brain Injury: A  Potential Substrate for Stroke Recovery', 'year': 2003, 'journal': 'Journal of Neurophysiology', 'volume': '89', 'issue': None, 'page': '3205-3214', 'author': ['S. B. Frost', 'S. Barbay', 'K. M. Friel', 'E. J. Plautz', 'R. J. Nudo'], 'url': 'https://web.archive.org/web/20170902140404/http://jn.physiology.org/conte

In [193]:
print(journal_dump_2000_good.loc[7, "a"])
test_cite = journal_dump_2000_good.loc[7, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[7, "scholars_result"])

Bölünmüş beyin
{{Akademik dergi kaynağı|url=|başlık=Mike or me? Self-recognition in a split-brain patient|yazarlar=Turk D. J.|sayı=9|sayfalar=841-842|çalışma=Nature Neuroscience|yıl=2002|cilt=5}}
{'journal': 'Nature Neuroscience', 'date': '2002', 'year': 2002, 'volume': '5', 'issue': '9', 'title': 'Mike or me? Self-recognition in a split-brain patient', 'author': ['Turk D. J.'], 'page': '841', 'url': '', 'external_ids': {}}
{'work_id': 'xwkbzwq7xzfpta74wmx2wpaeyy', 'score': 65.82822, 'title': 'Mike or me? Self-recognition in a split-brain patient', 'year': 2002, 'journal': 'Nature Neuroscience', 'volume': None, 'issue': None, 'page': '841-842', 'author': ['David J. Turk', 'Todd F. Heatherton', 'William M. Kelley', 'Margaret G. Funnell', 'Michael S. Gazzaniga', 'C. Neil Macrae'], 'url': 'https://web.archive.org/web/20081216162433/http://www.webpages.uidaho.edu/~bdyre/psyc526/Turk%20et%20al.%202002.pdf'}
journal exact match: True
journal partial match: False
volume match: False
issue mat

In [194]:
print(journal_dump_2000_good.loc[8, "a"])
test_cite = journal_dump_2000_good.loc[8, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[8, "scholars_result"])

John R. Yates
{{Akademik dergi kaynağı|url=|başlık=Large-scale analysis of the yeast proteome by multidimensional protein identification technology|erişimtarihi=|yazarlar=Washburn|tarih=Mart 2001|sayı=3|dil=En|sayfalar=242-247|çalışma=Nature Biotechnology|yayıncı=|cilt=19}}
{'journal': 'Nature Biotechnology', 'date': '2001-03', 'year': 2001, 'volume': '19', 'issue': '3', 'title': 'Large-scale analysis of the yeast proteome by multidimensional protein identification technology', 'author': ['Washburn'], 'page': '242', 'url': '', 'external_ids': {}}
{'work_id': 'y7skt4khpff2zktij4n2uax4s4', 'score': 73.541626, 'title': 'Large-scale analysis of the yeast proteome by multidimensional protein identification technology', 'year': 2001, 'journal': 'Nature Biotechnology', 'volume': '19', 'issue': None, 'page': '242-247', 'author': ['Michael P. Washburn', 'Dirk Wolters', 'John R. Yates'], 'url': 'https://web.archive.org/web/20170808014930/http://arep.med.harvard.edu/pdf/Washburn01.pdf'}
journal e

In [195]:
print(journal_dump_2000_good.loc[9, "a"])
test_cite = journal_dump_2000_good.loc[9, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[9, "scholars_result"])

Stroop etkisi
{{Akademik dergi kaynağı|başlık=Stroop Performance in Normal Control Subjects: An fMRI Study|yazarlar=Gruber|yazarlarıgöster=etal|sayı=2|sayfalar=349-360|çalışma=NeuroImage|yıl=2002|cilt=16}}
{'journal': 'NeuroImage', 'date': '2002', 'year': 2002, 'volume': '16', 'issue': '2', 'title': 'Stroop Performance in Normal Control Subjects: An fMRI Study', 'author': ['Gruber'], 'page': '349', 'url': '', 'external_ids': {}}
{'work_id': 'kzeu5vo7jfc4xkcosbwsp22fsq', 'score': 55.944572, 'title': 'Stroop Performance in Normal Control Subjects: An fMRI Study', 'year': 2002, 'journal': 'NeuroImage', 'volume': '16', 'issue': None, 'page': '349-360', 'author': ['Staci A. Gruber', 'Jadwiga Rogowska', 'Philip Holcomb', 'Salvatore Soraci', 'Deborah Yurgelun-Todd'], 'url': 'https://web.archive.org/web/201902221955/http://pdfs.semanticscholar.org/398c/39e26ad26ebb6965f9646c7e6591e63b211c.pdf'}
journal exact match: True
journal partial match: False
volume match: True
issue match: False
page ma

In [198]:
print(journal_dump_2000_good.loc[10, "a"])
test_cite = journal_dump_2000_good.loc[10, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[10, "scholars_result"])

Rüzgâr türbini tasarımı
{{Akademik dergi kaynağı|başlık=Alternative Composite Materials for Megawatt-Scale Wind Turbine Blades: Design Considerations and Recommended Testing|yazarlar=Griffin, Dayton A.|sayı=4|sayfa=515|çalışma=Journal of Solar Energy Engineering|yıl=2003|cilt=125}}
{'journal': 'Journal of Solar Energy Engineering', 'date': '2003', 'year': 2003, 'volume': '125', 'issue': '4', 'title': 'Alternative Composite Materials for Megawatt-Scale Wind Turbine Blades: Design Considerations and Recommended Testing', 'author': ['Griffin, Dayton A.'], 'page': '515', 'url': '', 'external_ids': {}}
{'work_id': 'oxvzp5qalre2thc6osx2v4eudy', 'score': 93.024704, 'title': 'Alternative Composite Materials for Megawatt-Scale Wind Turbine Blades: Design Considerations and Recommended Testing', 'year': 2003, 'journal': 'Journal of solar energy engineering', 'volume': '125', 'issue': None, 'page': '515', 'author': ['Thomas D. Ashwill'], 'url': 'https://web.archive.org/web/20060924132607/http://w

In [199]:
print(journal_dump_2000_good.loc[11, "a"])
test_cite = journal_dump_2000_good.loc[11, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[11, "scholars_result"])

Güvenli seks
{{Akademik dergi kaynağı|başlık=Abstinence and abstinence-only education|yazarlar=Ott|tarih=Ekim 2007|sayı=5|sayfalar=446-52|çalışma=Current Opinion in Obstetrics and Gynecology|cilt=19}}
{'journal': 'Current Opinion in Obstetrics and Gynecology', 'date': '2007-10', 'year': 2007, 'volume': '19', 'issue': '5', 'title': 'Abstinence and abstinence-only education', 'author': ['Ott'], 'page': '446', 'url': '', 'external_ids': {}}
{'work_id': 'h3o763e4sjgunellktyoq2iaee', 'score': 79.328804, 'title': 'Abstinence and abstinence-only education', 'year': 2007, 'journal': 'Current Opinion in Obstetrics and Gynecology', 'volume': '19', 'issue': None, 'page': '446-452', 'author': ['Mary A Ott', 'John S Santelli'], 'url': 'https://web.archive.org/web/20200310173144/https://scholarworks.iupui.edu/bitstream/handle/1805/17056/nihms208289.pdf;jsessionid=E625B6D46FB847676A9932DA8D061F4F?sequence=1'}
journal exact match: True
journal partial match: False
volume match: True
issue match: False

In [200]:
print(journal_dump_2000_good.loc[12, "a"])
test_cite = journal_dump_2000_good.loc[12, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[12, "scholars_result"])

Ruedi Aebersold
{{Akademik dergi kaynağı|başlık=Quantitative analysis of complex protein mixtures using isotope-coded affinity tags|tarih=Ekim 1999|sayı=10|sayfalar=994-9|çalışma=Nature Biotechnology|cilt=17}}
{'journal': 'Nature Biotechnology', 'date': '1999-10', 'year': 1999, 'volume': '17', 'issue': '10', 'title': 'Quantitative analysis of complex protein mixtures using isotope-coded affinity tags', 'author': [], 'page': '994', 'url': '', 'external_ids': {}}
{'work_id': 'sswnnewfavdsjaahqrlucdpkaq', 'score': 80.415726, 'title': 'Quantitative analysis of complex protein mixtures using isotope-coded affinity tags', 'year': 1999, 'journal': 'Nature Biotechnology', 'volume': '17', 'issue': '10', 'page': '994-999', 'author': ['Steven P. Gygi', 'Beate Rist', 'Scott A. Gerber', 'Frantisek Turecek', 'Michael H. Gelb', 'Ruedi Aebersold'], 'url': 'https://web.archive.org/web/20170808222315/http://llama.mshri.on.ca/courses/Biophysics205/Papers/Gygi_1999.pdf'}
journal exact match: True
journal 

In [201]:
print(journal_dump_2000_good.loc[13, "a"])
test_cite = journal_dump_2000_good.loc[13, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[13, "scholars_result"])

Stroop etkisi
{{Akademik dergi kaynağı|başlık=Practice-related Effects Demonstrate Complementary Roles Of Anterior Cingulate And Prefrontal Cortices In Attentional Control|yazarlar=Milham|sayı=2|sayfalar=483-493|çalışma=NeuroImage|yıl=2003|cilt=18}}
{'journal': 'NeuroImage', 'date': '2003', 'year': 2003, 'volume': '18', 'issue': '2', 'title': 'Practice-related Effects Demonstrate Complementary Roles Of Anterior Cingulate And Prefrontal Cortices In Attentional Control', 'author': ['Milham'], 'page': '483', 'url': '', 'external_ids': {}}
{'work_id': 'hxi64jgwl5g47n5k55zhqlkepe', 'score': 71.04921, 'title': 'Practice-related effects demonstrate complementary roles of anterior cingulate and prefrontal cortices in attentional control☆☆This study was supported by the Beckman Institute for Advanced Science and Technology at the University of Illinois, Urbana-Champaign; Carle Clinic, Urbana, Illinois; and NIMH MD/PhD predoctoral National Research Service Award provided support to M.P.M. (MH124

In [203]:
fuzz.partial_ratio("Practice-related Effects Demonstrate Complementary Roles Of Anterior Cingulate And Prefrontal Cortices In Attentional Control", "Practice-related effects demonstrate complementary roles of anterior cingulate and prefrontal cortices in attentional control☆☆This study was supported by the Beckman Institute for Advanced Science and Technology at the University of Illinois, Urbana-Champaign; Carle Clinic, Urbana, Illinois; and NIMH MD/PhD predoctoral National Research Service Award provided support to M.P.M. (MH12415-01).")

90

In [204]:
print(journal_dump_2000_good.loc[14, "a"])
test_cite = journal_dump_2000_good.loc[14, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[14, "scholars_result"])

T Yardımcı Hücresi 17
{{Akademik dergi kaynağı|başlık=A human colonic commensal promotes colon tumorigenesis via activation of T helper type 17 T cell responses|tarih=Eylül 2009|sayı=9|sayfalar=1016-22|çalışma=Nature Medicine|cilt=15}}
{'journal': 'Nature Medicine', 'date': '2009-09', 'year': 2009, 'volume': '15', 'issue': '9', 'title': 'A human colonic commensal promotes colon tumorigenesis via activation of T helper type 17 T cell responses', 'author': [], 'page': '1016', 'url': '', 'external_ids': {}}
{'work_id': 'n4vbzetjabdjlinqbx2hehjpvu', 'score': 99.53417, 'title': 'A human colonic commensal promotes colon tumorigenesis via activation of T helper type 17 T cell responses', 'year': 2009, 'journal': 'Nature Medicine', 'volume': '15', 'issue': None, 'page': '1016-1022', 'author': ['Shaoguang Wu', 'Ki-Jong Rhee', 'Emilia Albesiano', 'Shervin Rabizadeh', 'Xinqun Wu', 'Hung-Rong Yen', 'David L Huso', 'Frederick L Brancati', 'Elizabeth Wick', 'Florencia McAllister', 'Franck Housseau',

In [205]:
print(journal_dump_2000_good.loc[15, "a"])
test_cite = journal_dump_2000_good.loc[15, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[15, "scholars_result"])

Pelvik inflamatuar hastalık
{{Akademik dergi kaynağı|başlık=Pelvic inflammatory disease: current concepts in pathogenesis, diagnosis and treatment.|yazarlar=Mitchell|tarih=Aralık 2013|sayı=4|sayfalar=793-809|çalışma=Infectious Disease Clinics of North America|cilt=27}}
{'journal': 'Infectious Disease Clinics of North America', 'date': '2013-12', 'year': 2013, 'volume': '27', 'issue': '4', 'title': 'Pelvic inflammatory disease: current concepts in pathogenesis, diagnosis and treatment.', 'author': ['Mitchell'], 'page': '793', 'url': '', 'external_ids': {}}
{'work_id': 'quoxpsjky5co3k4fp7iccyfhai', 'score': 54.00481, 'title': 'Pelvic Inflammatory Disease', 'year': 2013, 'journal': 'Infectious Disease Clinics of North America', 'volume': '27', 'issue': None, 'page': '793-809', 'author': ['Caroline Mitchell', 'Malavika Prabhu'], 'url': 'https://web.archive.org/web/20200205230301/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC3843151&blobtype=pdf'}
journal exact match: True
journal 

In [206]:
print(journal_dump_2000_good.loc[16, "a"])
test_cite = journal_dump_2000_good.loc[16, "c"]
print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)
print(journal_dump_2000_good.loc[16, "scholars_result"])

T Yardımcı Hücresi 17
{{Akademik dergi kaynağı|başlık=T helper 17 cells promote cytotoxic T cell activation in tumor immunity|tarih=Kasım 2009|sayı=5|sayfalar=787-98|çalışma=Immunity|cilt=31}}
{'journal': 'Immunity', 'date': '2009-11', 'year': 2009, 'volume': '31', 'issue': '5', 'title': 'T helper 17 cells promote cytotoxic T cell activation in tumor immunity', 'author': [], 'page': '787', 'url': '', 'external_ids': {}}
{'work_id': 'sb5a5fmlabfh7ne36udfj2ydqi', 'score': 84.668045, 'title': 'T Helper 17 Cells Promote Cytotoxic T Cell Activation in Tumor Immunity', 'year': 2009, 'journal': 'Immunity', 'volume': '31', 'issue': None, 'page': '787-798', 'author': ['Natalia Martin-Orozco', 'Pawel Muranski', 'Yeonseok Chung', 'Xuexian O. Yang', 'Tomohide Yamazaki', 'Sijie Lu', 'Patrick Hwu', 'Nicholas P. Restifo', 'Willem W. Overwijk', 'Chen Dong'], 'url': 'https://web.archive.org/web/20190303235119/https://core.ac.uk/download/pdf/82452021.pdf'}
journal exact match: True
journal partial match

In [178]:
test_cite = "{{Akademik dergi kaynağı|başlık=Alternative Composite Materials for Megawatt-Scale Wind Turbine Blades: Design Considerations and Recommended Testing|yazarlar=Griffin, Dayton A.|sayı=4|sayfa=515|çalışma=Journal of Solar Energy Engineering|yıl=2003|cilt=125}}"

print(test_cite)
test_cite_info = parse_citation_data(test_cite)
print(test_cite_info)
test_url_content = generate_url_content2(test_cite_info)
test_search_result = search_fatcat_cite_info(test_url_content, True)
print(test_search_result[0])
print(fatcat_check_match(test_cite_info, test_search_result[0], verbose = True))
test_closest_match = find_best_cite_info_fatcat_search2(test_cite_info, test_search_result)
print(test_closest_match)

test_url = process_citation_to_scholars2(test_cite)
print(test_url)

{{Akademik dergi kaynağı|başlık=Alternative Composite Materials for Megawatt-Scale Wind Turbine Blades: Design Considerations and Recommended Testing|yazarlar=Griffin, Dayton A.|sayı=4|sayfa=515|çalışma=Journal of Solar Energy Engineering|yıl=2003|cilt=125}}
{'journal': 'Journal of Solar Energy Engineering', 'date': '2003', 'year': 2003, 'volume': '125', 'issue': '4', 'title': 'Alternative Composite Materials for Megawatt-Scale Wind Turbine Blades: Design Considerations and Recommended Testing', 'author': ['Griffin, Dayton A.'], 'page': '515', 'url': '', 'external_ids': {}}
https://search.fatcat.wiki/fatcat_release/_search?q=%28container_name%3AJournal%20of%20Solar%20Energy%20Engineering%29AND%28title%3AAlternative%20Composite%20Materials%20for%20Megawatt-Scale%20Wind%20Turbine%20Blades%20Design%20Considerations%20and%20Recommended%20Testing%29AND%28release_year%3A2003%29
{'work_id': 'oxvzp5qalre2thc6osx2v4eudy', 'score': 107.28194, 'title': 'Alternative Composite Materials for Megawat

In [169]:
import gspread
from gspread_dataframe import set_with_dataframe

In [170]:
gc = gspread.service_account("service_account.json")

sh = gc.open("Citations SIM Test")

print(sh.sheet1.get('A1'))

[['a']]


In [173]:
# ACCESS GOOGLE SHEET
gc = gspread.service_account(filename='service_account.json')
sh = gc.open_by_key('1ih5bIk5_d5WLEtArRzEFPzPlwZVA_BI-O8kSxWH1rNU')
worksheet = sh.get_worksheet(5) #-> 0 - first sheet, 1 - second sheet etc. 

# APPEND DATA TO SHEET
df_to_write = journal_dump_2000_good[["a", "scholars_result", "c"]]
set_with_dataframe(worksheet, df_to_write) #-> THIS EXPORTS YOUR DATAFRAME TO THE GOOGLE SHEET