# Citation Preprocessing

- Preprocess citations to obtain desired fields before making searches with SIM and fatcat 
- Consists of 
    - parsing for different languages 
    - basic filtering 
    - autourl verfication 
    - data normalization 

## Citation Parsing

In [1]:
import dateparser
import re

In [2]:
### change this part for different alias for different languages 
journal_aliases = ['journal', 'newspaper', 'magazine', 'work','website',  'periodical', 
                       'encyclopedia', 'encyclopaedia', 'dictionary', 'mailinglist','dergi', 'gazete', 
                       'eser', 'çalışma', 'iş', 'websitesi', 'süreliyayın', 'ansiklopedi', 'sözlük', 'program']
    
date_aliases = ['date', 'air-date', 'airdate', 'tarih']
    
year_aliases = ['year', 'yıl', 'sene']
    
volume_aliases = ['volume', 'cilt']
    
issue_aliases = ['issue', 'number', 'sayı', 'numara']

page_aliases = ['p', 'page', 's', 'sayfa']
pages_aliases = ['pp', 'pages', 'ss', 'sayfalar']

url_aliases = ['url', 'URL', 'katkı-url', 'chapter-url', 'contribution-url', 'entry-url', 
               'article-url', 'section-url']

title_aliases= ['title', 'başlık']

ext_id_aliases = ['pmid', 'PMID', 'jstor', 'doi', 'DOI', 'ISBN', 'pmc']

In [8]:
author_aliases = ['authors', 'people', 'credits', 'host', 'yazarlar', 'yazars', 'katkıdabulunanlar', 
                  'muhataplar', 'kişiler']

first_name_aliases = ["first#", "given#", "author-first#", "author#-first", "ad#", "ilk#", "muhatapadı#"]

last_name_aliases = ["last#", "author#", "surname#", "author-last#", "author#-last", "subject#", "son#", 
                     "soyadı#", "yazar#", "muhatap#", "muhatapsoyadı#", "özne#", "süje#", "konu#"]

### There's more aliases for different languages.
- Should I separate things out into languages and detect language first before setting this? 

In [9]:
from textblob import TextBlob

In [10]:
## test hypothesis 
test_cite = "{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}"
b = TextBlob(test_cite)
b.detect_language()

'tr'

In [14]:
def get_aliases(citation):
    b = TextBlob(citation)
    lang = b.detect_language()
    
    ### Default is in english and turkish
    journal_aliases = ['journal', 'newspaper', 'magazine', 'work','website',  'periodical', 
                       'encyclopedia', 'encyclopaedia', 'dictionary', 'mailinglist', 'dergi', 'gazete', 
                       'eser', 'çalışma', 'iş', 'websitesi', 'süreliyayın', 'ansiklopedi', 'sözlük', 'program']
    date_aliases = ['date', 'air-date', 'airdate', 'tarih']
    year_aliases = ['year', 'yıl', 'sene']
    volume_aliases = ['volume', 'cilt']
    issue_aliases = ['issue', 'number', 'sayı', 'numara']
    page_aliases = ['p', 'page', 's', 'sayfa']
    pages_aliases = ['pp', 'pages', 'ss', 'sayfalar']
    url_aliases = ['url', 'URL', 'katkı-url', 'chapter-url', 'contribution-url', 'entry-url', 
               'article-url', 'section-url']
    title_aliases= ['title', 'başlık']
    ext_id_aliases = ['pmid', 'PMID', 'jstor', 'doi', 'DOI', 'isbn', 'ISBN', 'pmc', "oclc", "OCLC", "lccn", "LCCN"]
    author_aliases = ['authors', 'people', 'credits', 'host', 'yazarlar', 'yazars', 'katkıdabulunanlar', 
                  'muhataplar', 'kişiler']
    first_name_aliases = ["first#", "given#", "author-first#", "author#-first", "ad#", "ilk#", "muhatapadı#"]
    last_name_aliases = ["last#", "author#", "surname#", "author-last#", "author#-last", "subject#", "son#", 
                     "soyadı#", "yazar#", "muhatap#", "muhatapsoyadı#", "özne#", "süje#", "konu#"]
    
    this_journal = []
    this_date = []
    this_year = []
    this_volume = []
    this_issue = []
    this_page = []
    this_pages = []
    this_title = []
    this_author = []
    this_first_name = []
    this_last_name = []
    
    
    if lang == "nl":
        this_journal = []
        this_date = ["datum"]
        this_year = []
        this_volume = ["version", "edition", "editie"]
        this_issue = []
        this_page = []
        this_pages = ["pagina's", "paginas"]
        this_title = ["titel"]
        this_author = ["auteur", "auteur#", "medeauteurs#", "editor#"]
        this_first_name = ["voornaam#"]
        this_last_name = ["achternaam#"]
    
#     ### template for adding in new languages
#     if lang == "":
#         this_journal = []
#         this_date = []
#         this_year = []
#         this_volume = []
#         this_issue = []
#         this_page = []
#         this_pages = []
#         this_title = []
#         this_author = []
#         this_first_name = []
#         this_last_name = []
        
    
    journal_aliases.extend(this_journal)
    date_aliases.extend(this_date) 
    year_aliases.extend(this_year) 
    volume_aliases.extend(this_volume) 
    issue_aliases.extend(this_issue)
    page_aliases.extend(this_page) 
    pages_aliases.extend(this_pages)
    title_aliases.extend(this_title) 
    author_aliases.extend(this_author) 
    first_name_aliases.extend(this_first_name) 
    last_name_aliases.extend(this_last_name)
    
        
    return [journal_aliases, date_aliases, year_aliases, volume_aliases, issue_aliases, page_aliases, pages_aliases, 
            url_aliases, title_aliases, ext_id_aliases, author_aliases, first_name_aliases, last_name_aliases]

In [15]:
# Parsing a wikipedia citation data
def parse_citation_data(citation):

    alias_lst = get_aliases(citation)
    journal_aliases = alias_lst[0]
    date_aliases = alias_lst[1]
    year_aliases = alias_lst[2]
    volume_aliases = alias_lst[3]
    issue_aliases = alias_lst[4]
    page_aliases = alias_lst[5]
    pages_aliases = alias_lst[6]
    url_aliases = alias_lst[7]
    title_aliases = alias_lst[8]
    ext_id_aliases = alias_lst[9]
    author_aliases = alias_lst[10]
    first_name_aliases = alias_lst[11]
    last_name_aliases = alias_lst[12]
    
    citation = re.sub('[{}]', '', citation)
    citation_list = citation.split("|")
    
    journal = ""
    volume = ""
    issue = ""
    
    title = ""
    page = ""
    
    url = ""
    external_ids = {}
    
    date = ""
    month_str = ""
    year = 0
    
    authors = []
    first_names = []
    last_names = []
    
    for field in citation_list:
        field = field.strip()
        
        # find journal title
        for j_a in journal_aliases:
            journal_regex = j_a + "(\s{0,})="
            if re.match(re.compile(journal_regex), field):
                journal = field.split("=")[1].strip()
                break
   
        # find journal volume 
        for v_a in volume_aliases:
            volume_regex = v_a + "(\s{0,})="
            if re.match(re.compile(volume_regex), field):
                volume = field.split("=")[1].strip()
                volume = re.sub('[^0-9]+', '', volume)
                break
            
        # find journal issue
        for i_a in issue_aliases:
            issue_regex = i_a + "(\s{0,})="
            if re.match(issue_regex, field):
                issue = field.split("=")[1].strip()
                break
        
        # find journal year
        for y_a in year_aliases:
            year_regex = y_a + "(\s{0,})="
            if re.match(year_regex, field):
                year = field.split("=")[1].strip()
                date = re.sub('[^0-9]+', '', year)
                try:
                    year = int(date)
                except:
                    year = 0
                break
            
        # find journal date
        for d_a in date_aliases:
            date_regex = d_a + "(\s{0,})="
            if re.match(date_regex, field):
                date = field.split("=")[1].strip()

                try:
                    year = int(date)
                    date = str(year)
                except:
                    # use the python library for parsing
                    parsed_date = dateparser.parse(date)
                    if parsed_date != None:
                        if parsed_date.year < 2021 and parsed_date.year > 1800:
                            year = parsed_date.year
                            date = str(year)

                        if parsed_date.month < 10:
                            month = parsed_date.month
                            month_str = "0" + str(month)
                        else:
                            month = parsed_date.month
                            month_str = str(month)

                        if month_str != "":
                            date = date + "-" + month_str 
                break
        
         # find existing url
        for u_a in url_aliases:
            url_regex = u_a + "(\s{0,})="
            if re.match(url_regex, field):
                url = field.split("=")[1].strip()
                break
            
        # find page field 
        for p_a in page_aliases:
            page_regex = p_a + "(\s{0,})="
            if re.match(page_regex, field):
                page = field.split("=")[1].strip()
                if "[" in page:
                    page = ""
                break
                
        # find pages field
        for ps_a in pages_aliases:
            pages_regex = ps_a + "(\s{0,})="
            if re.match(pages_regex, field):
                pages = field.split("=")[1].strip()
                if "[" not in pages:
                    if "-" in pages:
                        page = pages.split("-")[0].strip()
                    elif "–" in pages:
                        page = pages.split("–")[0].strip()
                    else:
                        page = ""
                        
                break
                
        # find page field 
        for t_a in title_aliases:
            title_regex = t_a + "(\s{0,})="
            if re.match(title_regex, field):
                title = field.split("=")[1].strip()
                if "[" in title:
                    title = ""
                break
                
        # find external identifier field 
        for ext_id_a in ext_id_aliases:
            ext_id_regex = ext_id_a + "(\s{0,})="
            if re.match(ext_id_regex, field):
                ext_id = field.split("=")[1].strip()
                external_ids[ext_id_a] = ext_id
                break
                
        # find author field 
        for au_a in author_aliases:
            author_regex = au_a + "(\s{0,})="
            if re.match(author_regex, field):
                authors.append(field.split("=")[1].strip())
                break
        
        # find author first name 
        for f_n in first_name_aliases:
            first_name_regex = re.sub("#", r"[0-9]", f_n)
            if re.match(first_name_regex, field):
                first_names.append(field.split("=")[1].strip())
                break
                
        # find author last name 
        for l_n in last_name_aliases:
            last_name_regex = re.sub("#", r"[0-9]", l_n)
            if re.match(last_name_regex, field):
                last_names.append(field.split("=")[1].strip())
                break
            
    if len(last_names) == len(first_names):
        for i in range(len(last_names)):
            authors.append(first_names[i] + " " + last_names[i])
    else:
        authors.extend(last_names)
            
    return {"journal": journal, "date": date, "year": year, 
            "volume": volume, "issue": issue, 
            "title": title, "author": authors,
            "page": page, 
            "url": url, "external_ids": external_ids}
        

### Function test

In [16]:
test_cite7 = "{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}"
print(test_cite7)

cite_info7 = parse_citation_data(test_cite7)
print(cite_info7)

{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}
{'journal': 'Canadian Journal of Public Health', 'date': '1994', 'year': 1994, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'author': [], 'page': '385', 'url': '', 'external_ids': {'pmid': '7895211'}}


## Basic Filtering 

In [17]:
def citation_contains_relevant_info(cite_info, verbose = False):
    
    # make sure there's no existing url 
    if cite_info['url'] != '':
        if verbose: print("There is already an existing url.")
        return False
    
    # no existing external identifiers for auto urls 
    if cite_info['external_ids']:
        if verbose: print("There is already an existing doi link.")
        return False

    # check citation has all desired info
    if cite_info['journal'] == '':
        if verbose: print("Citation has no journal name. ")
        return False
    if cite_info['volume'] == '':
        if verbose: print("Citation has no volume.")
        return False
    if cite_info['year'] == 0:
        if verbose: print("Citation has no valid year.")
        return False
    if cite_info['page'] == '':
        if verbose: print("Citation has no page.")
        return False
    if cite_info['title'] == '':
        if verbose: print("Citation has no article title.")
        return False
        
    
    return True

In [18]:
print(cite_info7)
print(citation_contains_relevant_info(cite_info7, verbose = True))

{'journal': 'Canadian Journal of Public Health', 'date': '1994', 'year': 1994, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'author': [], 'page': '385', 'url': '', 'external_ids': {'pmid': '7895211'}}
There is already an existing doi link.
False


## Autourl Check

In [20]:
from ipynb.fs.full.Autourl_Verification import autourl_exists

yay


In [21]:
autourl_exists(test_cite7)

True

## Data Normalization

### Journal Name Abbreviations
- https://github.com/JabRef/abbrv.jabref.org/tree/master/journals
- In the journals folder, download all these files 
    - journal_abbreviations_acs.csv
    - journal_abbreviations_ams.csv
    - journal_abbreviations_annee-philologique.csv
    - journal_abbreviations_dainst.csv
    - journal_abbreviations_entrez.csv
    - journal_abbreviations_general.csv
    - journal_abbreviations_geology_physics.csv
    - journal_abbreviations_geology_physics_variations.csv
    - journal_abbreviations_ieee.csv
    - journal_abbreviations_lifescience.csv
    - journal_abbreviations_mathematics.csv
    - journal_abbreviations_mechanical.csv
    - journal_abbreviations_medicus.csv
    - journal_abbreviations_meteorology.csv
    - journal_abbreviations_sociology.csv

In [22]:
import pandas as pd

In [23]:
# temp = pd.read_csv("journal_abbrev/ALL.csv")
data = []
with open("journal_abbrev/ALL.csv") as file:
    lines = file.readlines()
    for curr_line in lines:
        curr_info = {}
        curr_line = curr_line.strip()
        curr_line_lst = curr_line.split(";")
        if len(curr_line_lst) == 2:
            curr_info["full"] = curr_line_lst[0]
            curr_info["abbrev"] = curr_line_lst[1]
        elif len(curr_line_lst) == 1:
            curr_info["full"] = curr_line_lst[0]
            curr_info["abbrev"] = ""
        else:
            curr_info["full"] = ""
            curr_info["full"] = ""
        data.append(curr_info)

In [24]:
data[:5]

[{'full': 'ACS Applied Materials & Interfaces',
  'abbrev': 'ACS Appl. Mater. Interfaces'},
 {'full': 'ACS Applied Nano Materials', 'abbrev': 'ACS Appl. Nano Mater.'},
 {'full': 'ACS Biomaterials Science & Engineering',
  'abbrev': 'ACS Biomater. Sci. Eng.'},
 {'full': 'ACS Catalysis', 'abbrev': 'ACS Catal.'},
 {'full': 'ACS Central Science', 'abbrev': 'ACS Cent. Sci.'}]

In [25]:
abbrev_df = pd.DataFrame(data)
abbrev_df.head()

Unnamed: 0,full,abbrev
0,ACS Applied Materials & Interfaces,ACS Appl. Mater. Interfaces
1,ACS Applied Nano Materials,ACS Appl. Nano Mater.
2,ACS Biomaterials Science & Engineering,ACS Biomater. Sci. Eng.
3,ACS Catalysis,ACS Catal.
4,ACS Central Science,ACS Cent. Sci.


In [26]:
abbrev_df = abbrev_df.drop_duplicates()

In [27]:
# abbrev_df.value_counts()

#### International Organization of standardization
- https://en.wikipedia.org/wiki/ISO_4

In [28]:
# temp = pd.read_csv("journal_abbrev/ALL.csv")
ltwa_data = []
with open("journal_abbrev/ltwa_20210702.csv") as file:
    lines = file.readlines()
    for curr_line in lines:
        curr_info = {}
        curr_line = curr_line.strip()
        curr_line = re.sub("\"", "", curr_line)
        curr_line_lst = curr_line.split(";")
        if len(curr_line_lst) == 3:
            
            curr_info["full"] = curr_line_lst[0]
            if curr_line_lst[1] == "n.a.":
                curr_info["abbrev"] =  ""
            else:
                curr_info["abbrev"] = curr_line_lst[1]
                
            if "," in curr_line_lst[2]:
                curr_info["lang"] = curr_line_lst[2].split(",")
            else:
                curr_info["lang"] = curr_line_lst[2]
        
        ltwa_data.append(curr_info)

In [29]:
ltwa_df = pd.DataFrame(ltwa_data)
print(ltwa_df.shape)
ltwa_df.head()

(56136, 3)


Unnamed: 0,full,abbrev,lang
0,WORDS,ABBREVIATIONS,LANGUAGES
1,'s-Graveland,,dut
2,'s-Gravenhage,,dut
3,'s-Gravenmoer,,dut
4,'s-Heerenberg,,dut


In [30]:
ltwa_df.sample(n=20)

Unnamed: 0,full,abbrev,lang
4794,bedrijfsvoorlichting,bedrijfsvoorlicht.,dut
3584,Augsburger,Augsbg.,ger
29980,mēnešraksts,mēnešr.,lav
25225,Kompetenz,Kompet.,ger
28898,Maintal,,ger
43666,secretar-,secr.,"[fre, eng]"
11902,dorsal,dors.,fre
28631,łykowy,łyk.,pol
49762,toolmaker,toolmak.,eng
23967,Karteiform,,ger


In [31]:
ltwa_df_with_abbrev = ltwa_df[ltwa_df["abbrev"] != ""]
ltwa_df_with_abbrev = ltwa_df_with_abbrev.reset_index(drop = True)
print(ltwa_df_with_abbrev.shape)
ltwa_df_with_abbrev.head()

(37333, 3)


Unnamed: 0,full,abbrev,lang
0,WORDS,ABBREVIATIONS,LANGUAGES
1,-agōgē,-ag.,gre
2,-aineisto,-ain.,fin
3,-Alföld,-Alf.,hun
4,-asema,-as.,fin


In [32]:
ltwa_df_with_abbrev["abbrev"].value_counts()

inf.         21
dir.         18
stud.        15
nov.         14
invest.      14
             ..
spój.         1
champ.        1
paletnol.     1
milieun.      1
antipod.      1
Name: abbrev, Length: 32814, dtype: int64

In [33]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["abbrev"] == "inf."]

Unnamed: 0,full,abbrev,lang
14379,inform,inf.,eng
14380,informa,inf.,spa
14381,informāc-,inf.,"[lav, lit, spa]"
14382,információ,inf.,hun
14383,informador,inf.,spa
14384,informant,inf.,eng
14385,informare,inf.,rum
14386,informasi,inf.,ind
14387,informasjon,inf.,nor
14388,informateur,inf.,fre


In [58]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["full"] == "geograph-"]

Unnamed: 0,full,abbrev,lang
11638,geograph-,geogr.,"[fre, eng]"


In [35]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["lang"] == "eng"]

Unnamed: 0,full,abbrev,lang
20,-book,-b.,eng
21,-borough,-brgh.,eng
23,-bourne,-b.,eng
28,-business,-bus.,eng
29,-craft,-cr.,eng
...,...,...,...
36680,yugoslav-,yugosl.,eng
36970,Zellkultur-,Zellkult.,eng
37028,zetetic,zetet.,eng
37093,zionism,zion.,eng


### Problem: Even within the same language, one abbreviation can come from different full words
### De-abbreviate.ipynb is where I try to tackle this issue 

In [40]:
ltwa_df_with_abbrev["lang"].value_counts()[:10]

ger    6733
eng    4464
hun    2548
rus    2419
dut    2241
fre    2006
spa    1516
gre    1205
pol    1133
swe     994
Name: lang, dtype: int64

In [41]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["lang"] == "tur"]["abbrev"].value_counts()

genç.     2
mesl.      2
kim.       2
rap.       2
hij.       2
          ..
harit.     1
fizyol.    1
Turk.      1
hekim.     1
lise.      1
Name: abbrev, Length: 248, dtype: int64

In [42]:
def normalize_journal_name(journal):
    
    df = abbrev_df[abbrev_df["abbrev"] == journal]
    
    if not df.empty:
        row = df.iloc[0]
        return row["full"]
    
    return journal

# Combine All Preprocessing

In [48]:
def preprocessing_citation(citation, verbose = False):
    
    cite_info = parse_citation_data(citation)
    
    if not citation_contains_relevant_info(cite_info, verbose):
        if verbose: print("This citaiton does not have all relevant info, or contains external identifiers.")
        return "Err1" 
    
    if autourl_exists(citation):
        if verbose: print("There already exists an autourl.")
        return "Err2"
    
    cite_info["journal"] = normalize_journal_name(cite_info["journal"])
    
    return cite_info
    

In [49]:
test_cite7 = "{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}"
print(test_cite7)

test_preprocessed7 = preprocessing_citation(test_cite7, verbose = True)
print(test_preprocessed7)

{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}
There is already an existing doi link.
This citaiton does not have all relevant info, or contains external identifiers.
Err1


# Data

In [50]:
journal_dump = pd.read_json("tr.wikipedia.org.journal.20210621.json.gz", lines = True)
print(journal_dump.shape[0])
journal_dump.head(10)

14960


Unnamed: 0,a,c
0,1 + 2 + 3 + 4 + · · ·,{{Akademik dergi kaynağı\n| soyadı = Lepowsky ...
1,12 Victoria,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...
2,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...
3,141 Likya depremi,{{Akademik dergi kaynağı | url=http://blackmed...
4,141 Likya depremi,{{Akademik dergi kaynağı | url=http://www.nat-...
5,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=Earthquake sou...
6,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=Historical and...
7,1481 Rodos depremi,{{Akademik dergi kaynağı|url=http://jgs.geosci...
8,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=New approaches...
9,1481 Rodos depremi,{{Akademik dergi kaynağı|url=http://hal-insu.a...


In [51]:
journal_dump_100 = journal_dump.sample(n = 100)
journal_dump_100.head()

Unnamed: 0,a,c
2911,Dmanisi 5 kafatası,{{Akademik dergi kaynağı|başlık=A Complete Sku...
10800,Flynn etkisi,{{Akademik dergi kaynağı|başlık=A long-term ri...
3203,Göğüs ağrısı,{{Akademik dergi kaynağı|url=http://jama.jaman...
3083,Fikri Alican,{{Akademik dergi kaynağı|başlık=Ischemic Gangr...
4451,Sera gazları,{{Akademik dergi kaynağı|url=http://adsabs.har...


In [52]:
journal_dump_100["cite_info"] = journal_dump_100['c'].apply(lambda x: preprocessing_citation(x, verbose = False))

In [53]:
journal_dump_100.head()

Unnamed: 0,a,c,cite_info
2911,Dmanisi 5 kafatası,{{Akademik dergi kaynağı|başlık=A Complete Sku...,Err1
10800,Flynn etkisi,{{Akademik dergi kaynağı|başlık=A long-term ri...,Err1
3203,Göğüs ağrısı,{{Akademik dergi kaynağı|url=http://jama.jaman...,Err1
3083,Fikri Alican,{{Akademik dergi kaynağı|başlık=Ischemic Gangr...,Err1
4451,Sera gazları,{{Akademik dergi kaynağı|url=http://adsabs.har...,Err1


In [54]:
journals_failed_basic_filtering = journal_dump_100[journal_dump_100["cite_info"] == "Err1"]
print(journals_failed_basic_filtering.shape)
journals_failed_basic_filtering.head()

(93, 3)


Unnamed: 0,a,c,cite_info
2911,Dmanisi 5 kafatası,{{Akademik dergi kaynağı|başlık=A Complete Sku...,Err1
10800,Flynn etkisi,{{Akademik dergi kaynağı|başlık=A long-term ri...,Err1
3203,Göğüs ağrısı,{{Akademik dergi kaynağı|url=http://jama.jaman...,Err1
3083,Fikri Alican,{{Akademik dergi kaynağı|başlık=Ischemic Gangr...,Err1
4451,Sera gazları,{{Akademik dergi kaynağı|url=http://adsabs.har...,Err1


In [55]:
journals_failed_autourl_check = journal_dump_100[journal_dump_100["cite_info"] == "Err2"]
print(journals_failed_autourl_check.shape)
journals_failed_autourl_check.head()

(2, 3)


Unnamed: 0,a,c,cite_info
7925,Elgaland-Vargaland,{{Akademik dergi kaynağı|başlık=Changing reali...,Err2
10513,Education in Chemistry,{{Akademik dergi kaynağı | başlık = Education ...,Err2


In [56]:
journals_good_cite_info = journal_dump_100[(journal_dump_100["cite_info"] != "Err1")
                                           & (journal_dump_100["cite_info"] != "Err2")]
print(journals_good_cite_info.shape)
journals_good_cite_info

(5, 3)


Unnamed: 0,a,c,cite_info
12612,Aşamalı vergi,{{Akademik dergi kaynağı|url=|başlık=11 Econom...,"{'journal': 'Focus (San Francisco, Calif.)', '..."
6613,Perikardit,{{Akademik dergi kaynağı|başlık=Lectures on th...,{'journal': 'American Medical Times: Being a W...
2799,Carol V. Robinson,{{Akademik dergi kaynağı|başlık=Protein comple...,"{'journal': 'Cell', 'date': '2013', 'year': 20..."
5422,Çinko oksit,"{{Akademik dergi kaynağı|başlık=Synthesis, mic...",{'journal': 'Science and Technology of Advance...
8755,Meluhha,{{Akademik dergi kaynağı|başlık=On the relatio...,"{'journal': 'Studia Orientalia', 'date': '1975..."


In [90]:
# print(journals_good_cite_info.loc[1552, "c"])
# print(journals_good_cite_info.loc[1552, "cite_info"])