# Expanding Abbreviations

## ISO4 
- description: https://en.wikipedia.org/wiki/ISO_4
- tools to make abbreviations
    - https://github.com/adlpr/iso4 (python lib)
    - https://github.com/marcinwrochna/abbrevIso (API)
    - https://abbreviso.toolforge.org/
    
##### Problem: how can we go from abbreviation to full name? 
- recover lost semantic meaning? 
- do machine learning models help?

In [1]:
import numpy as np
import pandas as pd
import re



In [2]:
languages = set()

ltwa_data = []
with open("journal_abbrev/ltwa_20210702.csv") as file:
    lines = file.readlines()
    for curr_line in lines:
        curr_info = {}
        curr_line = curr_line.strip()
        curr_line = re.sub("\"", "", curr_line)
        curr_line_lst = curr_line.split(";")
        if len(curr_line_lst) == 3:
            
            curr_info["full"] = curr_line_lst[0]
            if curr_line_lst[1] == "n.a.":
                curr_info["abbrev"] =  ""
            else:
                curr_info["abbrev"] = curr_line_lst[1]
                
            if "," in curr_line_lst[2]:
                lang_lst = [x.strip() for x in curr_line_lst[2].split(",")]
                curr_info["lang"] = lang_lst
                languages.update(lang_lst)
            else:
                curr_info["lang"] = curr_line_lst[2]
                languages.add(curr_line_lst[2])
        
        ltwa_data.append(curr_info)

In [3]:
ltwa_df = pd.DataFrame(ltwa_data)
print(ltwa_df.shape)
ltwa_df.head()

(56136, 3)


Unnamed: 0,full,abbrev,lang
0,WORDS,ABBREVIATIONS,LANGUAGES
1,'s-Graveland,,dut
2,'s-Gravenhage,,dut
3,'s-Gravenmoer,,dut
4,'s-Heerenberg,,dut


In [4]:
ltwa_df_with_abbrev = ltwa_df[ltwa_df["abbrev"] != ""]
ltwa_df_with_abbrev = ltwa_df_with_abbrev.reset_index(drop = True)
print(ltwa_df_with_abbrev.shape)
ltwa_df_with_abbrev.head()

(37333, 3)


Unnamed: 0,full,abbrev,lang
0,WORDS,ABBREVIATIONS,LANGUAGES
1,-agōgē,-ag.,gre
2,-aineisto,-ain.,fin
3,-Alföld,-Alf.,hun
4,-asema,-as.,fin


In [5]:
list(languages)

['',
 'rum',
 'kur',
 'uzb',
 'ice',
 'bre',
 'cor',
 'baq',
 'mon',
 'LANGUAGES',
 'dut',
 'por',
 'ind',
 'ltz',
 'iku',
 'lat',
 'pro',
 'ger',
 'afr',
 'alb',
 'grc',
 'lit',
 'yid',
 'sla',
 'nor',
 'swe',
 'ukr',
 'geo',
 'wel',
 'mlt',
 'gre',
 'hun',
 'bul',
 'glg',
 'rus',
 'may',
 'mol',
 'mac',
 'tur',
 'fin',
 'dan',
 'eng',
 'kaz',
 'und',
 'slv',
 'crp',
 'fre',
 'srp',
 'mul',
 'roa',
 'gem',
 'slo',
 'arm',
 'lav',
 'cat',
 'bos',
 'bel',
 'spa',
 'tat',
 'yor',
 'est',
 'ita',
 'pol',
 'aze',
 'cze',
 'kir',
 'hrv',
 'cos']

## Work with English only 

In [6]:
iso4_eng = ltwa_df_with_abbrev[(ltwa_df_with_abbrev["lang"] == "eng") | ("eng" in ltwa_df_with_abbrev["lang"])]
iso4_eng = iso4_eng.reset_index(drop = True)
print(iso4_eng.shape)
iso4_eng.head()

(4464, 3)


Unnamed: 0,full,abbrev,lang
0,-book,-b.,eng
1,-borough,-brgh.,eng
2,-bourne,-b.,eng
3,-business,-bus.,eng
4,-craft,-cr.,eng


In [7]:
eng_abbrev_counts = pd.DataFrame(iso4_eng["abbrev"].value_counts())
eng_abbrev_counts = eng_abbrev_counts.rename(columns = {"abbrev":"count"})
print(eng_abbrev_counts.shape)
eng_abbrev_counts.head()

(4111, 1)


Unnamed: 0,count
program.,4
restor.,4
-gr.,4
gov.,4
treat.,4


In [8]:
overlaps = eng_abbrev_counts[eng_abbrev_counts["count"] != 1]
print(overlaps.shape)
overlaps.head()

(291, 1)


Unnamed: 0,count
program.,4
restor.,4
-gr.,4
gov.,4
treat.,4


## Training Data
#### Fully abbreviated data that people compiled
- from https://github.com/JabRef/abbrv.jabref.org/tree/master/journals

In [9]:
data = []
with open("journal_abbrev/ALL.csv") as file:
    lines = file.readlines()
    for curr_line in lines:
        curr_info = {}
        curr_line = curr_line.strip()
        curr_line_lst = curr_line.split(";")
        if len(curr_line_lst) == 2:
            curr_info["full"] = curr_line_lst[0]
            curr_info["abbrev"] = curr_line_lst[1]
        elif len(curr_line_lst) == 1:
            curr_info["full"] = curr_line_lst[0]
            curr_info["abbrev"] = ""
        else:
            curr_info["full"] = ""
            curr_info["full"] = ""
        data.append(curr_info)

In [10]:
abbrev_df = pd.DataFrame(data)
abbrev_df = abbrev_df.fillna("")
abbrev_df = abbrev_df.drop_duplicates()
abbrev_df.head()

Unnamed: 0,full,abbrev
0,ACS Applied Materials & Interfaces,ACS Appl. Mater. Interfaces
1,ACS Applied Nano Materials,ACS Appl. Nano Mater.
2,ACS Biomaterials Science & Engineering,ACS Biomater. Sci. Eng.
3,ACS Catalysis,ACS Catal.
4,ACS Central Science,ACS Cent. Sci.


In [426]:
np.nan

nan

### Validity of Training Data

#### Method 1 
- https://abbreviso.toolforge.org/

In [11]:
import requests

In [12]:
def abbreviate_journal_name(journal):
    url = "https://abbreviso.toolforge.org/a/" + journal
    request = requests.get(url)
    if request.status_code != 200:
        return journal
    
#     print("ABBREV: " + request.text)
    
    return request.text

In [13]:
abbreviate_journal_name("Proceedings of the National Academy of Sciences of the United States of America")

'Proc. Natl. Acad. Sci. U. S. A.'

#### Method #2 
-  https://github.com/adlpr/iso4 (python lib)

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xinyuechen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
from iso4 import abbreviate

print(abbreviate("Recent Advances in Studies on Cardiac Structure and Metabolism"))
# 'Recent Adv. Stud. Card. Struct. Metab.'

print(abbreviate("Journal of the American Academy of Dermatology", periods=False))
# 'J Am Acad Dermatol'

# print(abbreviate("Real Living with Multiple Sclerosis"))
# Traceback (most recent call last):
#   File "<stdin>", line 1, in <module>
#   ...
# Exception: Ambiguous word in title: real; must disambiguate between langs: eng, fre, spa

print(abbreviate("Real Living with Multiple Sclerosis", disambiguation_langs=['eng']))
# 'Real Living Mult. Scler.'

print(abbreviate("Anales de la Real Academia de Farmacia", True, ['spa']))
# 'An. R. Acad. Farm.'


Recent Adv. Stud. Card. Struct. Metab.
J Am Acad Dermatol
Real Living Mult. Scler.
An. R. Acad. Farm.


In [16]:
# from textblob import TextBlob
# b = TextBlob("hello")
# b.detect_language()

In [18]:
# abbreviate_journal_name2("Journal of the American Academy of Dermatology")

#### Run sample of abbreviation

In [19]:
abbrev_df_sample = abbrev_df.sample(n = 200)

In [20]:
abbrev_df_sample["abbrev_exact"] = abbrev_df_sample["full"].apply(abbreviate_journal_name)

In [21]:
abbrev_df_sample.head()

Unnamed: 0,full,abbrev,abbrev_exact
11834,Denver journal of international law and policy,Denver J Int Law Policy,Denver j. int. law policy
22386,Shujutsu. Operation,Shujutsu,"Shujutsu, Oper."
38612,Journal of Nutrition Education and Behavior,J. Nutr. Educ. Behav.,J. Nutr. Educ. Behav.
24435,The Philippine journal of pediatrics,Philipp J Pediatr,Philipp. j. pediatr.
44974,Mathematical Forum,Math. Forum,Math. Forum


In [22]:
abbrev_sample_correct = abbrev_df_sample[abbrev_df_sample["abbrev_exact"] == abbrev_df_sample["abbrev"]]
print(abbrev_sample_correct.shape)
abbrev_sample_correct.head()

(77, 3)


Unnamed: 0,full,abbrev,abbrev_exact
38612,Journal of Nutrition Education and Behavior,J. Nutr. Educ. Behav.,J. Nutr. Educ. Behav.
44974,Mathematical Forum,Math. Forum,Math. Forum
2728,Integrating Materials and Manufacturing Innova...,Integr. Mater. Manuf. Innov.,Integr. Mater. Manuf. Innov.
299,Cellular Microbiology,Cell. Microbiol.,Cell. Microbiol.
358,Chemistry and Technology of Fuels and Oils,Chem. Technol. Fuels Oils,Chem. Technol. Fuels Oils


In [23]:
abbrev_sample_incorrect = abbrev_df_sample[abbrev_df_sample["abbrev_exact"] != abbrev_df_sample["abbrev"]]
print(abbrev_sample_incorrect.shape)
abbrev_sample_incorrect.head(10)

(123, 3)


Unnamed: 0,full,abbrev,abbrev_exact
11834,Denver journal of international law and policy,Denver J Int Law Policy,Denver j. int. law policy
22386,Shujutsu. Operation,Shujutsu,"Shujutsu, Oper."
24435,The Philippine journal of pediatrics,Philipp J Pediatr,Philipp. j. pediatr.
20778,Quaderni della nutrizione,Quad Nutr,Quad. nutr.
7415,Advances in genetics,Adv Genet,Adv. genet.
36277,Diskussionsforum Medizinische Ethik,Diskussionsforum Med. Ethik,Diskuss. Med. Ethik
15711,Journal of chromatographic science,J Chromatogr Sci,J. chromatogr. sci.
23495,The Canadian dental hygienist,Can Dent Hyg,Can. dent. hyg.
9427,Biotechnic & histochemistry : official publica...,Biotech Histochem,Biotech. histochem. : off. publ. Biol. Stain C...
40919,Problemes Actuels d Oto-Rhino-Laryngologie,Probl. Actuels Otorhinolaryngol.,Probl. Actuels d Oto-Rhino-Laryngol.


In [25]:
print(abbrev_sample_incorrect.loc[9427, "full"])
print(abbrev_sample_incorrect.loc[9427, "abbrev"])
print(abbrev_sample_incorrect.loc[9427, "abbrev_exact"])

Biotechnic & histochemistry : official publication of the Biological Stain Commission
Biotech Histochem
Biotech. histochem. : off. publ. Biol. Stain Comm.


*Should everything in brackets be ignored?

In [26]:
print(abbrev_sample_incorrect.loc[40919, "full"])
print(abbrev_sample_incorrect.loc[40919, "abbrev"])
print(abbrev_sample_incorrect.loc[40919, "abbrev_exact"])

Problemes Actuels d Oto-Rhino-Laryngologie
Probl. Actuels Otorhinolaryngol.
Probl. Actuels d Oto-Rhino-Laryngol.


#### The data downloaded (from https://github.com/JabRef/abbrv.jabref.org/tree/master/journals) does not exactly follow the results of iso4 program (at https://abbreviso.toolforge.org/)
* Possibility 1: data is computed by hand and misses some things?
* Possibility 2: the program is not well built?


In [27]:
### Run all of the dataframe, probably timeout
# abbrev_df["abbrev_exact"] = abbrev_df.loc["full"].apply(abbreviate_journal_name)
# abbrev_df.head()

In [28]:
# abbrev_correct = abbrev_df[abbrev_df["abbrev_exact"] == abbrev_df["abbrev"]]
# print(abbrev_correct.shape)
# abbrev_correct.head()

In [29]:
# abbrev_incorrect = abbrev_df[abbrev_df["abbrev_exact"] != abbrev_df["abbrev"]]
# print(abbrev_incorrect.shape)
# abbrev_incorrect.head()

### Stop Words

In [30]:
stopwords = []
with open("stopwords.txt") as file:
    lines = file.readlines()
    for curr_line in lines:
        stopwords.append(re.sub("\n", "", curr_line))

In [31]:
stopwords[:10]

['a',
 'about',
 'afore',
 'after',
 'ago',
 'along',
 'amid',
 'among',
 'amongst',
 'an']

#### Mapping

In [32]:
def split_name(journal_full):
    if journal_full == "" or journal_full == np.nan:
        return ""
    
    journal_new = re.sub(r"[!?&*+=|]", "", journal_full)
    if ":" in journal_new:
        before_colon = journal_new.split(":")[0]
        brackets = ""
        if "(" in journal_new and ")" in journal_new:
            brackets = journal_new[journal_new.find("("):journal_new.find(")")+1]
        journal_new = before_colon + " " + brackets
        
    journal_lst = journal_new.split(" ")
        
    result = []
    for item in journal_lst:
        if item != "":
            if item.lower() not in stopwords:
                result.append(item.strip())

    return result

In [33]:
split_name("ACS Applied Materials & Interfaces")

['ACS', 'Applied', 'Materials', 'Interfaces']

In [34]:
split_name("ACS Appl. Mater. Interfaces")

['ACS', 'Appl.', 'Mater.', 'Interfaces']

In [35]:
split_name("Physica C: Superconductivity and Its Application (Amsterdam, Netherland)")

['Physica', 'C', '(Amsterdam,', 'Netherland)']

In [45]:
abbrev_df["full_lst"] = abbrev_df["full"].apply(split_name)
abbrev_df["abbrev_lst"] = abbrev_df["abbrev"].apply(split_name)
abbrev_df.head()

Unnamed: 0,full,abbrev,full_lst,abbrev_lst,mappings
0,ACS Applied Materials & Interfaces,ACS Appl. Mater. Interfaces,"[ACS, Applied, Materials, Interfaces]","[ACS, Appl., Mater., Interfaces]","[(ACS, ACS), (Applied, Appl.), (Materials, Mat..."
1,ACS Applied Nano Materials,ACS Appl. Nano Mater.,"[ACS, Applied, Nano, Materials]","[ACS, Appl., Nano, Mater.]","[(ACS, ACS), (Applied, Appl.), (Nano, Nano), (..."
2,ACS Biomaterials Science & Engineering,ACS Biomater. Sci. Eng.,"[ACS, Biomaterials, Science, Engineering]","[ACS, Biomater., Sci., Eng.]","[(ACS, ACS), (Biomaterials, Biomater.), (Scien..."
3,ACS Catalysis,ACS Catal.,"[ACS, Catalysis]","[ACS, Catal.]","[(ACS, ACS), (Catalysis, Catal.)]"
4,ACS Central Science,ACS Cent. Sci.,"[ACS, Central, Science]","[ACS, Cent., Sci.]","[(ACS, ACS), (Central, Cent.), (Science, Sci.)]"


In [46]:
abbrev_df.tail()

Unnamed: 0,full,abbrev,full_lst,abbrev_lst,mappings
54469,Theory And Society,Theory Soc.,"[Theory, Society]","[Theory, Soc.]","[(Theory, Theory), (Society, Soc.)]"
54470,Work And Occupations,Work Occup.,"[Work, Occupations]","[Work, Occup.]","[(Work, Work), (Occupations, Occup.)]"
54471,Work Employment And Society,Work Employ. Soc.,"[Work, Employment, Society]","[Work, Employ., Soc.]","[(Work, Work), (Employment, Employ.), (Society..."
54472,Youth & Society,Youth Soc.,"[Youth, Society]","[Youth, Soc.]","[(Youth, Youth), (Society, Soc.)]"
54473,Zeitschrift für Soziologie,Z. Soziol.,"[Zeitschrift, Soziologie]","[Z., Soziol.]","[(Zeitschrift, Z.), (Soziologie, Soziol.)]"


In [47]:
abbrev_df.sample(n = 5)

Unnamed: 0,full,abbrev,full_lst,abbrev_lst,mappings
38256,Journal of Emergency Nursing,J. Emerg. Nurs.,"[Journal, Emergency, Nursing]","[J., Emerg., Nurs.]","[(Journal, J.), (Emergency, Emerg.), (Nursing,..."
45736,Series in International Business and Economics,Ser. Internat. Bus. Econom.,"[Series, International, Business, Economics]","[Ser., Internat., Bus., Econom.]","[(Series, Ser.), (International, Internat.), (..."
23558,The Clinical investigator,Clin Investig,"[Clinical, investigator]","[Clin, Investig]","[(Clinical, Clin), (investigator, Investig)]"
50981,Wein Wissenschaft,Wein Wiss.,"[Wein, Wissenschaft]","[Wein, Wiss.]","[(Wein, Wein), (Wissenschaft, Wiss.)]"
130,Applied Spectroscopy,Appl. Spectrosc.,"[Applied, Spectroscopy]","[Appl., Spectrosc.]","[(Applied, Appl.), (Spectroscopy, Spectrosc.)]"


In [39]:
def matching(full_lst, abbrev_lst):
    
    if len(full_lst) > len(abbrev_lst):
        for elem in full_lst:
            if elem.lower() in stopwords:
                full_lst.remove(elem)
    
    if len(full_lst) == len(abbrev_lst):
        result = []
        for i in range(len(full_lst)):
            result.append((full_lst[i], abbrev_lst[i]))
        return result
#         result = {}
#         for i in range(len(full_lst)):
#             result[full_lst[i]] = abbrev_lst[i]
#         return result
    else:
        return ""

In [40]:
matching(["Acta", "Academiae", "Scientiarum", "Taurinensis"], ["Acta", "Acad.", "Sci.", "Taurinensis"])

[('Acta', 'Acta'),
 ('Academiae', 'Acad.'),
 ('Scientiarum', 'Sci.'),
 ('Taurinensis', 'Taurinensis')]

In [48]:
abbrev_df["mappings"] = abbrev_df.apply(lambda x: matching(x["full_lst"], x["abbrev_lst"]), axis = 1)
abbrev_df.head()

Unnamed: 0,full,abbrev,full_lst,abbrev_lst,mappings
0,ACS Applied Materials & Interfaces,ACS Appl. Mater. Interfaces,"[ACS, Applied, Materials, Interfaces]","[ACS, Appl., Mater., Interfaces]","[(ACS, ACS), (Applied, Appl.), (Materials, Mat..."
1,ACS Applied Nano Materials,ACS Appl. Nano Mater.,"[ACS, Applied, Nano, Materials]","[ACS, Appl., Nano, Mater.]","[(ACS, ACS), (Applied, Appl.), (Nano, Nano), (..."
2,ACS Biomaterials Science & Engineering,ACS Biomater. Sci. Eng.,"[ACS, Biomaterials, Science, Engineering]","[ACS, Biomater., Sci., Eng.]","[(ACS, ACS), (Biomaterials, Biomater.), (Scien..."
3,ACS Catalysis,ACS Catal.,"[ACS, Catalysis]","[ACS, Catal.]","[(ACS, ACS), (Catalysis, Catal.)]"
4,ACS Central Science,ACS Cent. Sci.,"[ACS, Central, Science]","[ACS, Cent., Sci.]","[(ACS, ACS), (Central, Cent.), (Science, Sci.)]"


In [49]:
no_mappings_df = abbrev_df[abbrev_df["mappings"] == ""]
print(no_mappings_df.shape)
no_mappings_df.head(10)

(10338, 5)


Unnamed: 0,full,abbrev,full_lst,abbrev_lst,mappings
772,International Journal of Biomedical Nanoscienc...,Int. J. Biomed. Nanosci. Nanotechnol.,"[International, Journal, Biomedical, Nanoscience]","[Int., J., Biomed., Nanosci., Nanotechnol.]",
775,International Journal of Chemical Engineering and,Int. J. Chem. Eng. Appl.,"[International, Journal, Chemical, Engineering]","[Int., J., Chem., Eng., Appl.]",
920,"Journal of Environmental Engineering (Reston, ...","J. Environ. Eng. (Reston, VA, U.S.)","[Journal, Environmental, Engineering, (Reston,...","[J., Environ., Eng., (Reston,, VA,, U.S.)]",
924,"Journal of Environmental Science and Health, P...",J. Environ. Sci. Health A: Toxic/Hazard. Subst...,"[Journal, Environmental, Science, Health,, Part]","[J., Environ., Sci., Health]",
925,"Journal of Environmental Science and Health, P...",J. Environ. Sci. Health B,"[Journal, Environmental, Science, Health,, Par...","[J., Environ., Sci., Health, B]",
1121,Kemija u Industriji,Kem. Ind.,"[Kemija, u, Industriji]","[Kem., Ind.]",
1124,Khimiya v Interesakh Ustoichivogo Razvitiya,Khim. Interesakh Ustoich. Razvit.,"[Khimiya, v, Interesakh, Ustoichivogo, Razvitiya]","[Khim., Interesakh, Ustoich., Razvit.]",
1263,Monatshefte fuer Chemie,Monatsh. Chem.,"[Monatshefte, fuer, Chemie]","[Monatsh., Chem.]",
1314,"Neues Jahrbuch fuer Mineralogie, Abhandlungen","Neues Jahrb. Mineral., Abh.","[Neues, Jahrbuch, fuer, Mineralogie,, Abhandlu...","[Neues, Jahrb., Mineral.,, Abh.]",
1409,physica status solidi (a),Phys. Status Solidi A,"[physica, status, solidi, (a)]","[Phys., Status, Solidi]",


In [50]:
mappings_df = abbrev_df[abbrev_df["mappings"] != ""]
print(mappings_df.shape)
mappings_df.head()

(33395, 5)


Unnamed: 0,full,abbrev,full_lst,abbrev_lst,mappings
0,ACS Applied Materials & Interfaces,ACS Appl. Mater. Interfaces,"[ACS, Applied, Materials, Interfaces]","[ACS, Appl., Mater., Interfaces]","[(ACS, ACS), (Applied, Appl.), (Materials, Mat..."
1,ACS Applied Nano Materials,ACS Appl. Nano Mater.,"[ACS, Applied, Nano, Materials]","[ACS, Appl., Nano, Mater.]","[(ACS, ACS), (Applied, Appl.), (Nano, Nano), (..."
2,ACS Biomaterials Science & Engineering,ACS Biomater. Sci. Eng.,"[ACS, Biomaterials, Science, Engineering]","[ACS, Biomater., Sci., Eng.]","[(ACS, ACS), (Biomaterials, Biomater.), (Scien..."
3,ACS Catalysis,ACS Catal.,"[ACS, Catalysis]","[ACS, Catal.]","[(ACS, ACS), (Catalysis, Catal.)]"
4,ACS Central Science,ACS Cent. Sci.,"[ACS, Central, Science]","[ACS, Cent., Sci.]","[(ACS, ACS), (Central, Cent.), (Science, Sci.)]"


In [51]:
# mapping_lst = abbrev_df["mappings"].tolist()
# mappings = {}
# for m in mapping_lst:
#     mappings.update(m)
# mappings = pd.DataFrame.from_dict(mappings, orient='index')

In [52]:
mapping_lst = mappings_df["mappings"].tolist()
mappings = []
for m in mapping_lst:
    mappings.extend(m)

In [53]:
dataset = pd.DataFrame(mappings, columns = ["full", "abbrev"])
dataset

Unnamed: 0,full,abbrev
0,ACS,ACS
1,Applied,Appl.
2,Materials,Mater.
3,Interfaces,Interfaces
4,ACS,ACS
...,...,...
106602,Society,Soc.
106603,Youth,Youth
106604,Society,Soc.
106605,Zeitschrift,Z.


In [54]:
# dataset_unique = input_data.drop_duplicates()
# dataset_unique

In [63]:
def clean_word(word):
    word = re.sub("\(", "", word)
    word = re.sub("\)", "", word)
    word = re.sub(",", "", word)
    return word.lower()

In [64]:
dataset["full"] = dataset["full"].apply(clean_word)
dataset["abbrev"] = dataset["abbrev"].apply(clean_word)
dataset.head()

Unnamed: 0,full,abbrev
0,acs,acs
1,applied,appl.
2,materials,mater.
3,interfaces,interfaces
4,acs,acs


In [65]:
dataset_abbrev = dataset[dataset["full"] != dataset["abbrev"]]
dataset_abbrev = dataset_abbrev.reset_index(drop = True)
dataset_abbrev

Unnamed: 0,full,abbrev
0,applied,appl.
1,materials,mater.
2,applied,appl.
3,materials,mater.
4,biomaterials,biomater.
...,...,...
81631,employment,employ.
81632,society,soc.
81633,society,soc.
81634,zeitschrift,z.


In [66]:
def is_substring(full, abbrev):
    abbrev = re.sub(r"\.", "", abbrev)
    return abbrev in full

In [67]:
dataset_abbrev["is_substring"] = dataset_abbrev.apply(lambda x: is_substring(x["full"], x["abbrev"]), axis = 1)
dataset_abbrev.head()

Unnamed: 0,full,abbrev,is_substring
0,applied,appl.,True
1,materials,mater.,True
2,applied,appl.,True
3,materials,mater.,True
4,biomaterials,biomater.,True


In [68]:
dataset_abbrev[dataset_abbrev["is_substring"] == False]

Unnamed: 0,full,abbrev,is_substring
434,iochimica,biochim.,False
565,japan,jpn.,False
670,ceramics-silikaty,ceram.-silik.,False
742,chemico-biological,chem.-biol.,False
778,compounds,compd.,False
...,...,...,...
81388,japan,jpn.,False
81402,national,natl.,False
81480,sociology-cahiers,sociol.-cahiers,False
81494,discourse,niscl.,False


In [69]:
dataset_abbrev["abbrev"].value_counts()

j.                4774
j                 3417
med               1485
sci.              1436
rev               1137
                  ... 
ros.                 1
bronconeumol         1
neumol               1
praxisfuhr.          1
otoophthalmol.       1
Name: abbrev, Length: 6804, dtype: int64

In [70]:
dataset_abbrev_no_sub = dataset_abbrev[dataset_abbrev["is_substring"] == False]["abbrev"]
dataset_abbrev_no_sub

434             biochim.
565                 jpn.
670        ceram.-silik.
742          chem.-biol.
778               compd.
              ...       
81388               jpn.
81402              natl.
81480    sociol.-cahiers
81494             niscl.
81551              koln.
Name: abbrev, Length: 2367, dtype: object

In [71]:
dataset_abbrev_sub = dataset_abbrev[dataset_abbrev["is_substring"] == True]["abbrev"]
dataset_abbrev_sub.value_counts()

j.              4765
j               3413
med             1463
sci.            1427
rev             1137
                ... 
nanoeurosci.       1
aikak.             1
ottalmol.          1
neuss              1
microlith.         1
Name: abbrev, Length: 5944, dtype: int64

### Frequencies

In [72]:
dataset_abbrev_freq = dataset_abbrev.groupby(["abbrev"]).agg(list)
dataset_abbrev_freq = dataset_abbrev_freq.reset_index(drop = False)
dataset_abbrev_freq

Unnamed: 0,abbrev,full,is_substring
0,$k$-monogr.,[$k$-monographs],[True]
1,-,"[–, --]","[False, True]"
2,-geol.,"[-geologie, -geologie]","[True, True]"
3,.r.,"[rendus, rendus]","[True, True]"
4,11,[fisica],[False]
...,...,...,...
6799,ökonom.,[ökonometrie],[True]
6800,österr.,"[österreichischen, österreichische, österreich...","[True, True, True, True, True]"
6801,überbl.,[überblicke],[True]
6802,ül.,[ülikooli],[True]


In [73]:
def calc_count(lst_words):
    result = {}
    for elem in lst_words:
        if elem in result.keys():
            result[elem] += 1
        else:
            result[elem] = 1

    return result

In [74]:
dataset_abbrev_freq["count"] = dataset_abbrev_freq["full"].apply(calc_count)
dataset_abbrev_freq

Unnamed: 0,abbrev,full,is_substring,count
0,$k$-monogr.,[$k$-monographs],[True],{'$k$-monographs': 1}
1,-,"[–, --]","[False, True]","{'–': 1, '--': 1}"
2,-geol.,"[-geologie, -geologie]","[True, True]",{'-geologie': 2}
3,.r.,"[rendus, rendus]","[True, True]",{'rendus': 2}
4,11,[fisica],[False],{'fisica': 1}
...,...,...,...,...
6799,ökonom.,[ökonometrie],[True],{'ökonometrie': 1}
6800,österr.,"[österreichischen, österreichische, österreich...","[True, True, True, True, True]","{'österreichischen': 1, 'österreichische': 3, ..."
6801,überbl.,[überblicke],[True],{'überblicke': 1}
6802,ül.,[ülikooli],[True],{'ülikooli': 1}


In [75]:
def calc_freq(counts):
    total = sum(counts.values())
    result = {}
    for elem in counts.keys():
        result[elem] = counts[elem] / total
    return result

In [76]:
dataset_abbrev_freq["freq"] = dataset_abbrev_freq["count"].apply(calc_freq)
dataset_abbrev_freq

Unnamed: 0,abbrev,full,is_substring,count,freq
0,$k$-monogr.,[$k$-monographs],[True],{'$k$-monographs': 1},{'$k$-monographs': 1.0}
1,-,"[–, --]","[False, True]","{'–': 1, '--': 1}","{'–': 0.5, '--': 0.5}"
2,-geol.,"[-geologie, -geologie]","[True, True]",{'-geologie': 2},{'-geologie': 1.0}
3,.r.,"[rendus, rendus]","[True, True]",{'rendus': 2},{'rendus': 1.0}
4,11,[fisica],[False],{'fisica': 1},{'fisica': 1.0}
...,...,...,...,...,...
6799,ökonom.,[ökonometrie],[True],{'ökonometrie': 1},{'ökonometrie': 1.0}
6800,österr.,"[österreichischen, österreichische, österreich...","[True, True, True, True, True]","{'österreichischen': 1, 'österreichische': 3, ...","{'österreichischen': 0.2, 'österreichische': 0..."
6801,überbl.,[überblicke],[True],{'überblicke': 1},{'überblicke': 1.0}
6802,ül.,[ülikooli],[True],{'ülikooli': 1},{'ülikooli': 1.0}


In [78]:
def most_freq(frequencies):
    v = list(frequencies.values())
    k = list(frequencies.keys())
    return k[v.index(max(v))]

In [79]:
dataset_abbrev_freq["most_freq"] = dataset_abbrev_freq["freq"].apply(most_freq)
dataset_abbrev_freq

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq
0,$k$-monogr.,[$k$-monographs],[True],{'$k$-monographs': 1},{'$k$-monographs': 1.0},$k$-monographs
1,-,"[–, --]","[False, True]","{'–': 1, '--': 1}","{'–': 0.5, '--': 0.5}",–
2,-geol.,"[-geologie, -geologie]","[True, True]",{'-geologie': 2},{'-geologie': 1.0},-geologie
3,.r.,"[rendus, rendus]","[True, True]",{'rendus': 2},{'rendus': 1.0},rendus
4,11,[fisica],[False],{'fisica': 1},{'fisica': 1.0},fisica
...,...,...,...,...,...,...
6799,ökonom.,[ökonometrie],[True],{'ökonometrie': 1},{'ökonometrie': 1.0},ökonometrie
6800,österr.,"[österreichischen, österreichische, österreich...","[True, True, True, True, True]","{'österreichischen': 1, 'österreichische': 3, ...","{'österreichischen': 0.2, 'österreichische': 0...",österreichische
6801,überbl.,[überblicke],[True],{'überblicke': 1},{'überblicke': 1.0},überblicke
6802,ül.,[ülikooli],[True],{'ülikooli': 1},{'ülikooli': 1.0},ülikooli


In [87]:
def iso4_expand(abbrev):
#     abbrev_form = abbrev.lower() + "."
    iso4_abbrev_match_df = ltwa_df[ltwa_df["abbrev"] == abbrev]
    if iso4_abbrev_match_df.empty:
        return "abbrev not in iso4"
    elif iso4_abbrev_match_df.shape[0] == 1:
        return iso4_abbrev_match_df.iloc[0]["full"]
    else:
        return iso4_abbrev_match_df["full"].tolist()

In [88]:
dataset_abbrev_freq["abbrev_expanded"] = dataset_abbrev_freq["abbrev"].apply(iso4_expand)
dataset_abbrev_freq

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
0,$k$-monogr.,[$k$-monographs],[True],{'$k$-monographs': 1},{'$k$-monographs': 1.0},$k$-monographs,abbrev not in iso4
1,-,"[–, --]","[False, True]","{'–': 1, '--': 1}","{'–': 0.5, '--': 0.5}",–,abbrev not in iso4
2,-geol.,"[-geologie, -geologie]","[True, True]",{'-geologie': 2},{'-geologie': 1.0},-geologie,abbrev not in iso4
3,.r.,"[rendus, rendus]","[True, True]",{'rendus': 2},{'rendus': 1.0},rendus,abbrev not in iso4
4,11,[fisica],[False],{'fisica': 1},{'fisica': 1.0},fisica,abbrev not in iso4
...,...,...,...,...,...,...,...
6799,ökonom.,[ökonometrie],[True],{'ökonometrie': 1},{'ökonometrie': 1.0},ökonometrie,abbrev not in iso4
6800,österr.,"[österreichischen, österreichische, österreich...","[True, True, True, True, True]","{'österreichischen': 1, 'österreichische': 3, ...","{'österreichischen': 0.2, 'österreichische': 0...",österreichische,abbrev not in iso4
6801,überbl.,[überblicke],[True],{'überblicke': 1},{'überblicke': 1.0},überblicke,abbrev not in iso4
6802,ül.,[ülikooli],[True],{'ülikooli': 1},{'ülikooli': 1.0},ülikooli,abbrev not in iso4


In [89]:
dataset_abbrev_iso4 = dataset_abbrev_freq[dataset_abbrev_freq["abbrev_expanded"] != "abbrev not in iso4"]
print(dataset_abbrev_iso4.shape)
dataset_abbrev_iso4

(2151, 7)


Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
51,aargau.,[aargauischen],[True],{'aargauischen': 1},{'aargauischen': 1.0},aargauischen,aargauisch-
52,aastaraam.,[aastaraamat.],[True],{'aastaraamat.': 1},{'aastaraamat.': 1.0},aastaraamat.,aastaraam-
54,abdom.,[abdominal],[True],{'abdominal': 1},{'abdominal': 1.0},abdominal,abdominal
60,abnorm.,"[abnormal, abnormal]","[True, True]",{'abnormal': 2},{'abnormal': 1.0},abnormal,abnormal-
63,abr.,[d'abreviations],[True],{'d'abreviations': 1},{'d'abreviations': 1.0},d'abreviations,"[abrég-, abrevia-, abréviation, abridged, ab..."
...,...,...,...,...,...,...,...
6773,znan.,"[znanosti, znanosti, znanosti]","[True, True, True]",{'znanosti': 3},{'znanosti': 1.0},znanosti,"[znanost-, znanstv-]"
6775,zobozdrav.,[zobozdravstveni],[True],{'zobozdravstveni': 1},{'zobozdravstveni': 1.0},zobozdravstveni,zobozdravstven-
6781,zpr.,[zpravy],[True],{'zpravy': 1},{'zpravy': 1.0},zpravy,zpráva
6785,zubn.,[zubni],[True],{'zubni': 1},{'zubni': 1.0},zubni,zubn-


In [90]:
dataset_abbrev_iso4_match = dataset_abbrev_iso4[dataset_abbrev_iso4["most_freq"] == dataset_abbrev_iso4["abbrev_expanded"]]
print(dataset_abbrev_iso4_match.shape)
dataset_abbrev_iso4_match.head()

(210, 7)


Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
54,abdom.,[abdominal],[True],{'abdominal': 1},{'abdominal': 1.0},abdominal,abdominal
64,abras.,[abrasive],[True],{'abrasive': 1},{'abrasive': 1.0},abrasive,abrasive
168,agral.,[agralia],[True],{'agralia': 1},{'agralia': 1.0},agralia,agralia
195,aircr.,"[aircraft, aircraft]","[True, True]",{'aircraft': 2},{'aircraft': 1.0},aircraft,aircraft
225,alerg.,"[alergia, alergia]","[True, True]",{'alergia': 2},{'alergia': 1.0},alergia,alergia


In [91]:
dataset_abbrev_iso4_no_match = dataset_abbrev_iso4[dataset_abbrev_iso4["most_freq"] != dataset_abbrev_iso4["abbrev_expanded"]]
print(dataset_abbrev_iso4_no_match.shape)
dataset_abbrev_iso4_no_match.head()

(1941, 7)


Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
51,aargau.,[aargauischen],[True],{'aargauischen': 1},{'aargauischen': 1.0},aargauischen,aargauisch-
52,aastaraam.,[aastaraamat.],[True],{'aastaraamat.': 1},{'aastaraamat.': 1.0},aastaraamat.,aastaraam-
60,abnorm.,"[abnormal, abnormal]","[True, True]",{'abnormal': 2},{'abnormal': 1.0},abnormal,abnormal-
63,abr.,[d'abreviations],[True],{'d'abreviations': 1},{'d'abreviations': 1.0},d'abreviations,"[abrég-, abrevia-, abréviation, abridged, ab..."
66,abstr.,"[abstract, abstracta, abstracts, abstracts, ab...","[True, True, True, True, True, True, True, Tru...","{'abstract': 5, 'abstracta': 1, 'abstracts': 7}","{'abstract': 0.38461538461538464, 'abstracta':...",abstracts,"[abstracc-, abstract-, abstrak]"


In [92]:
def most_freq_is_iso4_abbrev(most_freq, abbrev_expanded):
    if type(abbrev_expanded) == str:
        if abbrev_expanded in most_freq:
            return True
        if most_freq in abbrev_expanded:
            return True
        if "-" in abbrev_expanded:
            return abbrev_expanded.split("-")[0] in most_freq
        return False
    elif type(abbrev_expanded) == list:
        for elem in abbrev_expanded:
            if elem in most_freq:
                return True
            if most_freq in elem:
                return True
            if "-" in elem:
                if elem.split("-")[0] in most_freq:
                    return True
        return False
    else:
        return False

In [93]:
dataset_abbrev_iso4_no_match["most_freq_is_iso4_abbrev"] = dataset_abbrev_iso4_no_match.apply(lambda x:most_freq_is_iso4_abbrev(x["most_freq"],
                                                                        x["abbrev_expanded"]), axis = 1)
dataset_abbrev_iso4_no_match.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded,most_freq_is_iso4_abbrev
51,aargau.,[aargauischen],[True],{'aargauischen': 1},{'aargauischen': 1.0},aargauischen,aargauisch-,True
52,aastaraam.,[aastaraamat.],[True],{'aastaraamat.': 1},{'aastaraamat.': 1.0},aastaraamat.,aastaraam-,True
60,abnorm.,"[abnormal, abnormal]","[True, True]",{'abnormal': 2},{'abnormal': 1.0},abnormal,abnormal-,True
63,abr.,[d'abreviations],[True],{'d'abreviations': 1},{'d'abreviations': 1.0},d'abreviations,"[abrég-, abrevia-, abréviation, abridged, ab...",True
66,abstr.,"[abstract, abstracta, abstracts, abstracts, ab...","[True, True, True, True, True, True, True, Tru...","{'abstract': 5, 'abstracta': 1, 'abstracts': 7}","{'abstract': 0.38461538461538464, 'abstracta':...",abstracts,"[abstracc-, abstract-, abstrak]",True


In [94]:
dataset_abbrev_mf_iso4 = dataset_abbrev_iso4_no_match[dataset_abbrev_iso4_no_match["most_freq_is_iso4_abbrev"] == True]
print(dataset_abbrev_mf_iso4.shape)
dataset_abbrev_mf_iso4.head()

(1680, 8)


Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded,most_freq_is_iso4_abbrev
51,aargau.,[aargauischen],[True],{'aargauischen': 1},{'aargauischen': 1.0},aargauischen,aargauisch-,True
52,aastaraam.,[aastaraamat.],[True],{'aastaraamat.': 1},{'aastaraamat.': 1.0},aastaraamat.,aastaraam-,True
60,abnorm.,"[abnormal, abnormal]","[True, True]",{'abnormal': 2},{'abnormal': 1.0},abnormal,abnormal-,True
63,abr.,[d'abreviations],[True],{'d'abreviations': 1},{'d'abreviations': 1.0},d'abreviations,"[abrég-, abrevia-, abréviation, abridged, ab...",True
66,abstr.,"[abstract, abstracta, abstracts, abstracts, ab...","[True, True, True, True, True, True, True, Tru...","{'abstract': 5, 'abstracta': 1, 'abstracts': 7}","{'abstract': 0.38461538461538464, 'abstracta':...",abstracts,"[abstracc-, abstract-, abstrak]",True


In [95]:
dataset_abbrev_iso4_no_match[dataset_abbrev_iso4_no_match["most_freq_is_iso4_abbrev"] == False]

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded,most_freq_is_iso4_abbrev
87,accredit.,"[accreditation, accreditation, accreditation]","[True, True, True]",{'accreditation': 3},{'accreditation': 1.0},accreditation,accredited,False
113,add.,[additive],[True],{'additive': 1},{'additive': 1.0},additive,addendum,False
161,afr.,"[african, africaine, african, african, african...","[True, True, True, True, True, True, True, Tru...","{'african': 40, 'africaine': 2, 'd'afrique': 1...","{'african': 0.8, 'africaine': 0.04, 'd'afrique...",african,afrikanistisch,False
167,agr.,[agricultural],[True],{'agricultural': 1},{'agricultural': 1.0},agricultural,agrotikos,False
181,agropecu.,[agropecuária],[True],{'agropecuária': 1},{'agropecuária': 1.0},agropecuária,agropecuari-,False
...,...,...,...,...,...,...,...,...
6640,wewn.,[wewnetrznej],[True],{'wewnetrznej': 1},{'wewnetrznej': 1.0},wewnetrznej,wewne̢trzn-,False
6642,wiad.,"[wiadomosci, wiadomosci, wiadomosci]","[True, True, True]",{'wiadomosci': 3},{'wiadomosci': 1.0},wiadomosci,wiadomośc-,False
6690,wroc.,[wroclawskiej],[True],{'wroclawskiej': 1},{'wroclawskiej': 1.0},wroclawskiej,"[wrocław, wrocławsk-]",False
6697,wydz.,[wydzialu],[True],{'wydzialu': 1},{'wydzialu': 1.0},wydzialu,wydział-,False


In [96]:
def abbrev_expanded_is_one_of_full(freq, abbrev_expanded):
    if type(abbrev_expanded) == list:
        res = {}
        for ab_e in abbrev_expanded:
            if ab_e in list(freq.keys()):
                res[ab_e] = freq[ab_e]
        if res:
            return res
        return False
    else:
        if abbrev_expanded in list(freq.keys()):
            return True
        return False

In [97]:
dataset_abbrev_iso4_no_match["abbrev_expanded_is_one_of_full"] = dataset_abbrev_iso4_no_match.apply(lambda x:abbrev_expanded_is_one_of_full(x["freq"],
                                                                        x["abbrev_expanded"]), axis = 1)
dataset_abbrev_iso4_no_match.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded,most_freq_is_iso4_abbrev,abbrev_expanded_is_one_of_full
51,aargau.,[aargauischen],[True],{'aargauischen': 1},{'aargauischen': 1.0},aargauischen,aargauisch-,True,False
52,aastaraam.,[aastaraamat.],[True],{'aastaraamat.': 1},{'aastaraamat.': 1.0},aastaraamat.,aastaraam-,True,False
60,abnorm.,"[abnormal, abnormal]","[True, True]",{'abnormal': 2},{'abnormal': 1.0},abnormal,abnormal-,True,False
63,abr.,[d'abreviations],[True],{'d'abreviations': 1},{'d'abreviations': 1.0},d'abreviations,"[abrég-, abrevia-, abréviation, abridged, ab...",True,False
66,abstr.,"[abstract, abstracta, abstracts, abstracts, ab...","[True, True, True, True, True, True, True, Tru...","{'abstract': 5, 'abstracta': 1, 'abstracts': 7}","{'abstract': 0.38461538461538464, 'abstracta':...",abstracts,"[abstracc-, abstract-, abstrak]",True,False


In [98]:
dataset_abbrev_iso4_no_match[dataset_abbrev_iso4_no_match["abbrev_expanded_is_one_of_full"] == True].head()

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded,most_freq_is_iso4_abbrev,abbrev_expanded_is_one_of_full
77,acc.,"[accounts, accounts, account, accounts]","[True, True, True, True]","{'accounts': 3, 'account': 1}","{'accounts': 0.75, 'account': 0.25}",accounts,account,True,True
805,bioprocess.,"[bioprocess, bioprocessing]","[True, True]","{'bioprocess': 1, 'bioprocessing': 1}","{'bioprocess': 0.5, 'bioprocessing': 0.5}",bioprocess,bioprocessing,True,True
1952,equilib.,"[equilibria, equilibria, equilibria, equilibri...","[True, True, True, True, True, True]","{'equilibria': 3, 'equilibrium': 3}","{'equilibria': 0.5, 'equilibrium': 0.5}",equilibria,equilibrium,False,True
2932,inequal.,"[inequalities, inequalities, inequality, inequ...","[True, True, True, True, True, True, True]","{'inequalities': 6, 'inequality': 1}","{'inequalities': 0.8571428571428571, 'inequali...",inequalities,inequality,False,True
3684,mechatron.,"[mechatronics, mechatronics, mechatronic, mech...","[True, True, True, True]","{'mechatronics': 3, 'mechatronic': 1}","{'mechatronics': 0.75, 'mechatronic': 0.25}",mechatronics,mechatronic,True,True


In [99]:
dataset_abbrev_iso4_no_match[dataset_abbrev_iso4_no_match["abbrev_expanded_is_one_of_full"] == False].head()

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded,most_freq_is_iso4_abbrev,abbrev_expanded_is_one_of_full
51,aargau.,[aargauischen],[True],{'aargauischen': 1},{'aargauischen': 1.0},aargauischen,aargauisch-,True,False
52,aastaraam.,[aastaraamat.],[True],{'aastaraamat.': 1},{'aastaraamat.': 1.0},aastaraamat.,aastaraam-,True,False
60,abnorm.,"[abnormal, abnormal]","[True, True]",{'abnormal': 2},{'abnormal': 1.0},abnormal,abnormal-,True,False
63,abr.,[d'abreviations],[True],{'d'abreviations': 1},{'d'abreviations': 1.0},d'abreviations,"[abrég-, abrevia-, abréviation, abridged, ab...",True,False
66,abstr.,"[abstract, abstracta, abstracts, abstracts, ab...","[True, True, True, True, True, True, True, Tru...","{'abstract': 5, 'abstracta': 1, 'abstracts': 7}","{'abstract': 0.38461538461538464, 'abstracta':...",abstracts,"[abstracc-, abstract-, abstrak]",True,False


In [100]:
dataset_abbrev_iso4_no_match[(dataset_abbrev_iso4_no_match["abbrev_expanded_is_one_of_full"] != False) &
                           (dataset_abbrev_iso4_no_match["abbrev_expanded_is_one_of_full"] != True)].head()

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded,most_freq_is_iso4_abbrev,abbrev_expanded_is_one_of_full
85,account.,"[accounting, accounting, accounting]","[True, True, True]",{'accounting': 3},{'accounting': 1.0},accounting,"[accountability, accountancy, accountant, acco...",True,{'accounting': 1.0}
110,acupunct.,[acupuncture],[True],{'acupuncture': 1},{'acupuncture': 1.0},acupuncture,"[acupuncteur, acupuncture]",True,{'acupuncture': 1.0}
150,aerosp.,"[aerospace, aerospace, aerospace, aerospace, a...","[True, True, True, True, True, True, True, Tru...",{'aerospace': 14},{'aerospace': 1.0},aerospace,"[aerospace, aerospatial-]",True,{'aerospace': 1.0}
177,agrofor.,[agroforestry],[True],{'agroforestry': 1},{'agroforestry': 1.0},agroforestry,"[agroforestal, agroforestry]",True,{'agroforestry': 1.0}
253,alp.,"[alpine, alpine]","[True, True]",{'alpine': 2},{'alpine': 1.0},alpine,"[alpesi, alpin, alpina, alpine, alpini-]",True,{'alpine': 1.0}


In [101]:
dataset_abbrev_not_iso4 = dataset_abbrev_freq[dataset_abbrev_freq["abbrev_expanded"] == "abbrev not in iso4"]
print(dataset_abbrev_not_iso4.shape)
dataset_abbrev_not_iso4

(4653, 7)


Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
0,$k$-monogr.,[$k$-monographs],[True],{'$k$-monographs': 1},{'$k$-monographs': 1.0},$k$-monographs,abbrev not in iso4
1,-,"[–, --]","[False, True]","{'–': 1, '--': 1}","{'–': 0.5, '--': 0.5}",–,abbrev not in iso4
2,-geol.,"[-geologie, -geologie]","[True, True]",{'-geologie': 2},{'-geologie': 1.0},-geologie,abbrev not in iso4
3,.r.,"[rendus, rendus]","[True, True]",{'rendus': 2},{'rendus': 1.0},rendus,abbrev not in iso4
4,11,[fisica],[False],{'fisica': 1},{'fisica': 1.0},fisica,abbrev not in iso4
...,...,...,...,...,...,...,...
6799,ökonom.,[ökonometrie],[True],{'ökonometrie': 1},{'ökonometrie': 1.0},ökonometrie,abbrev not in iso4
6800,österr.,"[österreichischen, österreichische, österreich...","[True, True, True, True, True]","{'österreichischen': 1, 'österreichische': 3, ...","{'österreichischen': 0.2, 'österreichische': 0...",österreichische,abbrev not in iso4
6801,überbl.,[überblicke],[True],{'überblicke': 1},{'überblicke': 1.0},überblicke,abbrev not in iso4
6802,ül.,[ülikooli],[True],{'ülikooli': 1},{'ülikooli': 1.0},ülikooli,abbrev not in iso4


## Understanding Abbreviation properties 

#### Are all abbreviations substrings of full name?

In [102]:
def is_substring(full, abbrev):
    abbrev = re.sub("\.", "", abbrev)
    if abbrev in full:
        return True
    else:
        return False

In [103]:
is_substring("WORDS", "ABBREVIATION")

False

In [104]:
ltwa_df_with_abbrev["is_substring"] = ltwa_df_with_abbrev.apply(lambda x: 
                                                                is_substring(x["full"], x["abbrev"]), axis = 1)
print(ltwa_df_with_abbrev.shape)
ltwa_df_with_abbrev.head()

(37333, 4)


Unnamed: 0,full,abbrev,lang,is_substring
0,WORDS,ABBREVIATIONS,LANGUAGES,False
1,-agōgē,-ag.,gre,True
2,-aineisto,-ain.,fin,True
3,-Alföld,-Alf.,hun,True
4,-asema,-as.,fin,True


In [105]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["is_substring"] == True]

Unnamed: 0,full,abbrev,lang,is_substring
1,-agōgē,-ag.,gre,True
2,-aineisto,-ain.,fin,True
3,-Alföld,-Alf.,hun,True
4,-asema,-as.,fin,True
5,-baden,-bad.,ger,True
...,...,...,...,...
37328,żywienie,żyw.,pol,True
37329,żywnośc-,żywn.,pol,True
37330,ģeogrāfiski,ģeogr.,lav,True
37331,ģeoloģ-,ģeol.,lav,True


In [106]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["is_substring"] == False]

Unnamed: 0,full,abbrev,lang,is_substring
0,WORDS,ABBREVIATIONS,LANGUAGES,False
6,-band (book),-bd.,ger,False
21,-borough,-brgh.,eng,False
22,-bourg,-bg.,fre,False
26,-burg,-bg.,mul,False
...,...,...,...,...
37206,zsebszámológép,zsebszámgép.,hun,False
37208,zsebútikönyv,zsebútikv.,hun,False
37266,zusammen,zs.,ger,False
37267,zusammenfass-,zs.fass.,ger,False


*No! Not all abbreviations are substrings of the full text so cannot use it as a filter.

#### Are all abbreviations supposed to end with "."?

In [107]:
ltwa_df_with_abbrev["end_with_period"] = ltwa_df_with_abbrev["abbrev"].apply(lambda x: "." in x)
print(ltwa_df_with_abbrev.shape)
ltwa_df_with_abbrev.head()

(37333, 5)


Unnamed: 0,full,abbrev,lang,is_substring,end_with_period
0,WORDS,ABBREVIATIONS,LANGUAGES,False,False
1,-agōgē,-ag.,gre,True,True
2,-aineisto,-ain.,fin,True,True
3,-Alföld,-Alf.,hun,True,True
4,-asema,-as.,fin,True,True


In [110]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["end_with_period"] == True].head()

Unnamed: 0,full,abbrev,lang,is_substring,end_with_period
1,-agōgē,-ag.,gre,True,True
2,-aineisto,-ain.,fin,True,True
3,-Alföld,-Alf.,hun,True,True
4,-asema,-as.,fin,True,True
5,-baden,-bad.,ger,True,True


In [111]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["end_with_period"] == False].head()

Unnamed: 0,full,abbrev,lang,is_substring,end_with_period
0,WORDS,ABBREVIATIONS,LANGUAGES,False,False
3332,bendrij-,bendrij-,lit,True,False
4315,Boserup,Boserup,dan,True,False
7484,Dollard des Ormeaux,Dollard Ormeaux,fre,False,False
9273,esquina-,esquina,spa,True,False


*No! Not All abbreviations end with ".". In fact, some doens't have endings and some have "-", which means that they are root-like structures of worlds

## Language Detection
 - TextBlob (NLTK package, uses Google)


In [112]:
from textblob import TextBlob
b = TextBlob("bonjour")
b.detect_language()

'fr'

In [113]:
b = TextBlob("ACS Appl. Mater. Interfaces")
b.detect_language()

'en'

In [114]:
b = TextBlob("hola")
b.detect_language()

'es'

https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes

In [115]:
language_code_2d_3d = {"en":"eng", "tr":"tur", "fr": "fre" , "sv":"swe", "ru":"rus", "et":"est", "no":"nor", 
                       "lv":"lat", "cs":"cze", "pt":"por", "mt":"mlt", "sq":"alb", "es":"spa", "de":"ger", 
                       "uk":"ukr", "is":"ice", "ka":"geo", "cy":"wel", "pl": "pol", "id":"ind", "da":"dan", 
                       "az":"aze", "bg":"bul", "ca":"cat", "nl":"dut", "af":"afr", "eu":"baq", "yi":"yid", 
                       "it":"ita", "hu":"hun", "fi":"fin", "sl":"slv", "sk":"slo", "ms":"may", "mk":"mac", "lt":"lit"}

In [116]:
def find_language(s):
    b = TextBlob(s)
    lang = b.detect_language()
    if lang in list(language_code_2d_3d.keys()):
        return language_code_2d_3d[lang]
    return ""

In [117]:
# def abbreviate_journal_name2(journal):
    
#     lang = find_language(journal)
#     return abbreviate(journal, ["eng", lang])

In [118]:
def find_possible_word_expansions(abbrev, lang):
    if "." in abbrev:
        df = ltwa_df_with_abbrev[(ltwa_df_with_abbrev["lang"] == lang) & (ltwa_df_with_abbrev["abbrev"] == abbrev)]
        if df.empty:
            return abbrev
        elif df.shape[0] == 1:
            return df.iloc[0]["full"]
        else:
            return df["full"].tolist()
    else:
        return abbrev

In [119]:
def find_possible_phrase_expansions(phrase):
    lang = find_language(phrase)
    print(lang)
    
    words = phrase.split()
    result = [""]
    for word in words:
        
        word = word.lower()
        curr_expan = find_possible_word_expansions(word, lang)
        print(curr_expan) 
        if type(curr_expan) == list:
            result_copy = result.copy()
            result = []
            for c_e in curr_expan:
                for res in result_copy:
                    result.append(res + " " + c_e)
        elif type(curr_expan) == str:
            result_copy = result.copy()
            result = []
            for res in result_copy:
                result.append(res.strip() + " " + curr_expan)
                
#         print(result)
    return result

In [120]:
find_possible_phrase_expansions("ACS Catal.")

cat
acs
catal.


['acs catal.']

In [121]:
find_possible_phrase_expansions("ACS Appl. Mater. Interfaces")

eng
acs
['appliance', 'applied', 'applying']
['materials', 'materiel']
interfaces


['acs appliance materials interfaces',
 'acs applied materials interfaces',
 'acs applying materials interfaces',
 'acs appliance materiel interfaces',
 'acs applied materiel interfaces',
 'acs applying materiel interfaces']

In [122]:
find_possible_phrase_expansions("Proc. Natl. Acad. Sci. U. S. A.")

eng
proceedings
natl.
acad.
sci.
u.
s.
a.


['proceedings natl. acad. sci. u. s. a.']

In [123]:
ltwa_df[ltwa_df["abbrev"] == "natl."]

Unnamed: 0,full,abbrev,lang
32315,nationaal,natl.,dut
32316,nat̡ional-,natl.,"[rum, fre, eng]"
32317,nationaliteit,natl.,dut


In [124]:
dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == "proc."]

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
5037,proc.,"[proceedings, proceedings, proceedings, procee...","[True, True, True, True, True, True, True, Tru...","{'proceedings': 182, '""proceedings': 1, 'proce...","{'proceedings': 0.9479166666666666, '""proceedi...",proceedings,proceedings


In [125]:
dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == "natl"]

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
4076,natl,"[national, nationale, national, nationale, fur...","[False, False, False, False, False, False, Fal...","{'national': 56, 'nationale': 3, 'fur': 1, 'na...","{'national': 0.9180327868852459, 'nationale': ...",national,abbrev not in iso4


In [126]:
dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == "acad."]

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
72,acad.,"[academy, academy, academia, academy, academy,...","[True, True, True, True, True, True, True, Tru...","{'academy': 82, 'academia': 8, 'l'académie': 5...","{'academy': 0.5774647887323944, 'academia': 0....",academy,academ-


In [127]:
dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == "sci."]

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
5599,sci.,"[science, science, science, science, science, ...","[True, True, True, True, True, True, True, Tru...","{'science': 861, 'sciences': 415, 'scientific'...","{'science': 0.5995821727019499, 'sciences': 0....",science,"[scienc-, scient-, scienz-]"


In [128]:
dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == "u."]

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
6343,u.,"[united, united, united, united, united, unite...","[True, True, True, True, True, True, True, Tru...",{'united': 77},{'united': 1.0},united,abbrev not in iso4


In [129]:
dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == "s."]

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
5510,s.,"[states, states, states, states, states, state...","[True, True, True, True, True, True, True, Tru...","{'states': 50, 'south': 25, 'scotia': 1, 'são'...","{'states': 0.6172839506172839, 'south': 0.3086...",states,abbrev not in iso4


In [130]:
dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == "a."]

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
48,a.,"[america, -, part]","[True, False, True]","{'america': 1, '-': 1, 'part': 1}","{'america': 0.3333333333333333, '-': 0.3333333...",america,abbrev not in iso4


## N-grams training

### Example

In [131]:
import nltk
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/xinyuechen/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [132]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# Create a placeholder for model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1

for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [133]:
dict(model["today", "the"])

{'public': 0.05555555555555555,
 'European': 0.05555555555555555,
 'Bank': 0.05555555555555555,
 'price': 0.1111111111111111,
 'emirate': 0.05555555555555555,
 'overseas': 0.05555555555555555,
 'newspaper': 0.05555555555555555,
 'company': 0.16666666666666666,
 'Turkish': 0.05555555555555555,
 'increase': 0.05555555555555555,
 'options': 0.05555555555555555,
 'Higher': 0.05555555555555555,
 'pound': 0.05555555555555555,
 'Italian': 0.05555555555555555,
 'time': 0.05555555555555555}

*Relies heavily on training data

### Play with ngram frequencies of Full Titles

In [213]:
def clean_word(word):
    word = re.sub("\(", "", word)
    word = re.sub("\)", "", word)
    word = re.sub(",", "", word)
    return word.lower()

In [214]:
def get_journal_lst(journal_full):
    if journal_full == "" or journal_full == np.nan:
        return ""
    
    journal_new = re.sub(r"[!?&*+=|,]", "", journal_full)
    if ":" in journal_new:
        before_colon = journal_new.split(":")[0]
        brackets = ""
        if "(" in journal_new and ")" in journal_new:
            brackets = journal_new[journal_new.find("("):journal_new.find(")")+1]
        journal_new = before_colon + " " + brackets
        
    journal_lst = journal_new.split(" ")
        
    result = []
    for item in journal_lst:
        if item != "":
            result.append(clean_word(item.strip()))

    return result

In [215]:
abbrev_df.head()

Unnamed: 0,full,abbrev,full_lst,abbrev_lst,mappings
0,ACS Applied Materials & Interfaces,ACS Appl. Mater. Interfaces,"[ACS, Applied, Materials, Interfaces]","[ACS, Appl., Mater., Interfaces]","[(ACS, ACS), (Applied, Appl.), (Materials, Mat..."
1,ACS Applied Nano Materials,ACS Appl. Nano Mater.,"[ACS, Applied, Nano, Materials]","[ACS, Appl., Nano, Mater.]","[(ACS, ACS), (Applied, Appl.), (Nano, Nano), (..."
2,ACS Biomaterials Science & Engineering,ACS Biomater. Sci. Eng.,"[ACS, Biomaterials, Science, Engineering]","[ACS, Biomater., Sci., Eng.]","[(ACS, ACS), (Biomaterials, Biomater.), (Scien..."
3,ACS Catalysis,ACS Catal.,"[ACS, Catalysis]","[ACS, Catal.]","[(ACS, ACS), (Catalysis, Catal.)]"
4,ACS Central Science,ACS Cent. Sci.,"[ACS, Central, Science]","[ACS, Cent., Sci.]","[(ACS, ACS), (Central, Cent.), (Science, Sci.)]"


In [216]:
# Training data need to be list of lists
ngram_train = abbrev_df["full"].apply(get_journal_lst).tolist()
ngram_train[10:20]

[['acs', 'macro', 'letters'],
 ['acs', 'medicinal', 'chemistry', 'letters'],
 ['acs', 'nano'],
 ['acs', 'photonics'],
 ['acs', 'sensors'],
 ['acs', 'sustainable', 'chemistry', 'engineering'],
 ['acs', 'symposium', 'series'],
 ['acs', 'synthetic', 'biology'],
 ['aiche', 'journal'],
 ['arkivoc', 'gainesville', 'fl', 'united', 'states']]

In [217]:
# Create a placeholder for model
model_bi = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for sentence in ngram_train:
    for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
#         print(w1, w2)
        model_bi[w1][w2] += 1

for w1 in model_bi:
    total_count = float(sum(model_bi[w1].values()))
#     if total_count != 0:
#     print(w1, total_count)
    for w2 in model_bi[w1]:
        model_bi[w1][w2] /= total_count
#     else:
#         model_bi[w1][w2] = 0

In [218]:
dict(model_bi["computer"])

{'physics': 0.014705882352941176,
 'chemistry': 0.014705882352941176,
 'engineering': 0.03431372549019608,
 'science': 0.37745098039215685,
 'aided': 0.024509803921568627,
 'methods': 0.03431372549019608,
 'modeling': 0.004901960784313725,
 'graphics': 0.058823529411764705,
 'mathematics': 0.014705882352941176,
 'scientists': 0.004901960784313725,
 'and': 0.0392156862745098,
 'sciences': 0.024509803921568627,
 'applications': 0.03431372549019608,
 'modelling': 0.024509803921568627,
 'programming': 0.004901960784313725,
 'technology': 0.00980392156862745,
 'programs': 0.00980392156862745,
 'supported': 0.00980392156862745,
 'vision': 0.0392156862745098,
 'society': 0.0196078431372549,
 'assisted': 0.0196078431372549,
 'application': 0.004901960784313725,
 'review': 0.004901960784313725,
 'information': 0.004901960784313725,
 None: 0.014705882352941176,
 'architecture': 0.004901960784313725,
 'group': 0.004901960784313725,
 'algebra': 0.00980392156862745,
 'simulation': 0.009803921568627

In [219]:
# Create a placeholder for model
model_tri = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for sentence in ngram_train:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model_tri[(w1, w2)][w3] += 1

for w1_w2 in model_tri:
    total_count = float(sum(model_tri[w1_w2].values()))
    if total_count != 0:
        for w3 in model_tri[w1_w2]:
            model_tri[w1_w2][w3] /= total_count
    else:
        model_tri[w1_w2][w3] = 0

In [221]:
dict(model_tri["of", "human"])

{'genetics': 0.2558139534883721,
 'biology': 0.06976744186046512,
 'resources': 0.06976744186046512,
 'behavior': 0.023255813953488372,
 'ecology': 0.023255813953488372,
 'ergology': 0.046511627906976744,
 'evolution': 0.046511627906976744,
 'hypertension': 0.046511627906976744,
 'lactation': 0.046511627906976744,
 'nutrition': 0.06976744186046512,
 'stress': 0.046511627906976744,
 'virology': 0.046511627906976744,
 'sexuality': 0.06976744186046512,
 'diseases': 0.046511627906976744,
 'services': 0.046511627906976744,
 'rights': 0.023255813953488372,
 'development.': 0.023255813953488372}

## Wholistic Model Idea 
- given abbreviation, separate into words 
- directly translate what we can into full words 
- use mappings, check that mappings are substrings or almost half substrings
- use n-gram models to predict words that are still abbreviations

In [274]:
def clean_word(word):
    word = re.sub("\(", "", word)
    word = re.sub("\)", "", word)
    word = re.sub(",", "", word)
    return word

In [367]:
def get_journal_lst(journal):
    if journal == "" or journal == np.nan:
        return ""
    
    journal_new = re.sub(r"[!?&*+=|,]", "", journal)
    if ":" in journal_new and journal_new.find(":") < journal_new.find("("):
        before_colon = journal_new.split(":")[0]
        brackets = ""
        if "(" in journal_new and ")" in journal_new:
            brackets = journal_new[journal_new.find("("):journal_new.find(")")+1]
        journal_new = before_colon + " " + brackets
    
    journal_lst = journal_new.split(" ")
    print(journal_lst)
    result = []
    for item in journal_lst:
        if item != "":
            result.append(clean_word(item.strip()))

    return result

In [368]:
get_journal_lst("Anat. pathol. (Chic. Ill, : annu.)")

['Anat.', 'pathol.', '(Chic.', 'Ill', ':', 'annu.)']


['Anat.', 'pathol.', 'Chic.', 'Ill', ':', 'annu.']

In [225]:
def get_language(s):
    b = TextBlob(s)
    lang = b.detect_language()
    if lang in list(language_code_2d_3d.keys()):
        return language_code_2d_3d[lang]
    return ""

In [226]:
def languages_contain_lang(languages, lang):
    if type(languages) == str:
        if languages == "mul": # multiple
            return True
        if lang == languages:
            return True
    elif type(languages) == list:
        return lang in languages
    return False

def get_iso4_df_for_lang(lang):
    bools = ltwa_df_with_abbrev.apply(lambda x: languages_contain_lang(x["lang"], lang), axis = 1)
    return ltwa_df_with_abbrev[bools]

In [227]:
get_iso4_df_for_lang("eng")

Unnamed: 0,full,abbrev,lang,is_substring,end_with_period
20,-book,-b.,eng,True,True
21,-borough,-brgh.,eng,False,True
23,-bourne,-b.,eng,True,True
26,-burg,-bg.,mul,False,True
28,-business,-bus.,eng,True,True
...,...,...,...,...,...
37097,zirconium,zircon.,mul,True,True
37135,znač-,znač.,mul,True,True
37175,zoophysiolog-,zoophysiol.,mul,True,True
37185,zooveterinar-,zoovet.,mul,True,True


In [228]:
def get_iso4_expansions_for_word(word, lang):
    if "." in word: # it is an abbreviation
        lang_df = get_iso4_df_for_lang(lang)
        df = lang_df[(lang_df["abbrev"] == word)]
        if df.empty:
            return word
        elif df.shape[0] == 1:
            return df.iloc[0]["full"]
        else:
            return df["full"].tolist()
    else: # it is not an abbreviation
        return word

In [229]:
def get_raw_iso4_expansions(journal_lst, lang):
    raw_expansions = []
    for word in journal_lst:
        
        # special case with hyphenated word
        if "-" in word:
            w_lst = word.split("-")
            w_new = ""
            for w in w_lst:
                w_new += "|" + str(get_iso4_expansions_for_word(w.lower(), lang))
            raw_expansions.append(w_new[1:])
            continue
        
        # regular case 
        iso4_expansions = get_iso4_expansions_for_word(word.lower(), lang)
        raw_expansions.append(iso4_expansions)
    return raw_expansions

In [230]:
get_raw_iso4_expansions(["Oil-Bear."], "eng")

["oil|['bearer', 'bearing-']"]

In [231]:
dataset_abbrev_freq.tail(5)

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
6799,ökonom.,[ökonometrie],[True],{'ökonometrie': 1},{'ökonometrie': 1.0},ökonometrie,abbrev not in iso4
6800,österr.,"[österreichischen, österreichische, österreich...","[True, True, True, True, True]","{'österreichischen': 1, 'österreichische': 3, ...","{'österreichischen': 0.2, 'österreichische': 0...",österreichische,abbrev not in iso4
6801,überbl.,[überblicke],[True],{'überblicke': 1},{'überblicke': 1.0},überblicke,abbrev not in iso4
6802,ül.,[ülikooli],[True],{'ülikooli': 1},{'ülikooli': 1.0},ülikooli,abbrev not in iso4
6803,ştiinţ.,[ştiinţific],[True],{'ştiinţific': 1},{'ştiinţific': 1.0},ştiinţific,abbrev not in iso4


In [232]:
def get_most_freq_for_word(word):
    df = dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == word]
    if df.empty:
        return word
    elif df.shape[0] == 1:
        return df.iloc[0]["most_freq"]
    else:
        return df["most_freq"].tolist()

In [233]:
def get_most_frequent_expansions(journal_lst):
    mf_expansions = []
    for word in journal_lst:
        
        # special case with hyphenated word
        if "-" in word:
            w_lst = word.split("-")
            w_new = ""
            for w in w_lst:
                w_new += "|" + str(get_most_freq_for_word(w.lower()))
            mf_expansions.append(w_new[1:])
            continue
        
        # regular case
        curr_mf = get_most_freq_for_word(word.lower())
        mf_expansions.append(curr_mf)
    return mf_expansions

In [422]:
def get_frequency_for_word(abbrev, full):
    df = dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == abbrev]
    if df.empty:
        return 0
    elif df.shape[0] == 1:
        freq_dict =  df.iloc[0]["freq"]
        if full in freq_dict.keys():
            return freq_dict[full]
        return 0
    else:
        freq_dict_lst =  df.iloc[0]["freq"].tolist()
        for freq_dict in freq_dict_lst:
            if full in freq_dict.keys():
                return freq_dict[full]
        return 0

In [425]:
type({"a": 1, "b":2})

dict

In [283]:
def match_raw_iso4_most_freq_word(curr_raw, curr_mf, curr_abbrev):
    # correct if iso4 is the most frequent
    if curr_raw == curr_mf:
        return curr_raw

    # correct if iso4 is substring of most frequent (ex. iso4's europ- for mf's european)
    if "-" in curr_raw:
        if curr_raw.split("-")[0] in curr_mf:
            return curr_mf
        if re.sub(r"\.", "", curr_mf) in curr_raw.split("-")[0]:
            return curr_raw.split("-")[0]

    # correct if most frequent word has frequency > 80%
    if get_frequency_for_word(curr_abbrev.lower(), curr_mf) > 0.8:
        return curr_mf

    # correct if most frequent word is abbreviation format and substring raw expansion
    if "." in curr_mf:
        if curr_mf.split(".")[0] == curr_raw[:len(curr_mf)-1]:
            return curr_raw
        
    if "." in curr_raw:
        if curr_raw.split(".")[0] == curr_mf[:len(curr_raw)-1]:
            return curr_mf
        
    return ""

In [236]:
import ast

In [237]:
ast.literal_eval("['1', '2']")

['1', '2']

In [404]:
def verify_raw_iso4_most_freq_lst(journal_lst, raw_expansion, mf_expansion):
    verification_lst = []
    for i in range(len(journal_lst)):
        curr_abbrev = journal_lst[i].lower()
        curr_raw = raw_expansion[i]
        curr_mf = mf_expansion[i]
        
        # keep non abbreviations the way they are
        if "." not in curr_abbrev:
            verification_lst.append(curr_abbrev)
            continue
        
        # special case with hyphenated word
        if "|" in curr_raw and "|" in curr_mf:
            raw_lst = curr_raw.split("|")
            mf_lst = curr_mf.split("|")
            w_new = ""
            for i in range(len(raw_lst)):
                if "[" in raw_lst[i] or "]" in raw_lst[i]:
                    options_for_i = ast.literal_eval(raw_lst[i])
                    opt = ""
                    for o in options_for_i:
                        tmp = match_raw_iso4_most_freq_word(o, mf_lst[i], curr_abbrev)
                        if tmp:
                            opt = tmp
                    w_new += "|" + opt
                else:
                    w_new += "|" + match_raw_iso4_most_freq_word(raw_lst[i], mf_lst[i], curr_abbrev)
            verification_lst.append(re.sub(r"\|", "-", w_new[1:]))
            continue
        
        if type(curr_raw) == str:
#             print(curr_abbrev, curr_mf, get_frequency_for_word(curr_abbrev.lower(), curr_mf))
            word_to_add = match_raw_iso4_most_freq_word(curr_raw, curr_mf, curr_abbrev)
            if word_to_add != "":
                verification_lst.append(word_to_add)
            else:
                if "-" in curr_raw:
                    verification_lst.append(curr_raw.split("-")[0])
                
                elif fuzz.ratio(curr_raw, curr_mf) > 90:
                    verificiation_lst.append(curr_raw)
                
            continue
            
                    
        elif type(curr_raw) == list:
            verified_word = ""
            for elem in curr_raw:
                
                word_to_add = match_raw_iso4_most_freq_word(elem, curr_mf, curr_abbrev)
                if word_to_add != "":
                    verified_word = word_to_add
                else:
                    if "-" in curr_raw:
                        verified_word = curr_raw.split("-")[0]
                    elif fuzz.ratio(curr_raw, curr_mf) > 95:
                        verified_word = curr_raw
                    
            verification_lst.append(verified_word)
            continue
                
        verification_lst.append("")
    return verification_lst

In [239]:
def abbreviate_journal_name(journal):
    url = "https://abbreviso.toolforge.org/a/" + journal
    request = requests.get(url)
    if request.status_code != 200:
        return journal
    return request.text

In [240]:
abbreviate_journal_name("United States of America")

'U. S. A.'

In [241]:
def get_frequencies_for_word(word):
    df = dataset_abbrev_freq[dataset_abbrev_freq["abbrev"] == word]
    if df.empty:
        return word
    elif df.shape[0] == 1:
        return df.iloc[0]["freq"]
    else:
        return df["freq"].tolist()

In [242]:
def get_frequencies_expansions(journal_lst):
    freq_expansions = []
    for word in journal_lst:
        
        # special case with hyphenated word
        if "-" in word:
            w_lst = word.split("-")
            w_new = ""
            for w in w_lst:
                w_new += "|" + str(get_frequencies_for_word(w.lower()))
            freq_expansions.append(w_new[1:])
            continue
        
        # regular case
        curr_mf = get_frequencies_for_word(word.lower())
        freq_expansions.append(curr_mf)
    return freq_expansions

In [243]:
get_frequencies_expansions(["J.","Earthq.", "Eng."])

[{'journal': 0.991830749895266,
  'society': 0.00020946795140343527,
  'american': 0.00020946795140343527,
  'polish': 0.00020946795140343527,
  'university': 0.00041893590280687055,
  'jersey': 0.0025136154168412233,
  'japan': 0.00020946795140343527,
  'jounal': 0.00020946795140343527,
  'jurnali': 0.00020946795140343527,
  'journals': 0.00020946795140343527,
  'journal.': 0.0016757436112274822,
  'j': 0.00020946795140343527,
  'jornal': 0.00041893590280687055,
  'journalen': 0.00020946795140343527,
  'journalism': 0.00020946795140343527,
  'bihar': 0.00020946795140343527,
  'operations': 0.00020946795140343527,
  'institute': 0.00020946795140343527,
  'islamic': 0.00020946795140343527,
  'journa': 0.00020946795140343527},
 'earthq.',
 {'engineering': 0.9326047358834244,
  'engineer': 0.01092896174863388,
  'engineers': 0.04371584699453552,
  'engineered': 0.0036429872495446266,
  'energy': 0.0018214936247723133,
  'engineering.': 0.0018214936247723133,
  'engineerign': 0.00364298724

In [244]:
from nltk import word_tokenize, pos_tag

In [245]:
def get_pos_tag_for_word(word):
    text = word_tokenize(word)
    pos_lst = nltk.pos_tag(text)
    if len(pos_lst) >= 1:
        pos_item = pos_lst[0]
        if len(pos_item) == 2:
            pos = pos_item[1]
            if "NN" in pos:
                return "noun"
            if "JJ" in pos:
                return "adj"
            if "V" in pos:
                return "verb"
            return "other"
        return ""
    return ""

In [246]:
# def get_pos_tag_for_lst(word_lst):
    

In [247]:
get_pos_tag_for_word("therapeutical")

'adj'

In [248]:
def get_max_prob_word_in_dict(d):
    if not d:
        return ""
    keys = list(d.keys())
    vals = list(d.values())
    return keys[vals.index(max(vals))]

In [249]:
def most_freq_noun_for_word(word):
    word = word.lower()
    frequencies = get_frequencies_for_word(word)
    nouns = dict()
    if frequencies and type(frequencies) == str:
        return frequencies
    elif frequencies and type(frequencies) == dict:
        for elem in frequencies.keys():
            if word.split(".")[0] == elem[:len(word)-1]:
                if get_pos_tag_for_word(elem) == "noun":
                    nouns[elem] = frequencies[elem]
    if nouns:
        return get_max_prob_word_in_dict(nouns)
    return ""

In [250]:
most_freq_noun_for_word("A.")

'america'

In [251]:
def build_model_bi_forward_predict(train_data):
    # Create a placeholder for model
    m = defaultdict(lambda: defaultdict(lambda: 0))

    # Count frequency of co-occurance  
    for sentence in train_data:
        for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
            if w1:
                w1 = w1.lower()
            if w2: 
                w2 = w2.lower()
            m[w1][w2] += 1

    for w1 in m:
        total_count = float(sum(m[w1].values()))
        for w2 in m[w1]:
            m[w1][w2] /= total_count
            
    return m

In [252]:
def build_model_bi_backward_predict(train_data):
    # Create a placeholder for model
    m = defaultdict(lambda: defaultdict(lambda: 0))

    # Count frequency of co-occurance  
    for sentence in train_data:
        for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
            if w1:
                w1 = w1.lower()
            if w2: 
                w2 = w2.lower()
            m[w2][w1] += 1

    for w2 in m:
        total_count = float(sum(m[w2].values()))
        for w1 in m[w2]:
            m[w2][w1] /= total_count
            
    return m

In [253]:
model_bi_forward = build_model_bi_forward_predict(ngram_train)

In [254]:
model_bi_backward = build_model_bi_backward_predict(ngram_train)

In [255]:
def filter_non_stopwords_in_dict(d):
    result = {}
    for e in d.keys():
        if e in stopwords:
            result[e] = d[e]
    return result

In [256]:
def filter_stopwords_in_dict(d):
    result = {}
    for e in d.keys():
        if e not in stopwords:
            result[e] = d[e]
    return result

In [257]:
dict(model_bi_forward["perspectives"])

{'in': 0.2891566265060241,
 'on': 0.24096385542168675,
 None: 0.37349397590361444,
 'innovations': 0.012048192771084338,
 'and': 0.012048192771084338,
 'quarterly': 0.012048192771084338,
 'en': 0.012048192771084338,
 'gerontological': 0.012048192771084338,
 'internationales': 0.012048192771084338,
 '/': 0.012048192771084338,
 'crm': 0.012048192771084338}

In [258]:
dict(model_bi_forward["united"])

{'states': 0.5220125786163522,
 'kingdom': 0.1949685534591195,
 'arab': 0.018867924528301886,
 'states.': 0.1761006289308176,
 'nations': 0.05660377358490566,
 'nations.': 0.006289308176100629,
 'hospital': 0.012578616352201259,
 'synagogue': 0.006289308176100629,
 'evangelical': 0.006289308176100629}

In [261]:
def ngram_predict(model, word, stopwords = False, abbrev = ""):
    word_pred_d = dict(model[word])
    if stopwords:
        word_pred_d = filter_non_stopwords_in_dict(word_pred_d)
    else:
        word_pred_d = filter_stopwords_in_dict(word_pred_d)
        
    result = {}
    if abbrev == "":
        max_prob_word = get_max_prob_word_in_dict(word_pred_d)
        if max_prob_word and word_pred_d[max_prob_word]:
            result[max_prob_word] = word_pred_d[max_prob_word]
    else:    
        ## make sure we are only choosing from words with appropriate stem 
        if word_pred_d:
            temp = {}
            for w in word_pred_d.keys():
                if w and abbrev.split(".")[0] in w:
                    temp[w] = word_pred_d[w]
                
            max_prob_word = get_max_prob_word_in_dict(temp)
            if max_prob_word and word_pred_d[max_prob_word]:
                result[max_prob_word] = word_pred_d[max_prob_word]
                
    return result

In [262]:
ngram_predict(model_bi_forward, "perspectives", True)

{'in': 0.2891566265060241}

In [300]:
def bigram_stopword_recovery(word_lst, m_forward, m_backward):
    result = []
    for i in range(len(word_lst) - 1):
        
        curr_word = word_lst[i]
        result.append(curr_word)
        next_stopword_pred = ngram_predict(m_forward, curr_word, True)
        
        next_word = word_lst[i + 1]
        prev_stopword_pred = ngram_predict(m_backward, next_word, True)
        
        if not next_stopword_pred:
            continue

        if list(next_stopword_pred.values())[0] > 0.8:
            result.append(list(next_stopword_pred.keys())[0])
        elif (prev_stopword_pred and 
              (list(next_stopword_pred.values())[0] > 0.3 or list(prev_stopword_pred.values())[0] > 0.3)):
            
            if list(next_stopword_pred.keys())[0] == list(prev_stopword_pred.keys())[0]:
                result.append(list(next_stopword_pred.keys())[0])
        
    result.append(word_lst[len(word_lst) - 1])
    return result

In [297]:
bigram_stopword_recovery(['Perspectives', 'Drug', 'Discovery', 'Design'], model_bi_forward, model_bi_backward)

['Perspectives', 'Drug', 'Discovery', 'Design']

In [298]:
bigram_stopword_recovery(['journal', 'earthquake', 'engineering'], model_bi_forward, model_bi_backward)

['journal', 'of', 'earthquake', 'engineering']

In [301]:
bigram_stopword_recovery(['united', 'states', 'america'], model_bi_forward, model_bi_backward)

['united', 'states', 'of', 'america']

In [314]:
def bigram_find_blank_words(word_lst, m_forward, m_backward, abbrev_lst):
    
    # have a vector that keeps track of what should be changed 
    check = [0] * len(word_lst)
    for i in range(len(word_lst)):
        if word_lst[i] == "":
            check[i] = 1
    
    # all words filled already 
    if sum(check) < 1:
        return word_lst
    
    for i in range(len(check)):
        
        if check[i] == 1:
            abbrev = abbrev_lst[i].lower()
            next_word_pred = {}
            prev_word_pred = {}
            if i == 0:
                next_word = word_lst[i + 1]
                prev_word_pred = ngram_predict(m_backward, next_word, False, abbrev)
            elif i == len(check) - 1:
                prev_word = word_lst[i - 1]
                next_word_pred = ngram_predict(m_forward, prev_word, False, abbrev)
            else:
                prev_word = word_lst[i - 1]
                next_word_pred = ngram_predict(m_forward, prev_word, False, abbrev)
                next_word = word_lst[i + 1]
                prev_word_pred = ngram_predict(m_backward, next_word, False, abbrev)
            
            curr_from_prev = ""
            curr_from_next = ""
            if next_word_pred:
                curr_from_prev = list(next_word_pred.keys())[0]
            if prev_word_pred:
                curr_from_next = list(prev_word_pred.keys())[0]
                
            if curr_from_prev != "" and curr_from_next != "":
                if curr_from_next == curr_from_prev:
                    word_lst[i] = curr_from_prev
                elif prev_word_pred[curr_from_next] > next_word_pred[curr_from_prev]:
                    word_lst[i] = curr_from_next
                else:
                    word_lst[i] = curr_from_prev
            elif curr_from_prev != "":
                word_lst[i] = curr_from_prev
            elif curr_from_next != "":
                word_lst[i] = curr_from_next
        
    return word_lst

In [267]:
bigram_find_blank_words(['journal', '', 'engineering'], model_bi_forward, model_bi_backward, ['J.', 'Earthq.', "Eng."])

['journal', 'earthquake', 'engineering']

In [316]:
def fix_capital_for_word(init_word, curr_word):
    
    if not init_word or not curr_word:
        return curr_word
    
    # if previous word is not abbreviation and all caps, keep it that way
    if "." not in init_word and init_word.isupper():
        return init_word
    
    # if previous word has its first character capitalized, do the same
    if init_word[0].isupper():
        return curr_word[0].upper() + curr_word[1:]
        
    # else, keep curr word
    return curr_word

In [310]:
def fix_capitalization(initial_lst, lst_to_change):
    result = []
    if len(initial_lst) > len(lst_to_change):
        print("something went wrong")
        return lst_to_change
    
    i_i = 0
    for i_j in range(len(lst_to_change)):
        init_word = initial_lst[i_i]
        curr_word = lst_to_change[i_j]
        if type(init_word) != str and type(curr_word) != str:
            result.append(curr_word)
            i_i += 1
            continue
        if curr_word == "" or init_word == "":
            result.append(curr_word)
            continue
        if curr_word in stopwords:
            result.append(curr_word)
            continue
        if "-" in curr_word and "-" in init_word:
            c_w_lst = curr_word.split("-")
            i_w_lst = init_word.split("-")
            new_w = ""
            for i in range(len(i_w_lst)):
                new_w += "-" + fix_capital_for_word(i_w_lst[i], c_w_lst[i])
            
            result.append(new_w[1:])
            i_i += 1
            continue
            
        result.append(fix_capital_for_word(init_word, curr_word))
        i_i += 1
#         print(init_word, curr_word)
        
    return result

In [343]:
def main(journal_abbrev, verbose = False, lang = ""):
    if not lang:
        lang = get_language(journal_abbrev)
        if not lang:
            lang = "eng"
        
    journal_lst = get_journal_lst(journal_abbrev)
    
    raw_expansion = get_raw_iso4_expansions(journal_lst, lang)
    
    mf_expansion = get_most_frequent_expansions(journal_lst)
    
    if verbose:
        print("journal list: ", journal_lst)
        print("raw iso4 expansion: ", raw_expansion)
        print("most frequent expansion: ", mf_expansion)
    
    if len(raw_expansion) != len(mf_expansion):
        print("something went wrong")
        
    verify_raw_mf = verify_raw_iso4_most_freq_lst(journal_lst, raw_expansion, mf_expansion)
    if verbose: print("verfication list: ", verify_raw_mf)
        
    bigram_pred = bigram_find_blank_words(verify_raw_mf, model_bi_forward, model_bi_backward, journal_lst)
    if verbose: print("bigram prediction: ", bigram_pred)
    
    if get_pos_tag_for_word(bigram_pred[-1]) != "noun":
        mf_noun = most_freq_noun_for_word(journal_lst[-1])
#         print(mf_noun)
        if mf_noun:
            bigram_pred = bigram_pred[:-1] + [mf_noun]
            
    if verbose: print("word list after fixing pos tagging: ", bigram_pred)
        
    bigram_pred_w_stop = bigram_stopword_recovery(bigram_pred, model_bi_forward, model_bi_backward)
    if verbose: print("bigram prediction with stop words: ", bigram_pred_w_stop)
    
    word_lst_fixed_capital = fix_capitalization(journal_lst, bigram_pred_w_stop)
    if verbose: print("fixed capitalization: ", word_lst_fixed_capital)
    
    journal_final_expansion = " ".join(word_lst_fixed_capital)
    
    if verbose: print("RESULT: ", journal_final_expansion)
    return journal_final_expansion

In [423]:
main("J. Earthq. Eng.", verbose = True)

['J.', 'Earthq.', 'Eng.']
journal list:  ['J.', 'Earthq.', 'Eng.']
raw iso4 expansion:  [['journal', 'jurnal'], 'earthquake', 'engineer-']
most frequent expansion:  ['journal', 'earthq.', 'engineering']
verfication list:  ['journal', 'earthquake', 'engineering']
bigram prediction:  ['journal', 'earthquake', 'engineering']
word list after fixing pos tagging:  ['journal', 'earthquake', 'engineering']
bigram prediction with stop words:  ['journal', 'of', 'earthquake', 'engineering']
fixed capitalization:  ['Journal', 'of', 'Earthquake', 'Engineering']
RESULT:  Journal of Earthquake Engineering


'Journal of Earthquake Engineering'

In [277]:
main("J. Essent. Oil-Bear. Plants", verbose = True)

journal list:  ['J.', 'Essent.', 'Oil-Bear.', 'Plants']
raw iso4 expansion:  [['journal', 'jurnal'], 'essential', "oil|['bearer', 'bearing-']", 'plants']
most frequent expansion:  ['journal', 'essential', 'oil|bear.', 'plants']
verfication list:  ['journal', 'essential', 'oil-bearing', 'Plants']
bigram prediction:  ['journal', 'essential', 'oil-bearing', 'Plants']
word list after fixing pos tagging:  ['journal', 'essential', 'oil-bearing', 'Plants']
bigram prediction with stop words:  ['journal', 'of', 'essential', 'oil-bearing', 'Plants']
fixed capitalization:  ['Journal', 'of', 'Essential', 'Oil-Bearing', 'Plants']
RESULT:  Journal of Essential Oil-Bearing Plants


'Journal of Essential Oil-Bearing Plants'

In [394]:
main("Recent Adv. Stud. Card. Struct. Metab.", verbose = True)
# should be "Recent Advances in Studies on Cardiac Structure and Metabolism"

['Recent', 'Adv.', 'Stud.', 'Card.', 'Struct.', 'Metab.']
journal list:  ['Recent', 'Adv.', 'Stud.', 'Card.', 'Struct.', 'Metab.']
raw iso4 expansion:  ['recent', 'advanc-', ['studies', 'studying'], 'card.', 'structur-', ['metabole', 'metaboli-']]
most frequent expansion:  ['recentes', 'advances', 'studies', 'cardiac', 'structural', 'metabolism']
verfication list:  ['recent', 'advances', 'studies', 'cardiac', 'structural', 'metabolism']
bigram prediction:  ['recent', 'advances', 'studies', 'cardiac', 'structural', 'metabolism']
word list after fixing pos tagging:  ['recent', 'advances', 'studies', 'cardiac', 'structural', 'metabolism']
bigram prediction with stop words:  ['recent', 'advances', 'in', 'studies', 'cardiac', 'structural', 'and', 'metabolism']
fixed capitalization:  ['Recent', 'Advances', 'in', 'Studies', 'Cardiac', 'Structural', 'and', 'Metabolism']
RESULT:  Recent Advances in Studies Cardiac Structural and Metabolism


'Recent Advances in Studies Cardiac Structural and Metabolism'

In [396]:
main("J. Am. Acad. Dermatol.", verbose = True)
# should be "Journal of the American Academy of Dermatology"

['J.', 'Am.', 'Acad.', 'Dermatol.']
journal list:  ['J.', 'Am.', 'Acad.', 'Dermatol.']
raw iso4 expansion:  [['journal', 'jurnal'], 'america-', 'academ-', 'dermatol.']
most frequent expansion:  ['journal', 'american', 'academy', 'dermatology']
verfication list:  ['journal', 'american', 'academy', 'dermatology']
bigram prediction:  ['journal', 'american', 'academy', 'dermatology']
word list after fixing pos tagging:  ['journal', 'american', 'academy', 'dermatology']
bigram prediction with stop words:  ['journal', 'of', 'american', 'academy', 'of', 'dermatology']
fixed capitalization:  ['Journal', 'of', 'American', 'Academy', 'of', 'Dermatology']
RESULT:  Journal of American Academy of Dermatology


'Journal of American Academy of Dermatology'

In [397]:
main("Real Living Mult. Scler.")
# should be "Real Living with Multiple Sclerosis"

['Real', 'Living', 'Mult.', 'Scler.']


'Real Living Multiple Sclerosis'

In [405]:
main("An. R. Acad. Farm.", True, "spa")
# Anales de la Real Academia de Farmacia

['An.', 'R.', 'Acad.', 'Farm.']
journal list:  ['An.', 'R.', 'Acad.', 'Farm.']
raw iso4 expansion:  ['anales', 'real (royal)', 'academ-', 'farmac-']
most frequent expansion:  ['anales', 'royal', 'academy', 'farmacia']
verfication list:  ['anales', 'academy', 'farmacia']
bigram prediction:  ['anales', 'academy', 'farmacia']
word list after fixing pos tagging:  ['anales', 'academy', 'farmacia']
bigram prediction with stop words:  ['anales', 'academy', 'of', 'farmacia']
fixed capitalization:  ['Anales', 'Academy', 'of', 'Farmacia']
RESULT:  Anales Academy of Farmacia


'Anales Academy of Farmacia'

In [278]:
main("Trans. Soc. Occup. Med.", verbose = True)

journal list:  ['Trans.', 'Soc.', 'Occup.', 'Med.']
raw iso4 expansion:  ['transact-', ['social', 'societ-'], ['occupancy', 'occupation-', 'occupied'], 'medecin-']
most frequent expansion:  ['transactions', 'society', 'occupational', 'medicine']
verfication list:  ['transactions', 'society', 'occupational', '']
bigram prediction:  ['transactions', 'society', 'occupational', 'medicine']
word list after fixing pos tagging:  ['transactions', 'society', 'occupational', 'medicine']
bigram prediction with stop words:  ['transactions', 'society', 'occupational', 'medicine']
fixed capitalization:  ['Transactions', 'Society', 'Occupational', 'Medicine']
RESULT:  Transactions Society Occupational Medicine


'Transactions Society Occupational Medicine'

In [279]:
main("Int. J. Geogr. Inf. Sci.")
# should be "International Journal of Geographical Information Science"

'International Journal of Geography Information Science'

In [280]:
from fuzzywuzzy import fuzz 
fuzz.ratio("International Journal of Geographical Information Science", 
           "International Journal of Geography Information Science")

95

In [281]:
fuzz.ratio("Journal of Continuous Education Nursing", "Journal of Continuous Education in Nursing")

96

*Can generate a fairly decent value for fuzzy matching

In [302]:
main("Proc. Natl. Acad. Sci. U. S. A.", True)
# should be "Proceedings of the National Academy of Sciences of the United States of America"

journal list:  ['Proc.', 'Natl.', 'Acad.', 'Sci.', 'U.', 'S.', 'A.']
raw iso4 expansion:  ['proceedings', 'nat̡ional-', 'academ-', ['scienc-', 'scient-'], 'u.', 's.', 'a.']
most frequent expansion:  ['proceedings', 'national', 'academy', 'science', 'united', 'states', 'america']
verfication list:  ['proceedings', 'national', 'academy', 'science', 'united', 'states', 'america']
bigram prediction:  ['proceedings', 'national', 'academy', 'science', 'united', 'states', 'america']
word list after fixing pos tagging:  ['proceedings', 'national', 'academy', 'science', 'united', 'states', 'america']
bigram prediction with stop words:  ['proceedings', 'national', 'academy', 'of', 'science', 'united', 'states', 'of', 'america']
fixed capitalization:  ['Proceedings', 'National', 'Academy', 'of', 'Science', 'United', 'States', 'of', 'America']
RESULT:  Proceedings National Academy of Science United States of America


'Proceedings National Academy of Science United States of America'

In [303]:
fuzz.ratio("Proceedings National Academy of Science United States of America", 
           "Proceedings of the National Academy of Sciences of the United States of America")

90

In [305]:
dict(model_tri["proceedings", "of"]) 

{'the': 0.9621212121212122,
 'symposia': 0.007575757575757576,
 'spie': 0.015151515151515152,
 'national': 0.003787878787878788,
 'ilia': 0.003787878787878788,
 'institute': 0.003787878787878788,
 'machine': 0.003787878787878788}

*Possibly add a layer of trigram prediction for "the"

In [306]:
dict(model_tri["sciences", "of"]) # we cannot get the "of" in United States of America

{'the': 0.5555555555555556,
 'moldova': 0.037037037037037035,
 'complexity': 0.07407407407407407,
 'urss': 0.037037037037037035,
 'moldova.': 0.037037037037037035,
 'armenia': 0.037037037037037035,
 'azerbaijan': 0.07407407407407407,
 'armenia.': 0.037037037037037035,
 'ukraine.': 0.037037037037037035,
 'azerbaijan.': 0.037037037037037035,
 'belarus.': 0.037037037037037035}

### Testing Model

In [308]:
# abbrev_df_sample = abbrev_df.sample(n = 200)
# abbrev_df_sample["abbrev_exact"] = abbrev_df_sample["full"].apply(abbreviate_journal_name)
# print(abbrev_df_sample.shape)
abbrev_df_sample.head(10)

Unnamed: 0,full,abbrev,abbrev_exact
11834,Denver journal of international law and policy,Denver J Int Law Policy,Denver j. int. law policy
22386,Shujutsu. Operation,Shujutsu,"Shujutsu, Oper."
38612,Journal of Nutrition Education and Behavior,J. Nutr. Educ. Behav.,J. Nutr. Educ. Behav.
24435,The Philippine journal of pediatrics,Philipp J Pediatr,Philipp. j. pediatr.
44974,Mathematical Forum,Math. Forum,Math. Forum
20778,Quaderni della nutrizione,Quad Nutr,Quad. nutr.
7415,Advances in genetics,Adv Genet,Adv. genet.
2728,Integrating Materials and Manufacturing Innova...,Integr. Mater. Manuf. Innov.,Integr. Mater. Manuf. Innov.
299,Cellular Microbiology,Cell. Microbiol.,Cell. Microbiol.
36277,Diskussionsforum Medizinische Ethik,Diskussionsforum Med. Ethik,Diskuss. Med. Ethik


In [317]:
abbrev_df_sample["expand_from_exact"] = abbrev_df_sample["abbrev_exact"].apply(lambda x: main(x, False))
abbrev_df_sample.head()

nuclei
annuelle

physics
counselor
society



astronautics
interactions
mycology
internationale


Unnamed: 0,full,abbrev,abbrev_exact,expand_from_exact
11834,Denver journal of international law and policy,Denver J Int Law Policy,Denver j. int. law policy,Denver journal of international law policy
22386,Shujutsu. Operation,Shujutsu,"Shujutsu, Oper.",Shujutsu Operations
38612,Journal of Nutrition Education and Behavior,J. Nutr. Educ. Behav.,J. Nutr. Educ. Behav.,Journal of Nutrition Education Behavior
24435,The Philippine journal of pediatrics,Philipp J Pediatr,Philipp. j. pediatr.,Philippine journal of pediatric
44974,Mathematical Forum,Math. Forum,Math. Forum,Mathematics Forum


In [318]:
abbrev_df_sample.head()

Unnamed: 0,full,abbrev,abbrev_exact,expand_from_exact
11834,Denver journal of international law and policy,Denver J Int Law Policy,Denver j. int. law policy,Denver journal of international law policy
22386,Shujutsu. Operation,Shujutsu,"Shujutsu, Oper.",Shujutsu Operations
38612,Journal of Nutrition Education and Behavior,J. Nutr. Educ. Behav.,J. Nutr. Educ. Behav.,Journal of Nutrition Education Behavior
24435,The Philippine journal of pediatrics,Philipp J Pediatr,Philipp. j. pediatr.,Philippine journal of pediatric
44974,Mathematical Forum,Math. Forum,Math. Forum,Mathematics Forum


In [319]:
abbrev_correct_expand = abbrev_df_sample[abbrev_df_sample["expand_from_exact"] == abbrev_df_sample["full"]]
print(abbrev_correct_expand.shape)
abbrev_correct_expand.head()

(58, 4)


Unnamed: 0,full,abbrev,abbrev_exact,expand_from_exact
7415,Advances in genetics,Adv Genet,Adv. genet.,Advances in genetics
299,Cellular Microbiology,Cell. Microbiol.,Cell. Microbiol.,Cellular Microbiology
7435,Advances in morphogenesis,Adv Morphog,Adv. morphog.,Advances in morphogenesis
11809,Dental progress,Dent Prog (Chic),Dent. prog.,Dental progress
36380,Emergency Planning Digest,Emerg. Plann. Dig.,Emerg. Plan. Dig.,Emergency Planning Digest


In [320]:
abbrev_incorrect_expand = abbrev_df_sample[abbrev_df_sample["expand_from_exact"] != abbrev_df_sample["full"]]
print(abbrev_incorrect_expand.shape)
abbrev_incorrect_expand.head(20)

(142, 4)


Unnamed: 0,full,abbrev,abbrev_exact,expand_from_exact
11834,Denver journal of international law and policy,Denver J Int Law Policy,Denver j. int. law policy,Denver journal of international law policy
22386,Shujutsu. Operation,Shujutsu,"Shujutsu, Oper.",Shujutsu Operations
38612,Journal of Nutrition Education and Behavior,J. Nutr. Educ. Behav.,J. Nutr. Educ. Behav.,Journal of Nutrition Education Behavior
24435,The Philippine journal of pediatrics,Philipp J Pediatr,Philipp. j. pediatr.,Philippine journal of pediatric
44974,Mathematical Forum,Math. Forum,Math. Forum,Mathematics Forum
20778,Quaderni della nutrizione,Quad Nutr,Quad. nutr.,Quaderni nutrition
2728,Integrating Materials and Manufacturing Innova...,Integr. Mater. Manuf. Innov.,Integr. Mater. Manuf. Innov.,Integrative Materials and Manufacturing and In...
36277,Diskussionsforum Medizinische Ethik,Diskussionsforum Med. Ethik,Diskuss. Med. Ethik,Diskuss. Medicine Ethik
15711,Journal of chromatographic science,J Chromatogr Sci,J. chromatogr. sci.,Journal of chromatography science
23495,The Canadian dental hygienist,Can Dent Hyg,Can. dent. hyg.,Canadian dental hygiene


In [321]:
abbrev_df_sample["expansion_score"]= abbrev_df_sample.apply(lambda x: 
                                                            fuzz.ratio(x["expand_from_exact"], x["full"]), axis = 1)

In [393]:
abbrev_close_expand = abbrev_df_sample[abbrev_df_sample["expansion_score"] > 85]
print(abbrev_close_expand.shape)
abbrev_close_expand.head()

(152, 6)


Unnamed: 0,full,abbrev,abbrev_exact,expand_from_exact,expansion_score,expansion_partial_score
11834,Denver journal of international law and policy,Denver J Int Law Policy,Denver j. int. law policy,Denver journal of international law policy,95,90
22386,Shujutsu. Operation,Shujutsu,"Shujutsu, Oper.",Shujutsu Operations,95,95
38612,Journal of Nutrition Education and Behavior,J. Nutr. Educ. Behav.,J. Nutr. Educ. Behav.,Journal of Nutrition Education Behavior,95,90
24435,The Philippine journal of pediatrics,Philipp J Pediatr,Philipp. j. pediatr.,Philippine journal of pediatric,93,100
44974,Mathematical Forum,Math. Forum,Math. Forum,Mathematics Forum,91,88


In [384]:
abbrev_df_sample["expansion_partial_score"]= abbrev_df_sample.apply(lambda x: 
                                                            fuzz.partial_ratio(x["expand_from_exact"], x["full"]), axis = 1)

In [386]:
abbrev_part_close_expand = abbrev_df_sample[abbrev_df_sample["expansion_partial_score"] > 90]
print(abbrev_part_close_expand.shape)
abbrev_part_close_expand.head()

(106, 6)


Unnamed: 0,full,abbrev,abbrev_exact,expand_from_exact,expansion_score,expansion_partial_score
22386,Shujutsu. Operation,Shujutsu,"Shujutsu, Oper.",Shujutsu Operations,95,95
24435,The Philippine journal of pediatrics,Philipp J Pediatr,Philipp. j. pediatr.,Philippine journal of pediatric,93,100
7415,Advances in genetics,Adv Genet,Adv. genet.,Advances in genetics,100,100
299,Cellular Microbiology,Cell. Microbiol.,Cell. Microbiol.,Cellular Microbiology,100,100
15711,Journal of chromatographic science,J Chromatogr Sci,J. chromatogr. sci.,Journal of chromatography science,96,94


In [389]:
abbrev_bad_expand = abbrev_df_sample[(abbrev_df_sample["expansion_score"] < 80) &
                                    (abbrev_df_sample["expansion_partial_score"] < 80)]
print(abbrev_bad_expand.shape)
abbrev_bad_expand.head()

(19, 6)


Unnamed: 0,full,abbrev,abbrev_exact,expand_from_exact,expansion_score,expansion_partial_score
20778,Quaderni della nutrizione,Quad Nutr,Quad. nutr.,Quaderni nutrition,79,67
36277,Diskussionsforum Medizinische Ethik,Diskussionsforum Med. Ethik,Diskuss. Med. Ethik,Diskuss. Medicine Ethik,72,65
10567,Chemical Society reviews,Chem Soc Rev,Chem. Soc. rev.,Chemistry Sociological review,64,51
7970,"Anatomic pathology (Chicago, Ill. : annual)",Anat Pathol,"Anat. pathol. (Chic. Ill, : annu.)",Anatomy pathology Chicago Ill Chicago Ill : an...,76,72
37897,Journal Belge de Medecine Physique,J. Belge Med. Phys.,J. Belge Med. Phys.,Journal of Belge Physics,68,64


In [378]:
main("Proc. Inst. Med. Chic.", True, "eng")

['Proc.', 'Inst.', 'Med.', 'Chic.']
journal list:  ['Proc.', 'Inst.', 'Med.', 'Chic.']
raw iso4 expansion:  ['proceedings', ['institut', 'institute', 'institution'], 'medecin-', 'chic.']
most frequent expansion:  ['proceedings', 'institute', 'medicine', 'chicago']
verfication list:  ['proceedings', 'institute', 'medecin', 'chicago']
bigram prediction:  ['proceedings', 'institute', 'medecin', 'chicago']
word list after fixing pos tagging:  ['proceedings', 'institute', 'medecin', 'chicago']
bigram prediction with stop words:  ['proceedings', 'institute', 'medecin', 'chicago']
fixed capitalization:  ['Proceedings', 'Institute', 'Medecin', 'Chicago']
RESULT:  Proceedings Institute Medecin Chicago


'Proceedings Institute Medecin Chicago'

In [381]:
dict(model_bi_forward["proceedings"])

{'library': 0.0026109660574412533,
 'of': 0.6892950391644909,
 None: 0.1279373368146214,
 'lecture': 0.0026109660574412533,
 'and': 0.013054830287206266,
 '/': 0.028720626631853787,
 '-': 0.02349869451697128,
 'series': 0.005221932114882507,
 'baylor': 0.0026109660574412533,
 'annual': 0.02349869451697128,
 'the': 0.005221932114882507,
 'supplements': 0.005221932114882507,
 'in': 0.02610966057441253,
 'i': 0.0026109660574412533,
 'childrens': 0.0026109660574412533,
 'amia': 0.007832898172323759,
 'foundation': 0.0026109660574412533,
 'international': 0.0026109660574412533,
 'national': 0.0026109660574412533,
 'nursing': 0.0026109660574412533,
 'royal': 0.005221932114882507,
 'state': 0.0026109660574412533,
 'on': 0.0026109660574412533,
 'a': 0.0026109660574412533,
 'supplement': 0.0026109660574412533,
 'including': 0.0026109660574412533,
 'chemical': 0.0026109660574412533}

In [581]:
b = TextBlob("Mol. Pharm.")
lang = b.detect_language()
# get_language("Mol. Pharm.")
lang

'hi'

In [377]:
ltwa_df_with_abbrev[ltwa_df_with_abbrev["abbrev"] == "b."]

Unnamed: 0,full,abbrev,lang,is_substring,end_with_period


In [390]:
get_raw_iso4_expansions(["Ann.","Soc.","belge", "hist.", "hop."], "fre")

['annal-', ['social', 'societ-'], 'belge', ['histoir-', 'histor-'], 'hop.']

## Write Models 

### Frequencies 

In [406]:
dataset_abbrev_freq.head()

Unnamed: 0,abbrev,full,is_substring,count,freq,most_freq,abbrev_expanded
0,$k$-monogr.,[$k$-monographs],[True],{'$k$-monographs': 1},{'$k$-monographs': 1.0},$k$-monographs,abbrev not in iso4
1,-,"[–, --]","[False, True]","{'–': 1, '--': 1}","{'–': 0.5, '--': 0.5}",–,abbrev not in iso4
2,-geol.,"[-geologie, -geologie]","[True, True]",{'-geologie': 2},{'-geologie': 1.0},-geologie,abbrev not in iso4
3,.r.,"[rendus, rendus]","[True, True]",{'rendus': 2},{'rendus': 1.0},rendus,abbrev not in iso4
4,11,[fisica],[False],{'fisica': 1},{'fisica': 1.0},fisica,abbrev not in iso4


In [407]:
df_freq_to_write = dataset_abbrev_freq[["abbrev", "full", "freq", "most_freq"]]
df_freq_to_write.head()

Unnamed: 0,abbrev,full,freq,most_freq
0,$k$-monogr.,[$k$-monographs],{'$k$-monographs': 1.0},$k$-monographs
1,-,"[–, --]","{'–': 0.5, '--': 0.5}",–
2,-geol.,"[-geologie, -geologie]",{'-geologie': 1.0},-geologie
3,.r.,"[rendus, rendus]",{'rendus': 1.0},rendus
4,11,[fisica],{'fisica': 1.0},fisica


In [408]:
df_freq_to_write.to_csv("frequencies_train.csv")

### N-gram models
#### Model Bi Forward

In [418]:
import json

# writing
json.dump(model_bi_forward, open("test.json", 'w'))

# reading
out = json.load(open("test.json"))
out["perspectives"]

{'in': 0.2891566265060241,
 'on': 0.24096385542168675,
 'null': 0.37349397590361444,
 'innovations': 0.012048192771084338,
 'and': 0.012048192771084338,
 'quarterly': 0.012048192771084338,
 'en': 0.012048192771084338,
 'gerontological': 0.012048192771084338,
 'internationales': 0.012048192771084338,
 '/': 0.012048192771084338,
 'crm': 0.012048192771084338}

In [419]:
json.dump(model_bi_forward, open("model_bi_forward.json", 'w'))

In [420]:
json.dump(model_bi_backward, open("model_bi_backward.json", 'w'))

## TODO: More Work with Part of Speech Patterns

In [874]:
from nltk import word_tokenize, pos_tag
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [904]:
text = word_tokenize("journal of essential oil-bearing plants")
nltk.pos_tag(text)

[('journal', 'NN'),
 ('of', 'IN'),
 ('essential', 'JJ'),
 ('oil-bearing', 'JJ'),
 ('plants', 'NNS')]

## TODO: More Work with Different Languages

## TODO: Better Training Data 
- possibly ones with languages

## TODO: More work with Tri-grams and "the"

TODO: look more closely into the rules here https://marcinwrochna.github.io/abbrevIso/