In [2]:
import pandas as pd

In [3]:
# Load the raw export from authors.csv (path is relative to the notebook's working directory).
df = pd.read_csv("authors.csv")

In [109]:
df.columns

Index(['Authors', 'Author Full Names', 'Group Authors', 'Article Title',
       'Journals', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Author Keywords', 'Keywords Plus', 'Abstract', 'Addresses',
       'Country-Name', 'Leader Addresses', 'Email Addresses', 'Funding Orgs',
       'Funding Text', 'Cited Reference Count', 'Times Cited', 'Usage Count',
       'Publisher', 'Publisher City', 'ISSN', 'Publication Date',
       'Publication Year', 'DOI', 'Book DOI', 'Early Access Date',
       'Number of Pages', 'WoS Categories', 'Research Areas', 'IDS Number',
       'UT', 'Pubmed Id', 'Open Access Designations', 'Date of Export'],
      dtype='object')

# <font color='green'>Leader Addresses</font> Column

#### Data collected about the leader (corresponding author):
- leader last name
- leader country

In [110]:
# Keep the 'Leader Addresses' column on its own; each entry is later parsed by getLeader().
leaderAddresses = df['Leader Addresses']

In [111]:
leaderAddresses.head()

0    Poirier, RA (corresponding author), Mem Univ, ...
1    Kolacz, J (corresponding author), Indiana Univ...
2    D'Cruz, M; Banerjee, D (corresponding author),...
3    Puthiyedath, R (corresponding author), Amrita ...
4    Rana, MK (corresponding author), IISER, Dept C...
Name: Leader Addresses, dtype: object

### *getLeader( )*

In [112]:
def getLeader(text):
    """Extract the leader's last name and country from one 'Leader Addresses' entry.

    Parameters
    ----------
    text : str
        One cell of the 'Leader Addresses' column, e.g.
        ``"Poirier, RA (corresponding author), Mem Univ, ..., Canada."``.
        Non-string values (such as the NaN pandas stores for an empty cell)
        are accepted and yield empty fields instead of raising.

    Returns
    -------
    dict
        ``leader_last_name`` -- the text before the first comma, with "."
        removed, "-" replaced by a space and surrounding whitespace stripped.
        This is the same normalisation getAuthors() applies to author names,
        so the two can be compared with ``==``.
        ``leader_country`` -- the last space-separated word of the entry once
        every "." has been removed.
    """
    if not isinstance(text, str):
        return {"leader_last_name": "", "leader_country": ""}

    # Normalise exactly like getAuthors() does ("Kulkarni-Kale" -> "Kulkarni Kale");
    # otherwise hyphenated leaders can never be matched to their author record.
    last_name = text.split(",", 1)[0].replace(".", "").replace("-", " ").strip()

    # NOTE(review): only the final word is kept, so a multi-word country would be
    # reduced to its last word -- confirm whether that matters for this data set.
    country = text.replace(".", "").strip().split(" ")[-1]

    return {"leader_last_name": last_name, "leader_country": country}

In [113]:
getLeader(leaderAddresses[500])

{'leader_last_name': 'Choudhari', 'leader_country': 'India'}

# <font color='green'>Addresses</font> Column

#### Data collected about each author:
- first name
- last name
- full name
- department
- university
- country

In [114]:
# Keep the 'Addresses' column on its own; each entry is later parsed by getAuthors().
Addresses = df['Addresses']

In [115]:
Addresses.head()

0    [Abu-Saleh, Abd Al-Aziz A.; Awad, Ibrahim E.; ...
1    [Kolacz, Jacek; Nix, Evan J.; Roath, Olivia K....
2    [D'Cruz, Migita; Banerjee, Debanjan] Natl Inst...
3    [Payyappallimana, Unnikrishnan] Univ Transdisc...
4    [Sen Gupta, Parth Sarthi; Biswal, Satyaranjan;...
Name: Addresses, dtype: object

### *getAuthors( )*

In [116]:
def getAuthors(text):
    """Parse one 'Addresses' entry into a flat list of author records.

    The entry is read as a sequence of ``[name; name; ...] address`` segments.
    Within each segment the first comma-separated part of the address is taken
    as the university, the second as the department, and the last word of the
    final part as the country. Every name inside the brackets yields one record
    carrying that segment's affiliation, so an author listed under several
    addresses appears several times.

    Parameters
    ----------
    text : str
        One cell of the 'Addresses' column. Non-string values (such as the NaN
        pandas stores for an empty cell) are accepted and yield an empty list.

    Returns
    -------
    list of dict
        Each dict has the keys ``first_name``, ``last_name``, ``full_name``
        (``"<first_name>, <last_name>"``), ``department``, ``university`` and
        ``country``. Names have "." removed and "-" replaced by a space.
    """
    data = []
    if not isinstance(text, str):
        return data

    # Everything before the first "[" carries no bracketed author list.
    for segment in text.split("[")[1:]:
        address = segment.replace("\n", "").strip()
        if address.endswith(";"):
            address = address[:-1]

        pieces = address.split("]")
        if len(pieces) < 2:
            # No closing bracket: the names cannot be separated from the address.
            continue

        address_parts = pieces[1].split(",")
        university = address_parts[0].strip()
        # An address without a comma has no department part.
        department = address_parts[1] if len(address_parts) > 1 else ""
        department = department.replace("dept", "").replace("Dept", "").strip()
        country = address_parts[-1].split(" ")[-1].strip()

        for raw_name in pieces[0].replace(".", "").split(";"):
            name = raw_name.replace("-", " ").split(",", 1)
            first_name = name[-1].strip()
            last_name = name[0].strip()
            if not first_name and not last_name:
                # Blank fragment (e.g. a stray ";" inside the brackets).
                continue
            data.append({
                "first_name": first_name,
                "last_name": last_name,
                "full_name": first_name + ", " + last_name,
                "department": department,
                "university": university,
                "country": country,
            })
    return data

In [117]:
getAuthors(Addresses[50])

[{'first_name': 'Sunitha M',
  'last_name': 'Kasibhatla',
  'full_name': 'Sunitha M, Kasibhatla',
  'department': 'Bioinformat Ctr',
  'university': 'Savitribai Phule Pune Univ',
  'country': 'India'},
 {'first_name': 'Meenal',
  'last_name': 'Kinikar',
  'full_name': 'Meenal, Kinikar',
  'department': 'Bioinformat Ctr',
  'university': 'Savitribai Phule Pune Univ',
  'country': 'India'},
 {'first_name': 'Sanket',
  'last_name': 'Limaye',
  'full_name': 'Sanket, Limaye',
  'department': 'Bioinformat Ctr',
  'university': 'Savitribai Phule Pune Univ',
  'country': 'India'},
 {'first_name': 'Urmila',
  'last_name': 'Kulkarni Kale',
  'full_name': 'Urmila, Kulkarni Kale',
  'department': 'Bioinformat Ctr',
  'university': 'Savitribai Phule Pune Univ',
  'country': 'India'},
 {'first_name': 'Sunitha M',
  'last_name': 'Kasibhatla',
  'full_name': 'Sunitha M, Kasibhatla',
  'department': 'HPC Med & Bioinformat Applicat Grp',
  'university': 'Ctr Dev Adv Comp',
  'country': 'India'},
 {'firs

# <font color='green'>Adding Leaders</font> to Authors

In [118]:
# Parse every 'Addresses' entry; authors[k] is the list of author records for row k.
authors = [getAuthors(entry) for entry in Addresses]
len(authors)

4145

In [119]:
# Parse every 'Leader Addresses' entry; leaders[k] is the leader dict for row k,
# aligned by position with authors[k].
leaders = [getLeader(entry) for entry in leaderAddresses]
len(leaders)

4145

### *addLeaders( )*

In [120]:
def addLeaders(authors_data, leaders_data):
    """Attach the leader's full name to every author record of each paper.

    Parameters
    ----------
    authors_data : iterable of list of dict
        One list of author records per paper; each record must provide
        ``last_name`` and ``full_name`` (as produced by getAuthors()).
    leaders_data : iterable of dict
        One dict per paper, paired with ``authors_data`` by position; each
        must provide ``leader_last_name`` (as produced by getLeader()).

    Returns
    -------
    list of list of dict
        Same nesting as ``authors_data``. Every record is a new dict with an
        added ``leader`` key holding the ``full_name`` of the author whose
        ``last_name`` equals the leader's last name, or ``""`` when no author
        matches. The input records are left untouched.
    """
    result = []
    for paper_authors, leader in zip(authors_data, leaders_data):
        # Author names have "." removed and "-" replaced by a space; apply the
        # same normalisation here so e.g. "Kulkarni-Kale" matches "Kulkarni Kale".
        leader_last_name = (
            leader['leader_last_name'].replace(".", "").replace("-", " ").strip()
        )

        leader_name = ""
        for author in paper_authors:
            # No break on purpose: when several records share the last name,
            # the last one in the list is used.
            if author['last_name'] == leader_last_name:
                leader_name = author['full_name']

        # Build fresh dicts instead of mutating the caller's records, so the
        # function can be re-run on the same inputs without side effects.
        result.append([dict(author, leader=leader_name) for author in paper_authors])
    return result

In [133]:
# Combine the per-row author records with the per-row leader info (paired by position).
final_data = addLeaders(authors,leaders)

In [124]:
final_data[0]

[{'first_name': 'Abd Al Aziz A',
  'last_name': 'Abu Saleh',
  'full_name': 'Abd Al Aziz A, Abu Saleh',
  'department': 'Chem',
  'university': 'Mem Univ',
  'country': 'Canada',
  'leader': 'Raymond A, Poirier'},
 {'first_name': 'Ibrahim E',
  'last_name': 'Awad',
  'full_name': 'Ibrahim E, Awad',
  'department': 'Chem',
  'university': 'Mem Univ',
  'country': 'Canada',
  'leader': 'Raymond A, Poirier'},
 {'first_name': 'Raymond A',
  'last_name': 'Poirier',
  'full_name': 'Raymond A, Poirier',
  'department': 'Chem',
  'university': 'Mem Univ',
  'country': 'Canada',
  'leader': 'Raymond A, Poirier'},
 {'first_name': 'Arpita',
  'last_name': 'Yadav',
  'full_name': 'Arpita, Yadav',
  'department': 'Chem',
  'university': 'Chhatrapati Shahu Ji Maharaj Univ',
  'country': 'India',
  'leader': 'Raymond A, Poirier'}]

In [135]:
# Dump the Python repr of final_data to data.txt. The context manager closes the
# handle when the block exits, so the content is flushed to disk and not leaked.
with open("data.txt","w") as file:
    chars_written = file.write(str(final_data))
chars_written

7432232