In [1]:
import pandas as pd

#### dataset_1 
- Complete Data
- __6027__ Rows

#### dataset_2
- Applied filters to dataset_1
- __4145__ Rows

#### dataset_3 
- Made corrections in dataset_2

In [2]:
# Load dataset_3 (dataset_2 with corrections applied) via a path relative to the working directory.
df = pd.read_csv("dataset_3.csv")

In [3]:
# Display the column names; 'Leader Addresses' and 'Addresses' are the ones parsed below.
df.columns

Index(['Authors', 'Author Full Names', 'Group Authors', 'Article Title',
       'Journals', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Author Keywords', 'Keywords Plus', 'Abstract', 'Addresses',
       'Country-Name', 'Leader Addresses', 'Email Addresses', 'Funding Orgs',
       'Funding Text', 'Cited Reference Count', 'Times Cited', 'Usage Count',
       'Publisher', 'Publisher City', 'ISSN', 'Publication Date',
       'Publication Year', 'DOI', 'Book DOI', 'Early Access Date',
       'Number of Pages', 'WoS Categories', 'Research Areas', 'IDS Number',
       'UT', 'Pubmed Id', 'Open Access Designations', 'Date of Export'],
      dtype='object')

## Column - __Leader Addresses__

#### Data collected about leaders : 
- leader name
- leader country

In [4]:
# Keep the 'Leader Addresses' column as its own Series for parsing.
leaderAddresses = df['Leader Addresses']

In [5]:
# Show the first entry to see the raw text format that getLeader() parses.
leaderAddresses[0]

'Poirier, RA (corresponding author), Mem Univ, Chem Dept, St John, NF A1B 3X7, Canada.'

### *getLeader( )*

In [6]:
def getLeader(text):
    """Extract the leader's last name and country from a 'Leader Addresses' entry.

    An entry looks like
    ``'Poirier, RA (corresponding author), Mem Univ, Chem Dept, St John, NF A1B 3X7, Canada.'``.
    The last name is everything before the first comma; the country is the
    last space-separated token once every "." has been removed.

    Parameters
    ----------
    text : str
        Raw cell value. Any non-string value (for example the float NaN that
        pandas uses for an empty CSV cell) is treated as "no leader".

    Returns
    -------
    dict
        ``{"leader_last_name": str, "leader_country": str}``; both values are
        empty strings when ``text`` is not a string.

    Notes
    -----
    Only the final token is kept as the country, so a multi-word country
    yields just its last word.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN, which has no string methods.
        return {"leader_last_name": "", "leader_country": ""}

    data = {}
    # Strip so stray padding cannot break later comparisons with author names.
    data["leader_last_name"] = text.split(",", 1)[0].strip()
    # Dots are dropped first so the trailing "." does not stick to the country.
    data["leader_country"] = text.replace(".", "").strip().split(" ")[-1]
    return data

In [7]:
# Try getLeader() on the first entry to check the parsed last name and country.
getLeader(leaderAddresses[0])

{'leader_last_name': 'Poirier', 'leader_country': 'Canada'}

## Column - __Addresses__

#### Data collected about authors : 
- firstname
- lastname
- fullname
- department
- organization
- country

In [8]:
# Keep the 'Addresses' column as its own Series for parsing.
Addresses = df['Addresses']

In [9]:
# Show the first entry to see the "[authors] address; [authors] address" format that getAuthors() parses.
Addresses[0]

'[Abu-Saleh, Abd Al-Aziz A.; Awad, Ibrahim E.; Poirier, Raymond A.] Mem Univ, Chem Dept, St John, NF A1B 3X7, Canada; [Yadav, Arpita] Chhatrapati Shahu Ji Maharaj Univ, Dept Chem, Univ Inst Engn & Technol, Kanpur 208024, Uttar Pradesh, India'

### *getAuthors( )*

In [10]:
def getAuthors(text):
    """Parse an 'Addresses' entry into one record per listed author.

    The entry is a sequence of groups of the form
    ``[Last, First; Last, First] Organization, Department, ..., Country``.
    Every author inside the brackets receives the organization, department
    and country of the address that follows the brackets.

    Parameters
    ----------
    text : str
        Raw cell value. Any non-string value (for example the float NaN that
        pandas uses for an empty CSV cell) yields an empty list.

    Returns
    -------
    list of dict
        Records with the keys ``first_name``, ``last_name``, ``full_name``
        (``"<first_name>, <last_name>"``), ``department``, ``organization``
        and ``country``, in the order the authors appear in ``text``.

    Notes
    -----
    * "." is removed and "-" is replaced by a space in author names.
    * The organization is the first comma-separated address part and the
      department is the second one with "dept"/"Dept" removed; it is an empty
      string when the address has no second part.
    * The country is the last space-separated token of the last address part.
    * Groups without a closing "]" and blank author entries are skipped.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN, which has no string methods.
        return []

    data = []
    # Text before the first "[" belongs to no author group, hence [1:].
    for segment in text.split("[")[1:]:
        segment = segment.replace("\n", "").strip()
        if segment.endswith(";"):
            # Drop the separator that precedes the next "[...]" group.
            segment = segment[:-1]

        pieces = segment.split("]")
        if len(pieces) < 2:
            # No closing bracket: authors cannot be told apart from the address.
            continue

        address_parts = pieces[1].split(",")
        organization = address_parts[0].strip()
        raw_department = address_parts[1] if len(address_parts) > 1 else ""
        department = raw_department.replace("dept", "").replace("Dept", "").strip()
        country = address_parts[-1].strip().split(" ")[-1]

        for raw_name in pieces[0].replace(".", "").split(";"):
            if not raw_name.strip():
                # A trailing ";" inside the brackets leaves an empty entry.
                continue
            # Without a comma both ends of `name` are the same single token.
            name = raw_name.replace("-", " ").split(",", 1)
            author = {}
            author["first_name"] = name[-1].strip()
            author["last_name"] = name[0].strip()
            author["full_name"] = author["first_name"] + ", " + author["last_name"]
            author["department"] = department
            author["organization"] = organization
            author["country"] = country
            data.append(author)
    return data

In [11]:
# Try getAuthors() on the first entry to check the parsed author records.
getAuthors(Addresses[0])

[{'first_name': 'Abd Al Aziz A',
  'last_name': 'Abu Saleh',
  'full_name': 'Abd Al Aziz A, Abu Saleh',
  'department': 'Chem',
  'organization': 'Mem Univ',
  'country': 'Canada'},
 {'first_name': 'Ibrahim E',
  'last_name': 'Awad',
  'full_name': 'Ibrahim E, Awad',
  'department': 'Chem',
  'organization': 'Mem Univ',
  'country': 'Canada'},
 {'first_name': 'Raymond A',
  'last_name': 'Poirier',
  'full_name': 'Raymond A, Poirier',
  'department': 'Chem',
  'organization': 'Mem Univ',
  'country': 'Canada'},
 {'first_name': 'Arpita',
  'last_name': 'Yadav',
  'full_name': 'Arpita, Yadav',
  'department': 'Chem',
  'organization': 'Chhatrapati Shahu Ji Maharaj Univ',
  'country': 'India'}]

## Collecting all __authors__ and __leaders__

In [12]:
# Parse every 'Addresses' entry; each element is the list of author records for one row.
authors = [getAuthors(entry) for entry in Addresses]
len(authors)

4145

In [13]:
# Parse every 'Leader Addresses' entry into a leader dict, one per row.
# The unused `count` variable has been dropped.
leaders = [getLeader(entry) for entry in leaderAddresses]
len(leaders)

4145

### *addLeaders( )*

In [14]:
def addLeaders(authors_data, leaders_data):
    """Attach each team's leader name to every author of that team.

    Teams and leaders are paired positionally with ``zip``, so any surplus
    entries of the longer argument are ignored. The leader is the team member
    whose ``last_name`` equals the leader's last name; when several members
    share that last name the last one in the team is used.

    Parameters
    ----------
    authors_data : list of list of dict
        One list of author records per team; each record needs the keys
        ``last_name`` and ``full_name``.
    leaders_data : list of dict
        One dict per team with the key ``leader_last_name``.

    Returns
    -------
    list of list of dict
        New outer and inner lists holding the *same* author dicts, each of
        which has gained a ``leader`` key in place. The value is the matched
        author's ``full_name``, or ``""`` when no member matches.
    """
    result = []
    for authors, leader in zip(authors_data, leaders_data):
        # Author last names are stored with "." removed and "-" replaced by a
        # space. Apply the same normalization to the leader's last name,
        # otherwise hyphenated or dotted names never match.
        leader_last_name = (
            leader['leader_last_name'].replace(".", "").replace("-", " ").strip()
        )

        leader_name = ""
        if leader_last_name:  # an empty name must not match a blank author record
            for member in authors:
                if member['last_name'] == leader_last_name:
                    leader_name = member['full_name']

        for member in authors:
            member['leader'] = leader_name
        result.append(list(authors))
    return result

In [15]:
# Tag every author with the team's leader name; the dicts inside `authors` are updated in place.
all_authors = addLeaders(authors,leaders)

## __Removing__ teams without any author whose address country is India

In [16]:
def remove(authors_data):
    """Return only the teams that contain at least one author whose country is "India".

    Parameters
    ----------
    authors_data : list of list of dict
        One list of author records per team; each record needs a ``country`` key.

    Returns
    -------
    list of list of dict
        The qualifying teams, in their original order (the same list objects).
    """
    return [
        team
        for team in authors_data
        if "India" in [member['country'] for member in team]
    ]

# Keep only the teams that have at least one author with country "India".
authors_data = remove(all_authors)

### *addTeamids()*

In [17]:
def addTeamids(authors_data):
    """Give every author a ``team_id`` shared by all members of the same team.

    Ids are consecutive integers starting at 1, following the order of
    ``authors_data``. The author dicts are modified in place; nothing is
    returned.
    """
    for team_number, team in enumerate(authors_data, start=1):
        for member in team:
            member['team_id'] = team_number

In [18]:
# Number the retained teams; this adds a 'team_id' key to each author dict in place.
addTeamids(authors_data)

### *addAuthorPositions()*

In [19]:
def addAuthorPositions(authors_data):
    """Label every author with a ``position`` inside their team.

    The base label is ``'first'`` for index 0, ``'last'`` for the final index
    and ``'other'`` for everything in between (a one-person team is
    ``'first'``). When the author's ``full_name`` equals their ``leader``
    value the label becomes ``'first, leader'``, ``'last, leader'`` or
    ``'leader'`` respectively. The author dicts are modified in place; nothing
    is returned.
    """
    as_leader = {
        'first': 'first, leader',
        'last': 'last, leader',
        'other': 'leader',
    }
    for team in authors_data:
        final_index = len(team) - 1
        for index, member in enumerate(team):
            if index == 0:
                slot = 'first'
            elif index == final_index:
                slot = 'last'
            else:
                slot = 'other'
            member['position'] = slot

            if member['full_name'] == member['leader']:
                member['position'] = as_leader[slot]

In [20]:
# Label each author's position in the team; this adds a 'position' key to each author dict in place.
addAuthorPositions(authors_data)

In [21]:
# Flatten the per-team lists into a single list with one record per author.
data = [member for team in authors_data for member in team]

In [22]:
# Total number of author records across all retained teams.
len(data)

31292

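## Adding __api names__ to authors

The api name is the first word of the author's first name, or the first word of the last name when that first word is only a single character (an initial).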

In [23]:
# Choose each author's api name: the first word of the first name, unless that
# word is at most one character long, in which case the first word of the
# last name is used instead.
for record in data:
    given_token = record["first_name"].split(" ")[0]
    if len(given_token) <= 1:
        record['api_name'] = record["last_name"].split(" ")[0]
    else:
        record['api_name'] = given_token

In [24]:
# Export step (currently disabled): uncomment both lines to write the
# flattened author records to data.csv without the DataFrame index.
# file_data = pd.DataFrame(data)
# file_data.to_csv("data.csv", index=False)