In [1]:
import sklearn


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import hashlib

# Load data
author_id = pd.read_csv('D://id.csv', dtype=str)
author_name = pd.read_csv('D://name.csv', dtype=str)
author_affiliation = pd.read_csv('D://affiliation.csv', dtype=str)

# Standardize author names to lowercase
author_name['name'] = author_name['name'].str.lower()

# Fill missing ORCID with a placeholder
author_id['orcid'].fillna('unknown', inplace=True)

# Merge the DataFrames on the common identifiers
merged_data = author_name.merge(author_id, left_on='auth_id', right_on='ident', how='left')
merged_data = merged_data.merge(author_affiliation, on=['pmid', 'auth_id'], how='left')

# Drop unnecessary columns
merged_data.drop(columns=['ident', 'source', 'identifier'], inplace=True)

# Create a unique identifier for each author entry
merged_data['unique_id'] = merged_data['pmid'] + '_' + merged_data['auth_id']

# Shorten unique_id using a hash function
merged_data['short_unique_id'] = merged_data['unique_id'].apply(lambda x: hashlib.sha1(x.encode()).hexdigest()[:10])

# Handle missing values in the merged DataFrame
merged_data.fillna('unknown', inplace=True)

# Remove duplicate short_unique_id
merged_data = merged_data.drop_duplicates(subset=['short_unique_id'])

# Display the cleaned and merged DataFrame
print(merged_data.head())

# Feature Engineering
le = LabelEncoder()
merged_data['auth_id'] = le.fit_transform(merged_data['auth_id'])
merged_data['affiliation'] = le.fit_transform(merged_data['affiliation'])
merged_data['fore_name'] = le.fit_transform(merged_data['fore_name'])
merged_data['last_name'] = le.fit_transform(merged_data['last_name'])

features = ['auth_id', 'affiliation', 'fore_name', 'last_name']
X = merged_data[features]
y = merged_data['short_unique_id']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

merged_data['predicted_short_unique_id'] = model.predict(X)

merged_data

       pmid       auth_id              name_x author_type initials  fore_name  \
0  20199141  (20199141,1)        masaki kasai      Person        M     Masaki   
2  20199141  (20199141,2)      takahiro jikoh      Person        T   Takahiro   
3  20199141  (20199141,3)  hidefumi fukumitsu      Person        H   Hidefumi   
4  20199141  (20199141,4)      shoei furukawa      Person        S      Shoei   
5  22734847  (22734847,1)   matthew f wszolek      Person       MF  Matthew F   

   last_name   suffix  orcid_x   name_y  orcid_y  \
0      Kasai  unknown  unknown  unknown  unknown   
2      Jikoh  unknown  unknown  unknown  unknown   
3  Fukumitsu  unknown  unknown  unknown  unknown   
4   Furukawa  unknown  unknown  unknown  unknown   
5    Wszolek  unknown  unknown  unknown  unknown   

                                         affiliation              unique_id  \
0  Laboratory of Molecular Biology, Gifu Pharmace...  20199141_(20199141,1)   
2                                         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,pmid,auth_id,name_x,author_type,initials,fore_name,last_name,suffix,orcid_x,name_y,orcid_y,affiliation,unique_id,short_unique_id,predicted_short_unique_id
0,20199141,0,masaki kasai,Person,M,57,49,unknown,unknown,unknown,unknown,22,"20199141_(20199141,1)",1c14b34b83,980f9ac5f8
2,20199141,1,takahiro jikoh,Person,T,77,47,unknown,unknown,unknown,unknown,25,"20199141_(20199141,2)",980f9ac5f8,980f9ac5f8
3,20199141,2,hidefumi fukumitsu,Person,H,31,25,unknown,unknown,unknown,unknown,25,"20199141_(20199141,3)",48e789ee9b,48e789ee9b
4,20199141,3,shoei furukawa,Person,S,72,26,unknown,unknown,unknown,unknown,25,"20199141_(20199141,4)",56c6682735,56c6682735
5,22734847,4,matthew f wszolek,Person,MF,58,90,unknown,unknown,unknown,unknown,17,"22734847_(22734847,1)",95b24cd9fa,56c6682735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,23733708,96,sung-chan shin,Person,SC,75,75,unknown,unknown,unknown,unknown,25,"23733708_(23733708,2)",e8ebd4a8b0,30bb0c5175
130,23733708,97,yun-sung lim,Person,YS,90,53,unknown,unknown,unknown,unknown,25,"23733708_(23733708,3)",c5a387e532,133a34619e
131,23733708,98,jin-choon lee,Person,JC,39,50,unknown,unknown,unknown,unknown,25,"23733708_(23733708,4)",133a34619e,133a34619e
132,23733708,99,soo-geun wang,Person,SG,74,86,unknown,unknown,unknown,unknown,25,"23733708_(23733708,5)",84dd131abc,84dd131abc


In [4]:
selected_columns_merged_data = merged_data[['pmid', 'initials', 'short_unique_id']]
selected_columns_id = author_id[['ident', 'name','orcid']]
selected_columns_name = author_name[['auth_id','author_type','initials','fore_name','last_name']]
selected_columns_affiliatiom = author_affiliation[['affiliation']]
selected_columns_merged_data2 = merged_data[['short_unique_id']]

n_merged_data = pd.concat([selected_columns_merged_data, selected_columns_id, selected_columns_name, selected_columns_affiliatiom, selected_columns_merged_data2], axis=1)
n_merged_data


Unnamed: 0,pmid,initials,short_unique_id,ident,name,orcid,auth_id,author_type,initials.1,fore_name,last_name,affiliation,short_unique_id.1
0,20199141,M,1c14b34b83,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Weijie Du,unknown,"(20199141,1)",Person,M,Masaki,Kasai,"Laboratory of Molecular Biology, Gifu Pharmace...",1c14b34b83
2,20199141,T,980f9ac5f8,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Aashay Kekatpure,unknown,"(20199141,3)",Person,H,Hidefumi,Fukumitsu,"Bioengineering Department, Politecnico di Mila...",980f9ac5f8
3,20199141,H,48e789ee9b,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,V Rigau,unknown,"(20199141,4)",Person,S,Shoei,Furukawa,Mechanics of Materials and Constructions (MeMC...,48e789ee9b
4,20199141,S,56c6682735,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Wen-Shi Wei,unknown,"(22734847,1)",Person,MF,Matthew F,Wszolek,1 Department of Child and Adolescent Psychiatr...,56c6682735
5,22734847,MF,95b24cd9fa,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Theodore H Schwartz,unknown,"(22734847,2)",Person,D,David,Canes,2 Department of Child Psychiatry and Sleep cen...,95b24cd9fa
...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,,,,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Haiyue Dai,unknown,"(23733708,1)",Person,YS,Yoon Se,Lee,Hans Popper Laboratory of Molecular Hepatology...,
197,,,,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Natalia Lopez-Andres,unknown,"(23733708,2)",Person,SC,Sung-Chan,Shin,State Key Laboratory of Cancer Biology and Xij...,
198,,,,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,L van Rijn,unknown,"(23733708,3)",Person,YS,Yun-Sung,Lim,"Biostatistics Unit, Department of Health Scien...",
199,,,,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Vijay Gadgil,unknown,"(23733708,4)",Person,JC,Jin-Choon,Lee,"Department of Obstetrics and Gynecology, Sant&...",


In [5]:
final_merged_data = n_merged_data.dropna(subset=['short_unique_id'])

In [6]:
author_names_and_uid = final_merged_data.rename(columns={'short_unique_id': 'UId'})
author_names_and_uid

Unnamed: 0,pmid,initials,UId,ident,name,orcid,auth_id,author_type,initials.1,fore_name,last_name,affiliation,UId.1
0,20199141,M,1c14b34b83,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Weijie Du,unknown,"(20199141,1)",Person,M,Masaki,Kasai,"Laboratory of Molecular Biology, Gifu Pharmace...",1c14b34b83
2,20199141,T,980f9ac5f8,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Aashay Kekatpure,unknown,"(20199141,3)",Person,H,Hidefumi,Fukumitsu,"Bioengineering Department, Politecnico di Mila...",980f9ac5f8
3,20199141,H,48e789ee9b,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,V Rigau,unknown,"(20199141,4)",Person,S,Shoei,Furukawa,Mechanics of Materials and Constructions (MeMC...,48e789ee9b
4,20199141,S,56c6682735,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Wen-Shi Wei,unknown,"(22734847,1)",Person,MF,Matthew F,Wszolek,1 Department of Child and Adolescent Psychiatr...,56c6682735
5,22734847,MF,95b24cd9fa,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Theodore H Schwartz,unknown,"(22734847,2)",Person,D,David,Canes,2 Department of Child Psychiatry and Sleep cen...,95b24cd9fa
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,23733708,SC,e8ebd4a8b0,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,John M Costello,https://orcid.org/0000-0001-7713-0258,"(23592528,3)",Person,CM,C&Atilde;&copy;lia M,Strunz,Delta-Ultrasound Diagnostic Center in Obstetri...,e8ebd4a8b0
130,23733708,YS,c5a387e532,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Hassan Sajjad,unknown,"(23592528,4)",Person,A,Alexandre,Soeiro,Department of Otorhinolaryngology - Head and N...,c5a387e532
131,23733708,JC,133a34619e,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Daniele Cristine Krebs Ribeiro,unknown,"(23592528,5)",Person,M,M&Atilde;&iexcl;rcio,Santos,"Department of Otolaryngology, Singapore Genera...",133a34619e
132,23733708,SG,84dd131abc,https://purl.humanatlas.io/graph/hra-lit/v0.6#...,Yiding Wang,unknown,"(23592528,6)",Person,F,Fl&Atilde;&iexcl;vio,Pivateli,Division of Cardiology and Cardiovascular Surg...,84dd131abc


In [7]:
author_names_and_uid.to_csv('author_details_and_uid.csv', index=False)