In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

In [None]:
# Load the combined PubMed/OpenAlex frame from a pickle next to the notebook.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('./df_pubmed_openalex_combined.pkl')

In [None]:
# Show the column names of the frame as loaded.
df.columns

# Data cleansing

### Check and drop duplicates

In [None]:
# Report the row count, keep one row per 'pmid', then report the count again.
n_all_records = len(df)
print('All records: {}'.format(n_all_records))
df = df.drop_duplicates(subset=['pmid'])
n_unique_records = len(df)
print('After removing duplicates: {}'.format(n_unique_records))

### Extra filter for years 2000-2024 (wrong years could be included during fetching)

In [None]:
# Keep only rows whose 'year_pubmed' lies in 2000..2024 (both ends included),
# then renumber the index from 0.
print('Before "year" filter: {}'.format(len(df)))
year_in_range = df['year_pubmed'].between(2000, 2024)
df = df.loc[year_in_range].reset_index(drop=True)
print('After "year" filter: {}'.format(len(df)))

# Transformations

### Transform selected columns

In [None]:
def author_country(x):
    """Return the 'countries' value of every authorship entry in ``x``.

    ``x`` is a sequence of mappings, each with a 'countries' key. The result
    is a list with one item per entry, in the same order as ``x``.
    """
    return [authorship['countries'] for authorship in x]

### Create column with countries of all authors

In [None]:
# One list per article, holding each author's 'countries' value.
df['countries'] = df['authorships'].apply(author_country)

### Create columns with Last/first author country 

In [None]:
# 'countries' holds one item per author (built from 'authorships'); take the
# last and the first item, or None when the article has no authors at all.
# NOTE(review): an author whose own country list is empty yields [] here, not
# None, so a later dropna on these columns keeps such rows -- confirm intended.
df['last_author_country'] = df['countries'].apply(
    lambda per_author: None if len(per_author) == 0 else per_author[-1]
)
df['first_author_country'] = df['countries'].apply(
    lambda per_author: None if len(per_author) == 0 else per_author[0]
)

### Extra cleaning: Drop records without last or first author country

In [None]:
# Count, then remove, the rows where either author-country column is null.
author_country_cols = ['last_author_country', 'first_author_country']
missing_either = df['last_author_country'].isnull() | df['first_author_country'].isnull()
print('numer of rows to drop: {}'.format(missing_either.sum()))
df = df.dropna(subset=author_country_cols)
print('Final dataframe shape: {}'.format(df.shape))

### Create column with Number of authors

In [None]:
# Number of authorship entries per article.
df['authors_number'] = df['authorships'].apply(len)

### Create column with open-access information

In [None]:
# Encode the 'is_oa' entry of each 'open_access' mapping as an integer via int().
df['is_open_access'] = df['open_access'].apply(lambda oa_info: int(oa_info['is_oa']))

### Assign only one (majority) country for article

In [None]:
def majority_country(x):
    """Pick a single country label for one article (one DataFrame row).

    Parameters
    ----------
    x : mapping-like row
        Must provide ``'countries'`` (an iterable of per-author country
        iterables), ``'last_author_country'`` and ``'first_author_country'``
        (each an iterable of countries). Countries must be hashable.

    Returns
    -------
    The selected country, or the string ``'Multinational'`` when no rule
    below applies (this includes the case where no country is listed at all).

    Rules, applied in order:

    1. Exactly one country has the highest count over all affiliations.
    2. Otherwise, exactly one of the last author's countries is among the
       tied most frequent countries.
    3. Otherwise, exactly one of the first author's countries is among them.
    """
    last_author_ctrs = x['last_author_country']    # last author countries
    first_author_ctrs = x['first_author_country']  # first author countries

    # Tally every listed country (incl. multiple affiliations) in a single pass.
    counts = {}
    for country in itertools.chain.from_iterable(x['countries']):
        counts[country] = counts.get(country, 0) + 1

    # Compute the top count once instead of once per candidate country.
    top_count = max(counts.values(), default=0)
    majority_ctrs = [country for country, n in counts.items() if n == top_count]

    # Countries of the last / first author that are among the most frequent ones
    # (duplicates are kept, so a repeated country counts as several matches).
    last_matches = [country for country in last_author_ctrs if country in majority_ctrs]
    first_matches = [country for country in first_author_ctrs if country in majority_ctrs]

    # rule 1
    if len(majority_ctrs) == 1:
        return majority_ctrs[0]
    # rule 2
    if len(last_matches) == 1:
        return last_matches[0]
    # rule 3
    if len(first_matches) == 1:
        return first_matches[0]
    return 'Multinational'

In [None]:
# Row-wise: derive one country label per article.
df['majority_country'] = df.apply(majority_country, axis=1)

### Create column with annual citation rate for article

In [None]:
def calc_cit_factor(row, current_year=2025):
    """Return the average number of citations per year for one article.

    Parameters
    ----------
    row : mapping-like row
        Must provide ``'year_pubmed'`` and ``'cited_by_count'``.
    current_year : int, optional
        Reference year the elapsed time is measured against (default 2025).

    Returns
    -------
    ``cited_by_count / (current_year - year_pubmed)``, or ``None`` when the
    publication year is not before ``current_year`` (no full year has elapsed,
    so a rate would divide by zero or come out negative).
    """
    years_elapsed = current_year - row['year_pubmed']
    if years_elapsed <= 0:
        return None
    return row['cited_by_count'] / years_elapsed
    
# Row-wise: each row supplies 'year_pubmed' and 'cited_by_count' to calc_cit_factor.
df['cit_per_year'] = df.apply(calc_cit_factor, axis=1)

### Create column with number of references

In [None]:
# Length of each article's 'referenced_works' entry.
df['n_references'] = df['referenced_works'].apply(len)

In [None]:
# Show the column names after the derived columns have been added.
df.columns

# Remove excess columns (not needed anymore after transformations)

In [None]:
# Remove the source/helper columns that the derived columns were built from.
excess_columns = [
    'ids',
    'authorships',
    'open_access',
    'referenced_works',
    'last_author_country',
    'first_author_country',
]
df = df.drop(columns=excess_columns)

In [None]:
# Show the remaining column names after the drop.
df.columns

# Save final dataframe

In [None]:
# Write the preprocessed frame to a pickle file next to the notebook.
df.to_pickle('./df_pm_oa_preprocessed.pkl')

In [None]:
# Preview the first rows of the final frame.
df.head()