In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# article counts per section of the source collection
health, social, business = 571, 366, 305

# overall article count and each section's share of it (fractions, not percent)
total = sum((health, social, business))
percentages = [count / total for count in (health, social, business)]

# report the total, then every share as a percentage rounded to two decimals
print('Total articles: ', total)
captions = (
    'Percentage of health articles: ',
    'Percentage of social science articles: ',
    'Percentage of business & economics articles: ',
)
for caption, share in zip(captions, percentages):
    print(caption, round(share*100, 2))

Total articles:  1242
Percentage of health articles:  45.97
Percentage of social science articles:  29.47
Percentage of business & economics articles:  24.56


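We select 100 articles for our dataset, because labelling a larger dataset for training purposes would take too long. The 100 articles are split across the three sections in proportion to each section's share of the total.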

In [3]:
# number of articles to be selected
n = 100

# Split n across the sections proportionally to `percentages` using the
# largest-remainder method: every section first receives the whole-number part
# of its exact quota, then the articles still unassigned go, one each, to the
# sections with the largest fractional parts. Rounding each quota on its own
# can over- or undershoot n (e.g. shares of 33.5/33.5/33 would round to 101),
# whereas this allocation always sums to exactly n.
exact_quotas = [n * share for share in percentages]
allocation = [int(quota) for quota in exact_quotas]  # quotas are >= 0, so int() floors
unassigned = n - sum(allocation)
by_remainder = sorted(
    range(len(exact_quotas)),
    key=lambda i: exact_quotas[i] - allocation[i],
    reverse=True,
)
for i in by_remainder[:unassigned]:
    allocation[i] += 1

# number of health, social science and business & economics articles
n_health, n_social, n_business = allocation

# print the numbers of articles in each selection
print('Number of health articles: ', n_health)
print('Number of social science articles: ', n_social)
print('Number of business & economics articles: ', n_business)

Number of health articles:  46
Number of social science articles:  29
Number of business & economics articles:  25


We randomly select article indices within each section, take the corresponding rows from the draft dataset, and save them to a file.

In [4]:
# load the draft list of articles from disk and preview its first rows
draft_path = 'draftdataset.csv'
draft = pd.read_csv(filepath_or_buffer=draft_path)
draft.head()

Unnamed: 0,study_id,link_to_study,study_title,study_year,Journal,Citations,Label
0,hm0001,https://www.sciencedirect.com/science/article/...,The psychological impact of quarantine and how...,2020,Lancet,97,H_and_M
1,hm0002,https://www.sciencedirect.com/science/article/...,"Global, regional, and national incidence, prev...",2017,Lancet,154,H_and_M
2,hm0003,https://www.thelancet.com/journals/lancet/arti...,A novel coronavirus outbreak of global health ...,2020,Lancet,4791,H_and_M
3,hm0004,https://www.sciencedirect.com/science/article/...,COVID-19 and Italy: what next?,2020,Lancet,1999,H_and_M
4,hm0005,https://www.sciencedirect.com/science/article/...,Structural racism and health inequities in the...,2017,Lancet,0,H_and_M


In [5]:
# pick random row positions from each section's range of the draft dataset,
# select those rows and combine them into a single data frame

# Dedicated, seeded generator: without a fixed seed every Restart & Run All
# draws a different sample and silently produces a different output file.
# A private Random instance keeps the global `random` state untouched.
sampler = random.Random(42)

# NOTE(review): the position ranges below assume the rows of `draft` are ordered
# health -> social science -> business & economics with exactly `health`,
# `social` and `business` rows respectively -- TODO confirm against the data.

# health articles: positions [0, health)
health_indices = sorted(sampler.sample(range(0, health), n_health))

# social science articles: positions [health, health + social)
social_indices = sorted(sampler.sample(range(health, health + social), n_social))

# business & economics articles: positions [health + social, health + social + business)
business_indices = sorted(
    sampler.sample(range(health + social, health + social + business), n_business)
)

# select corresponding rows from the dataset and store them in a data frame
health_articles = draft.iloc[health_indices]
social_articles = draft.iloc[social_indices]
business_articles = draft.iloc[business_indices]

# concatenate the data frames; ignore_index=True renumbers the rows 0..len-1,
# which replaces the separate in-place reset_index step
frames = [health_articles, social_articles, business_articles]
result = pd.concat(frames, ignore_index=True)
result.head()

Unnamed: 0,study_id,link_to_study,study_title,study_year,Journal,Citations,Label
0,hm0052,https://www.nature.com/articles/nm.4333,Mutational landscape of metastatic cancer reve...,2017,Nature Medicine,1,H_and_M
1,hm0056,https://www.nature.com/articles/s41591-018-0016-8,Molecular subtypes of diffuse large B cell lym...,2018,Nature Medicine,0,H_and_M
2,hm0059,https://www.nature.com/articles/S41591-021-012...,Resistance of SARS-CoV-2 variants to neutraliz...,2021,Nature Medicine,5,H_and_M
3,hm0085,https://www.nature.com/articles/nm.4394,GFRAL is the receptor for GDF15 and is require...,2017,Nature Medicine,0,H_and_M
4,hm0109,https://www.science.org/doi/full/10.1126/scien...,A neutralizing human antibody binds to the N-t...,2020,Science,0,H_and_M


In [6]:
# write the selected articles to a csv file, leaving out the data frame's row index
output_path = 'finaldataset.csv'
result.to_csv(output_path, index=False)