In [1]:
library(dplyr)
library(tidyr)
#install.packages('gender',lib=.libPaths()[3])
#install.packages("genderdata", repos = "http://packages.ropensci.org", type = "source",lib=.libPaths()[3])
library(gender)
library(stringr)
library(widyr)
library(readr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
block = read_csv("newdataCombined.csv")
block = block[block$Year >=2010 & block$Year <=2017,]
myvars <- c("AuthorIdsOrder", "AuthorNamesOrder", "Year", "PaperId")
data0 <- block[myvars]
data0$Year1 <-  data0$Year-35 #25-45
data0$Year2 <-  data0$Year-45 #35-55
data0$Year3 <-  data0$Year-55 #45-65
data0$authorCount <- str_count(data0$AuthorIdsOrder, '; ')+1
nrow(data0)

Parsed with column specification:
cols(
  Tag = col_character(),
  PaperId = col_double(),
  AuthorIdsOrder = col_character(),
  AuthorNamesOrder = col_character(),
  FoSNames = col_character(),
  Year = col_integer(),
  DocType = col_character(),
  Journal = col_character(),
  Publisher = col_character(),
  Doi = col_character(),
  Title = col_character(),
  EstimatedCitation = col_integer(),
  URLs = col_character(),
  IndexedAbstract = col_character()
)


### We use "Year2" for birth year estimate, i.e. assuming authors are in the age range [35-55] when they publish. 

In [3]:
AuthorTable <- data0 %>% separate(AuthorNamesOrder, into = sprintf('%s.%s', rep('Author',100), rep(1:100)), sep = "; ") #max author has exceeded 90
AuthorTable <- AuthorTable %>% gather(authorOrder, name, into = sprintf('%s.%s', rep('Author',100), rep(1:100)))
AuthorList <- data.frame(lapply(AuthorTable, trimws), stringsAsFactors = FALSE)
nameTable <- AuthorList %>% separate(name, into = sprintf('%s.%s', rep('namePart',4), rep(1:4)), sep = " ")

“Too few values at 13047 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...”

In [4]:
newvars <- c("PaperId","namePart.1", "namePart.2", "namePart.3", "namePart.4", "Year2", "authorOrder", "authorCount")
nameParts <- nameTable[newvars]
nameParts$namePart.1 <- gsub("\"", "", nameParts$namePart.1)
nameParts$namePart.2 <- gsub("\"", "", nameParts$namePart.2)
nameParts$namePart.3 <- gsub("\"", "", nameParts$namePart.3)
nameParts$namePart.4 <- gsub("\"", "", nameParts$namePart.4)
nameParts$min_years <- as.numeric(nameParts$Year2)-20
nameParts$max_years <- as.numeric(nameParts$Year2)+20
results <- gender_df(nameParts, name_col = "namePart.1", year_col = c("min_years", "max_years"), method = "ssa")
output <- nameParts %>% left_join(results, by = c("namePart.1" = "name", "min_years" = "year_min"))
results <- gender_df(nameParts, name_col = "namePart.2", year_col = c("min_years", "max_years"), method = "ssa")
output <- output %>% left_join(results, by = c("namePart.2" = "name", "min_years" = "year_min"))
results <- gender_df(nameParts, name_col = "namePart.3", year_col = c("min_years", "max_years"), method = "ssa")
output <- output %>% left_join(results, by = c("namePart.3" = "name", "min_years" = "year_min"))
output <- within(output, #if namepart3 is the last name, set it NA 
   temp2 <- ifelse(is.na(namePart.4),NA,proportion_female)
)
output <- within(output, #if namepart2 is the last name, set it NA 
   temp1 <- ifelse(is.na(namePart.3),NA,proportion_female.y)
)
output <- within(output, #if namepart1 is the last name, set it NA 
   temp0 <- ifelse(is.na(namePart.2),NA,proportion_female.x)
)
output <- within(output, # namepart2 merged with namepart3
   femaleProb0 <- ifelse(is.na(temp1),temp2,temp1)
)
output <- within(output, # namepart1 merged with namepart2
   femaleProb <- ifelse(is.na(temp0),temp1,temp0)
)
output <- within(output, # binarize the probabilities, with 1 representing female and 0 male
   genderLabel <- ifelse(femaleProb>0.5,1,0)
)
output %>% 
  group_by(genderLabel) %>%
  summarise(no_rows = length(genderLabel))

genderLabel,no_rows
0.0,6175
1.0,3162
,283263


### We break each author name into 4 parts, delimited by space in their full name. The detected gender probability of later name parts overwrite earlier ones. The final labels are given by binarizing the probabilities, with 1 representing female and 0 male.

In [5]:
newvars <- c("PaperId", "Year2", "authorOrder", "genderLabel", "authorCount")
merged <- output[newvars]
merged <- spread(merged, authorOrder, genderLabel)
merged$detectCount <- 100-rowSums(is.na(merged))
merged$femaleCount <- rowSums(merged[sapply(merged, is.numeric)], na.rm = TRUE) - merged$detectCount
merged$femaleProp <- merged$femaleCount/merged$detectCount
merged$femaleProp[is.infinite(merged$femaleProp) | is.nan(merged$femaleProp) ] <- NA
merged$authorCount <- as.numeric(merged$authorCount)
merged$unknownCount <- merged$authorCount-merged$detectCount
merged$lastColName <- paste("Author", merged$authorCount, sep=".") 
merged$last <- as.numeric(merged[cbind(1:nrow(merged),match(merged$lastColName, colnames(merged)))])
merged$PaperId <- as.numeric(merged$PaperId)
merged <- rename(merged, "1st" = "Author.1", "2nd" = "Author.2", "3rd" = "Author.3", "4th" = "Author.4", "5th" = "Author.5")
outputVars <- c("PaperId", "authorCount", "unknownCount", "femaleCount", "femaleProp", "1st", "2nd", "3rd", "4th", "5th", "last")
mergedSelect <-  merged[outputVars]
#write.csv(merged, file = "output.csv")

### Group gender detected results to papers. Besides paper level aggregate statistics, first 5 and the last authors are kepted for downstream analysis (OpenSci3.csv).

In [6]:
block0 <- block %>% left_join(merged, by = c("PaperId" = "PaperId"))
outputVars <- c("Tag", "PaperId", "AuthorIdsOrder", "AuthorNamesOrder", "authorCount", "unknownCount", "femaleCount", "femaleProp", "1st", "2nd", "3rd", "4th", "5th", "last",
                "FoSNames", "Year", "DocType", "Journal", "Publisher", "Doi", "Title", "EstimatedCitation", "IndexedAbstract")
data <- block0[outputVars]
write_csv(data, "OpenSci3.csv")
nrow(data)

### Counting pie chart numbers

In [7]:
data$allmale <- as.numeric(data$"1st"==0) * as.numeric(data$last==0) * as.numeric(data$authorCount>1)
data$allfemale <- as.numeric(data$"1st"==1) * as.numeric(data$last==1) * as.numeric(data$authorCount>1)
data$mixed <- as.numeric(data$"1st"==1) + as.numeric(data$last==1)
data$mixed <- as.numeric(data$mixed==1)
data$allUnknown <- as.numeric(is.na(data$"1st")) * as.numeric(is.na(data$last)) * as.numeric(data$authorCount>1)
data$UnknownMale <- as.numeric(is.na(data$"1st")) * as.numeric(data$last==0)
data$UnknownFemale <- as.numeric(is.na(data$"1st")) * as.numeric(data$last==1)
data$maleUnknown <- as.numeric(is.na(data$last)) * as.numeric(data$"1st"==0)
data$femaleUnknown <- as.numeric(is.na(data$last)) * as.numeric(data$"1st"==1)
data$singleMale <- as.numeric(data$"1st"==0) * as.numeric(data$authorCount==1)
data$singleFemale <- as.numeric(data$"1st"==1) * as.numeric(data$authorCount==1)
data$singleUnknown <- as.numeric(is.na(data$"1st")) * as.numeric(data$authorCount==1)
outputVars <- c("Tag","allmale","allfemale","mixed","allUnknown","UnknownMale","UnknownFemale","maleUnknown","femaleUnknown","singleMale","singleFemale","singleUnknown")
dataOpen <- data[outputVars]
dataOpen <- dataOpen[dataOpen$Tag=="OpenScience",]
pieOpen <- dplyr::bind_rows(colSums(dataOpen[,-1], na.rm = TRUE))
dataRep <- data[outputVars]
dataRep <- dataRep[dataRep$Tag=="Reproducibility",]
pieRep <- dplyr::bind_rows(colSums(dataRep[,-1], na.rm = TRUE))
pie <- dplyr::bind_rows(list(OpenScience=pieOpen, Reproducibility=pieRep), .id = 'Tag')
pie$sum <- rowSums(pie[-1])
#write_csv(pie, "Pie.csv")

In [8]:
pie

Tag,allmale,allfemale,mixed,allUnknown,UnknownMale,UnknownFemale,maleUnknown,femaleUnknown,singleMale,singleFemale,singleUnknown,sum
OpenScience,190,73,169,43,43,21,56,29,146,72,37,879
Reproducibility,419,80,326,345,200,66,165,104,194,76,72,2047


In [9]:
library(tibble)
single <- pie[c("singleMale","singleFemale","singleUnknown")]
single <- as.data.frame(t(single))
# Calculate percentage
single$V1p <- single$V1 / sum(single$V1)
single$V2p <- single$V2 / sum(single$V2)
single <- rownames_to_column(single, "Legend") 
# Reorder the rows for plotting and lockin with factors
single <- single[c(3,2,1),]
single$Legend <- factor(single$Legend, levels = single$Legend)

# Add label position for pie charts
single <- single %>%
  arrange(desc(Legend)) %>%
  mutate(lab.ypos1 = cumsum(V1p) - 0.5*V1p)
single <- single %>%
  arrange(desc(Legend)) %>%
  mutate(lab.ypos2 = cumsum(V2p) - 0.5*V2p)
# Prepare the labels for plotting
single$V1l <- sprintf("%0.2f%%", single$V1p * 100)
single$V2l <- sprintf("%0.2f%%", single$V2p * 100)
single

Legend,V1,V2,V1p,V2p,lab.ypos1,lab.ypos2,V1l,V2l
singleMale,146,194,0.572549,0.5672515,0.2862745,0.2836257,57.25%,56.73%
singleFemale,72,76,0.2823529,0.2222222,0.7137255,0.6783626,28.24%,22.22%
singleUnknown,37,72,0.145098,0.2105263,0.927451,0.8947368,14.51%,21.05%


In [10]:
pdf("SinglePie.pdf")
library(ggplot2)
bp<- ggplot(single, aes(x="", y=V1p, fill=Legend))+ ggtitle("Open science") +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = lab.ypos1, label = V1l))
bp
bp2<- ggplot(single, aes(x="", y=V2p, fill=Legend))+ ggtitle("Reproducibility") +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = lab.ypos2, label = V2l))
bp2
dev.off()

In [11]:
multi <- pie[c("allmale","maleUnknown","allUnknown","femaleUnknown","allfemale","mixed")]
multi$maleUnknown <- multi$maleUnknown + pie$UnknownMale
multi$femaleUnknown <- multi$femaleUnknown + pie$UnknownFemale
multi <- as.data.frame(t(multi))
# Calculate percentage
multi$V1p <- multi$V1 / sum(multi$V1)
multi$V2p <- multi$V2 / sum(multi$V2)
multi <- rownames_to_column(multi, "Legend") 
# Reorder the rows for plotting and lockin with factors
multi <- multi[c(6,5,4,3,2,1),]
multi$Legend <- factor(multi$Legend, levels = multi$Legend)

# Add label position for pie charts
multi <- multi %>%
  arrange(desc(Legend)) %>%
  mutate(lab.ypos1 = cumsum(V1p) - 0.5*V1p)
multi <- multi %>%
  arrange(desc(Legend)) %>%
  mutate(lab.ypos2 = cumsum(V2p) - 0.5*V2p)
# Prepare the labels for plotting
multi$V1l <- sprintf("%0.2f%%", multi$V1p * 100)
multi$V2l <- sprintf("%0.2f%%", multi$V2p * 100)
multi

Legend,V1,V2,V1p,V2p,lab.ypos1,lab.ypos2,V1l,V2l
allmale,190,419,0.30448718,0.2457478,0.1522436,0.1228739,30.45%,24.57%
maleUnknown,99,365,0.15865385,0.21407625,0.3838141,0.3527859,15.87%,21.41%
allUnknown,43,345,0.06891026,0.20234604,0.4975962,0.5609971,6.89%,20.23%
femaleUnknown,50,170,0.08012821,0.09970674,0.5721154,0.7120235,8.01%,9.97%
allfemale,73,80,0.11698718,0.04692082,0.6706731,0.7853372,11.70%,4.69%
mixed,169,326,0.27083333,0.19120235,0.8645833,0.9043988,27.08%,19.12%


In [12]:
pdf("MultiPie.pdf")
library(ggplot2)
bp<- ggplot(multi, aes(x="", y=V1p, fill=Legend))+ ggtitle("Open science") +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(x = 1.2, aes(y = lab.ypos1, label = V1l))
bp
bp2<- ggplot(multi, aes(x="", y=V2p, fill=Legend))+ ggtitle("Reproducibility") +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(x = 1.2, aes(y = lab.ypos2, label = V2l))
bp2
dev.off()

### Calculating the fraction of women authors in each literature

In [57]:
papers = read_csv("OpenSci3.csv")
AuthorTable <- papers %>% separate(AuthorIdsOrder, into = sprintf('%s%s', rep('',100), rep(1:100)), sep = "; ") #max author has exceeded 90
AuthorTable <- AuthorTable %>% separate(AuthorNamesOrder, into = sprintf('%s.%s', rep('name',100), rep(1:100)), sep = "; ") #max author has exceeded 90
AuthorTable <- AuthorTable %>% gather(authorOrder, AuthorIdsOrder, into = sprintf('%s%s', rep('',100), rep(1:100)))
AuthorTable <- AuthorTable[!is.na(AuthorTable$AuthorIdsOrder), ]
AuthorTable2 <-  AuthorTable[AuthorTable$Tag=="Reproducibility",]
AuthorTable1 <-  AuthorTable[AuthorTable$Tag=="OpenScience",]
AuthorTable1 <- distinct(AuthorTable1, AuthorIdsOrder, .keep_all = TRUE)
AuthorTable2 <- distinct(AuthorTable2, AuthorIdsOrder, .keep_all = TRUE)
nrow(AuthorTable1)
nrow(AuthorTable2)

Parsed with column specification:
cols(
  .default = col_integer(),
  Tag = col_character(),
  PaperId = col_double(),
  AuthorIdsOrder = col_character(),
  AuthorNamesOrder = col_character(),
  femaleProp = col_double(),
  FoSNames = col_character(),
  DocType = col_character(),
  Journal = col_character(),
  Publisher = col_character(),
  Doi = col_character(),
  Title = col_character(),
  IndexedAbstract = col_character()
)
See spec(...) for full column specifications.
“Too few values at 2926 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...”

In [64]:
getName <- function(df) {
  # check if df has more then 1 row:
  if (nrow(df)>1) { message("!! nrow(df)>1 !!");  return(NA) }
  colName <- paste('name.', df$authorOrder, sep="")
  return(df %>% select(colName))
}
AuthorTable1$nameDistinct <- sapply(split((AuthorTable1), seq(nrow(AuthorTable1))), getName)
AuthorTable2$nameDistinct <- sapply(split((AuthorTable2), seq(nrow(AuthorTable2))), getName)

In [67]:
AuthorTable1$Year2 <-  AuthorTable1$Year-45 #35-55
AuthorTable2$Year2 <-  AuthorTable2$Year-45 #35-55
myvars <- c("AuthorIdsOrder", "nameDistinct", "Year2", "PaperId")
data1 <- AuthorTable1[myvars]
data2 <- AuthorTable2[myvars]

AuthorList1 <- data.frame(lapply(AuthorTable1, trimws), stringsAsFactors = FALSE)
nameTable1 <- AuthorList1 %>% separate(nameDistinct, into = sprintf('%s.%s', rep('namePart',4), rep(1:4)), sep = " ")

AuthorList2 <- data.frame(lapply(AuthorTable2, trimws), stringsAsFactors = FALSE)
nameTable2 <- AuthorList2 %>% separate(nameDistinct, into = sprintf('%s.%s', rep('namePart',4), rep(1:4)), sep = " ")

“Too few values at 8406 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...”

In [71]:
nrow(nameTable1)

In [78]:
newvars <- c("AuthorIdsOrder", "PaperId", "namePart.1", "namePart.2", "namePart.3", "namePart.4", "Year2", "authorOrder")
nameParts <- nameTable1[newvars]
nameParts$namePart.1 <- gsub("\"", "", nameParts$namePart.1)
nameParts$namePart.2 <- gsub("\"", "", nameParts$namePart.2)
nameParts$namePart.3 <- gsub("\"", "", nameParts$namePart.3)
nameParts$namePart.4 <- gsub("\"", "", nameParts$namePart.4)
nameParts$min_years <- as.numeric(nameParts$Year2)-20
nameParts$max_years <- as.numeric(nameParts$Year2)+20
results <- gender_df(nameParts, name_col = "namePart.1", year_col = c("min_years", "max_years"), method = "ssa")
output <- nameParts %>% left_join(results, by = c("namePart.1" = "name", "min_years" = "year_min"))
results <- gender_df(nameParts, name_col = "namePart.2", year_col = c("min_years", "max_years"), method = "ssa")
output <- output %>% left_join(results, by = c("namePart.2" = "name", "min_years" = "year_min"))
results <- gender_df(nameParts, name_col = "namePart.3", year_col = c("min_years", "max_years"), method = "ssa")
output <- output %>% left_join(results, by = c("namePart.3" = "name", "min_years" = "year_min"))
output <- within(output, #if namepart3 is the last name, set it NA 
   temp2 <- ifelse(is.na(namePart.4),NA,proportion_female)
)
output <- within(output, #if namepart2 is the last name, set it NA 
   temp1 <- ifelse(is.na(namePart.3),NA,proportion_female.y)
)
output <- within(output, #if namepart1 is the last name, set it NA 
   temp0 <- ifelse(is.na(namePart.2),NA,proportion_female.x)
)
output <- within(output, # namepart2 merged with namepart3
   femaleProb0 <- ifelse(is.na(temp1),temp2,temp1)
)
output <- within(output, # namepart1 merged with namepart2
   femaleProb <- ifelse(is.na(temp0),temp1,temp0)
)
output <- within(output, # binarize the probabilities, with 1 representing female and 0 male
   genderLabel <- ifelse(femaleProb>0.5,1,0)
)

output %>% 
  group_by(genderLabel) %>%
  summarise(no_rows = length(genderLabel))

genderLabel,no_rows
0.0,1669
1.0,916
,572


In [79]:
newvars <- c("AuthorIdsOrder", "PaperId","namePart.1", "namePart.2", "namePart.3", "namePart.4", "Year2", "authorOrder")
nameParts <- nameTable2[newvars]
nameParts$namePart.1 <- gsub("\"", "", nameParts$namePart.1)
nameParts$namePart.2 <- gsub("\"", "", nameParts$namePart.2)
nameParts$namePart.3 <- gsub("\"", "", nameParts$namePart.3)
nameParts$namePart.4 <- gsub("\"", "", nameParts$namePart.4)
nameParts$min_years <- as.numeric(nameParts$Year2)-20
nameParts$max_years <- as.numeric(nameParts$Year2)+20
results <- gender_df(nameParts, name_col = "namePart.1", year_col = c("min_years", "max_years"), method = "ssa")
output <- nameParts %>% left_join(results, by = c("namePart.1" = "name", "min_years" = "year_min"))
results <- gender_df(nameParts, name_col = "namePart.2", year_col = c("min_years", "max_years"), method = "ssa")
output <- output %>% left_join(results, by = c("namePart.2" = "name", "min_years" = "year_min"))
results <- gender_df(nameParts, name_col = "namePart.3", year_col = c("min_years", "max_years"), method = "ssa")
output <- output %>% left_join(results, by = c("namePart.3" = "name", "min_years" = "year_min"))
output <- within(output, #if namepart3 is the last name, set it NA 
   temp2 <- ifelse(is.na(namePart.4),NA,proportion_female)
)
output <- within(output, #if namepart2 is the last name, set it NA 
   temp1 <- ifelse(is.na(namePart.3),NA,proportion_female.y)
)
output <- within(output, #if namepart1 is the last name, set it NA 
   temp0 <- ifelse(is.na(namePart.2),NA,proportion_female.x)
)
output <- within(output, # namepart2 merged with namepart3
   femaleProb0 <- ifelse(is.na(temp1),temp2,temp1)
)
output <- within(output, # namepart1 merged with namepart2
   femaleProb <- ifelse(is.na(temp0),temp1,temp0)
)
output <- within(output, # binarize the probabilities, with 1 representing female and 0 male
   genderLabel <- ifelse(femaleProb>0.5,1,0)
)
output %>% 
  group_by(genderLabel) %>%
  summarise(no_rows = length(genderLabel))

genderLabel,no_rows
0.0,3705
1.0,1891
,3170


In [80]:
write_csv(output, "authorList.csv")

In [81]:
head(output)

AuthorIdsOrder,PaperId,namePart.1,namePart.2,namePart.3,namePart.4,Year2,authorOrder,min_years,max_years,⋯,proportion_male,proportion_female,gender,year_max,temp2,temp1,temp0,femaleProb0,femaleProb,genderLabel
2678954453,2155922502,Francis,Beauvais,,,1969,1,1949,1989,⋯,,,,,,,0.0823,,0.0823,0.0
2527835356,2324727210,Takashi,Hoshino,,,1967,1,1947,1987,⋯,,,,,,,0.0,,0.0,0.0
2740812319,2740582100,P.,Van,Brussel,,1972,1,1952,1992,⋯,,,,,,0.0772,,0.0772,0.0772,0.0
2112481453,2740654812,Khatera,Ibrahimi,,,1972,1,1952,1992,⋯,,,,,,,,,,
2772520160,2771908588,L.,Al-Mouazzen,,,1972,1,1952,1992,⋯,,,,,,,,,,
2780837869,2406493898,Monya,Baker,,,1971,1,1951,1991,⋯,,,,,,,1.0,,1.0,1.0
