This notebook uses the R "gender" package (SSA and IPUMS name data) to benchmark name-based gender prediction on the datasets listed below. To run it, you need to set the following variables: <br/>
1. dataset: can be "imdb", "wiki", "scholar" or "twitter"
2. path_to_data: path to the dataset archive (a .zip file, or a .tar.gz file for "wiki")
3. path_to_output: path where the output folder will be created

In [None]:
# install.packages("dplyr")
# install.packages("tidyverse")
# install.packages("gender")
# install.packages("R.matlab")
library(gender)
library(tidyverse)
library(R.matlab)

In [None]:
# Benchmark configuration: edit these three values before running the cells below.
dataset <- "wiki" # one of "imdb", "wiki", "scholar" or "twitter"
path_to_data <- "wiki.tar.gz" # path to the dataset archive (untar() is used for "wiki", unzip() for "scholar" and "twitter")
path_to_output <- "wiki/" # folder the archive is extracted into; it is concatenated with file names as-is, so keep the trailing slash

In [None]:
# Extract the first name from one row of the scholar data.
#
# data:       a single row, as passed by apply(df, 1, ...); its "V1" entry
#             holds the full name with "+" separating the name parts.
# name_clean: unused; kept only so the signature stays compatible with
#             existing callers.
#
# Returns the first whitespace-separated token of the name as an unnamed
# character scalar, or NA when the name is empty or missing.
name_scholar <- function(data, name_clean) {
    full_name <- as.character(data[["V1"]])
    # "+" is the separator used in the scholar files; treat it like a space.
    full_name <- chartr(old = "+", new = " ", full_name)
    # Trim first so a leading separator does not produce an empty first token.
    tokens <- strsplit(trimws(full_name), "\\s+")[[1]]
    # Indexing an empty token vector yields NA, which marks "no first name".
    tokens[1]
}

# Extract the first name from one row of the twitter data.
#
# data:       a single row, as passed by apply(df, 1, ...); its "Name" entry
#             holds the full display name.
# name_clean: unused; kept only so the signature stays compatible with
#             existing callers.
#
# Returns the first whitespace-separated token of the name as an unnamed
# character scalar, or NA when the name is empty or missing.
name_twitter <- function(data, name_clean) {
    full_name <- as.character(data[["Name"]])
    # Trim first so leading whitespace does not produce an empty first token.
    tokens <- strsplit(trimws(full_name), "\\s+")[[1]]
    # Indexing an empty token vector yields NA, which marks "no first name".
    tokens[1]
}

# Map each first name to a binary gender code using the "gender" package with
# method "ssa" and year 2012: 1 = male, 0 = female, NaN = no prediction
# returned for that name. Names are looked up one at a time.
predict_gender_ssa <- function(first_names) {
    vapply(first_names, function(first_name) {
        label <- gender(first_name, years = 2012, method = "ssa")$gender
        if (length(label) == 0) {
            NaN # the lookup returned no row for this name
        } else if (label[1] == "male") {
            1
        } else {
            0
        }
    }, numeric(1), USE.NAMES = FALSE)
}

# Build `data_merged` for the selected dataset. For "scholar" and "twitter" it
# ends up with a true `gender` column (0 = female, 1 = male) and a
# `predicted_gender` column (0, 1 or NaN).
if (dataset == "scholar") {
    unzip(path_to_data, exdir = path_to_output)
    data_f <- read.csv(file = paste0(path_to_output, "female_name_url.csv"), header = FALSE)
    data_f$gender <- 0 # female
    data_m <- read.csv(file = paste0(path_to_output, "male_name_url.csv"), header = FALSE)
    data_m$gender <- 1 # male
    data_merged <- rbind(data_f[c("V1", "gender")], data_m[c("V1", "gender")])

    data_merged$names <- apply(data_merged, 1, name_scholar) # extracting first names
    data_merged$predicted_gender <- predict_gender_ssa(data_merged$names)
} else if (dataset == "imdb") {
    # Not implemented: no data is loaded for this dataset, so `data_merged`
    # is not created here.
} else if (dataset == "wiki") {
    # Only extracts the archive; `data_merged` is not created here.
    untar(path_to_data, exdir = path_to_output)
} else { # twitter
    unzip(path_to_data, exdir = path_to_output)

    # The code expects the archive to unpack into a folder named after the
    # archive file minus its 4-character extension (e.g. ".zip"). basename()
    # keeps this working when path_to_data contains a directory component.
    archive_dir <- paste0(path_to_output, gsub(".{4}$", "", basename(path_to_data)))
    data <- read.csv(file = paste0(archive_dir, "/Twitter_names.csv"), stringsAsFactors = FALSE)

    path_to_meta <- paste0(archive_dir, "/_a_results32langs")
    unzip(paste0(path_to_meta, ".zip"), exdir = paste0(path_to_meta, "/"))
    # `pattern` is a regular expression, not a glob.
    files <- list.files(path = path_to_meta, pattern = "\\.csv$", full.names = TRUE, recursive = FALSE)
    if (length(files) == 0) {
        stop("No metadata CSV files found in ", path_to_meta, call. = FALSE)
    }

    # Read every metadata file once and bind them in a single step instead of
    # growing a data frame inside a loop.
    meta_df <- do.call(rbind, lapply(files, function(meta_file) {
        read.csv(meta_file, header = TRUE, stringsAsFactors = FALSE)[c("indicated_gender", "temp_file")]
    }))

    # merging dataframes with gender and names
    data_merged <- merge(data[c("hash", "Name")], meta_df, by.x = "hash", by.y = "temp_file")
    data_merged <- data_merged[!duplicated(data_merged$hash), ]
    data_merged$first_names <- apply(data_merged, 1, name_twitter) # extracting first names

    # Encode the true label as 0 = female, 1 = male and keep only rows with a
    # binary label. %in% never returns NA, so rows whose label is missing are
    # dropped instead of turning into all-NA rows.
    true_gender <- data_merged$indicated_gender
    true_gender[true_gender %in% "female"] <- 0
    true_gender[true_gender %in% "male"] <- 1
    has_binary_label <- true_gender %in% c(0, 1)
    data_merged$indicated_gender <- true_gender
    data_merged <- data_merged[has_binary_label, ]
    # Store the label as numeric, matching the scholar branch.
    data_merged$indicated_gender <- as.numeric(data_merged$indicated_gender)
    names(data_merged)[names(data_merged) == "indicated_gender"] <- "gender" # renaming the true gender

    data_merged$predicted_gender <- predict_gender_ssa(data_merged$first_names)
}

In [None]:
# Install R.matlab only when it is missing, so re-running this cell does not
# reinstall the package every time.
if (!requireNamespace("R.matlab", quietly = TRUE)) {
    install.packages("R.matlab")
}
library(R.matlab)
# Read the MATLAB file from the output folder; the value is not stored, only
# displayed as the cell output.
readMat(paste0(path_to_output, "wiki/wiki.mat"))

In [None]:
# Preview the first rows of the merged data frame.
head(data_merged)

In [None]:
#calculating all the metrics

# Compute coverage and per-class / weighted-average precision, recall and F1.
#
# data_merged: data frame with a true `gender` column and a `predicted_gender`
#              column, both encoded as 0 = female, 1 = male. Rows containing
#              any missing value (including NaN predictions) are excluded
#              from scoring.
#
# Prints the coverage (share of rows without missing values) and returns a
# list with elements "female", "male" and "average", each a list holding
# Precision, Recall and F1. The average weights each class by its share of
# the scored rows. A metric is NaN when its denominator is zero (for example,
# a class that is never predicted).
calcScores <- function(data_merged) {
    df <- data_merged[complete.cases(data_merged), ]
    coverage <- nrow(df) / nrow(data_merged)
    print(paste("Coverage: ", coverage))

    scores <- list()
    avg_pr <- 0
    avg_r <- 0
    avg_f1 <- 0
    for (i in seq(0, 1)) {
        is_predicted <- df$predicted_gender == i
        is_actual <- df$gender == i

        tp <- sum(is_predicted & is_actual)
        fp <- sum(is_predicted & !is_actual)
        fn <- sum(!is_predicted & is_actual)

        precision <- tp / (tp + fp)
        recall <- tp / (tp + fn)
        f1 <- (2 * precision * recall) / (precision + recall)

        # Weight of this class = its share among the scored rows.
        class_share <- sum(is_actual) / nrow(df)
        avg_pr <- avg_pr + class_share * precision
        avg_r <- avg_r + class_share * recall
        avg_f1 <- avg_f1 + class_share * f1

        # The notebook encodes 0 = female and 1 = male, so the label must
        # follow that encoding.
        class_label <- if (i == 0) "female" else "male"
        scores[[class_label]] <- list(Precision = precision, Recall = recall, F1 = f1)
    }
    scores[["average"]] <- list(Precision = avg_pr, Recall = avg_r, F1 = avg_f1)

    scores
}

# Score the merged data, then flatten the nested result list into a single
# named vector for display.
result = calcScores(data_merged)
unlist(result)