# Overview:
 - we processed the data to have the same measure for all studies 
 - we chose OR as the common measure for all studies
 - we also chose to use the 95% CI for all studies

# Steps:
1. read data
2. cleaned up file to keep only the numbers
   * removed all the special formatting
   * changed column names to simplify
3. converted all the measures to OR
   * clamped the ORs to lie between `0.1` and `10`, because the log OR is undefined for values less than or equal to 0
   * aka: if there were any numbers less than `0.1`, they were given the value `0.1`
   * aka: if there were any numbers greater than `10`, they were given the value `10`
   The clamping is also needed because the `Crude IRR` conversion can produce infinity
   (a measure of exactly 1 makes the denominator 0, and an infinite OR is not useful for us)
      ```R
      if (measure_name == "Crude IRR"){
         OR <- measure / (1 - measure)
      }
      ```

In [6]:
# Function to convert measures to OR
#
# Converts the effect measure of a single study row to an odds ratio (OR)
# and clamps the result to the range [0.1, 10].
#
# row: a named vector, list or one-row data frame with the elements
#      `Measure_Name`, `Measure_Value`, `N1_Cancer` and `N2_No_Cancer`.
#      Numeric fields may be given as strings; they are parsed with
#      as.numeric().
#
# Returns a single numeric value between 0.1 and 10, or NA_real_ when the OR
# cannot be computed (missing Measure_Value, or a 0/0 count ratio for "IR").
# Stops with an error when Measure_Name is not one of the supported measures.
convert_to_or <- function(row) {

  measure_name <- row[["Measure_Name"]]
  measure <- as.numeric(row[["Measure_Value"]])
  n1 <- as.numeric(row[["N1_Cancer"]])
  n2 <- as.numeric(row[["N2_No_Cancer"]])

  # Measures whose value is used as the OR without any transformation.
  # TODO: two SIR values are 0, please fix the upstream data
  passthrough_measures <- c("OR", "Crude OR", "Adjusted OR", "SIR", "RR", "HR")

  # `%in%` never yields NA, so a missing Measure_Name falls through to the
  # explicit error below instead of crashing inside `if`.
  if (measure_name %in% passthrough_measures) {
    or_value <- measure
  } else if (measure_name %in% "RD") {
    or_value <- exp(measure)
  } else if (measure_name %in% "Crude IRR") {
    # TODO: this is prob wrong, all output values hit min limit of 0.1
    or_value <- measure / (1 - measure)
  } else if (measure_name %in% "IR") {
    # TODO: this is prob wrong, all output values hit min limit of 0.1
    count_ratio <- n1 / n2
    or_value <- count_ratio / (1 - count_ratio)
  } else {
    # Without this branch the result would be undefined (or silently taken
    # from a global variable of the same name).
    stop("Unsupported Measure_Name: ", measure_name, call. = FALSE)
  }

  # NA / NaN cannot be compared in `if`; report them as missing instead.
  if (is.na(or_value)) {
    return(NA_real_)
  }

  # Clamp to [0.1, 10]; this also maps -Inf / Inf onto the limits.
  min(max(or_value, 0.1), 10)
}

In [7]:
# load the raw study table
data <- read.csv("meta-analysis-data.csv")

# numeric cohort size: drop the thousands separators, then parse
data[["n"]] <- as.numeric(gsub(",", "", data[["Total_Cohort"]]))

# make sure the measure value is numeric
data[["Measure_Value"]] <- as.numeric(data[["Measure_Value"]])

# cancer counts: parse as numeric, missing counts become 0
data[["N1_Cancer"]] <- as.numeric(data[["N1_Cancer"]])
data[["N1_Cancer"]] <- replace(
  data[["N1_Cancer"]], is.na(data[["N1_Cancer"]]), 0
)

# no-cancer counts: parse as numeric, missing counts become 0
data[["N2_No_Cancer"]] <- as.numeric(data[["N2_No_Cancer"]])
data[["N2_No_Cancer"]] <- replace(
  data[["N2_No_Cancer"]], is.na(data[["N2_No_Cancer"]]), 0
)

# keep only the rows where both confidence interval bounds are present
data <- data[!(is.na(data[["X95_CI_lower"]]) | is.na(data[["X95_CI_upper"]])), ]

# preview the first six rows
head(data, n = 6)

Unnamed: 0_level_0,Article_ID,Total_Cohort,Measure_Name,Measure_Value,X95_CI_lower,X95_CI_upper,N1_Cancer,N2_No_Cancer,Cancer.Type,n
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
1,"Di Luccia, 2010",1216,OR,3.04,1.22,7.57,0,0,Melanoma,1216
2,"Brewster, 2010",77518,SIR,1.4,0.17,5.04,0,0,Melanoma,77518
3,"Brewster, 2010",77518,SIR,0.0,0.0,3.11,0,0,Basal Cell Carcinoma,77518
4,"Cnattingius, 1995",89,OR,7.5,1.8,31.9,0,0,Myeloid Leukemia,89
5,"Bugaiaki-Shaked, 2022",342172,HR,1.33,0.95,1.84,0,0,Total Malignant Morbidity,342172
6,"Bugaiaki-Shaked, 2022",342172,HR,1.09,0.92,1.3,0,0,Total Bengin Tumors,342172


In [8]:
# convert to OR, one study (row) at a time
# Rows are passed as one-row data frames rather than through apply():
# apply() would first coerce the whole data frame to a character matrix,
# which formats (and can round) the numeric columns before they are parsed
# back with as.numeric().
data$OR <- vapply(
  seq_len(nrow(data)),
  function(i) convert_to_or(data[i, ]),
  numeric(1)
)

# show the first few rows
head(data)

Unnamed: 0_level_0,Article_ID,Total_Cohort,Measure_Name,Measure_Value,X95_CI_lower,X95_CI_upper,N1_Cancer,N2_No_Cancer,Cancer.Type,n,OR
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
1,"Di Luccia, 2010",1216,OR,3.04,1.22,7.57,0,0,Melanoma,1216,3.04
2,"Brewster, 2010",77518,SIR,1.4,0.17,5.04,0,0,Melanoma,77518,1.4
3,"Brewster, 2010",77518,SIR,0.0,0.0,3.11,0,0,Basal Cell Carcinoma,77518,0.1
4,"Cnattingius, 1995",89,OR,7.5,1.8,31.9,0,0,Myeloid Leukemia,89,7.5
5,"Bugaiaki-Shaked, 2022",342172,HR,1.33,0.95,1.84,0,0,Total Malignant Morbidity,342172,1.33
6,"Bugaiaki-Shaked, 2022",342172,HR,1.09,0.92,1.3,0,0,Total Bengin Tumors,342172,1.09


In [9]:
# The statements below add simplified column names as copies; the original
# columns are kept in the data frame.

# study_id: copy of Article_ID
data[["study_id"]] <- data[["Article_ID"]]

# odds_ratio: copy of OR
data[["odds_ratio"]] <- data[["OR"]]

# lower_ci: copy of X95_CI_lower
data[["lower_ci"]] <- data[["X95_CI_lower"]]

# upper_ci: copy of X95_CI_upper
data[["upper_ci"]] <- data[["X95_CI_upper"]]

# article_id: second copy of Article_ID under a lower-case name
data[["article_id"]] <- data[["Article_ID"]]

# cancer_type: copy of Cancer.Type
data[["cancer_type"]] <- data[["Cancer.Type"]]

# preview the first six rows
head(data, n = 6)

Unnamed: 0_level_0,Article_ID,Total_Cohort,Measure_Name,Measure_Value,X95_CI_lower,X95_CI_upper,N1_Cancer,N2_No_Cancer,Cancer.Type,n,OR,study_id,odds_ratio,lower_ci,upper_ci,article_id,cancer_type
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
1,"Di Luccia, 2010",1216,OR,3.04,1.22,7.57,0,0,Melanoma,1216,3.04,"Di Luccia, 2010",3.04,1.22,7.57,"Di Luccia, 2010",Melanoma
2,"Brewster, 2010",77518,SIR,1.4,0.17,5.04,0,0,Melanoma,77518,1.4,"Brewster, 2010",1.4,0.17,5.04,"Brewster, 2010",Melanoma
3,"Brewster, 2010",77518,SIR,0.0,0.0,3.11,0,0,Basal Cell Carcinoma,77518,0.1,"Brewster, 2010",0.1,0.0,3.11,"Brewster, 2010",Basal Cell Carcinoma
4,"Cnattingius, 1995",89,OR,7.5,1.8,31.9,0,0,Myeloid Leukemia,89,7.5,"Cnattingius, 1995",7.5,1.8,31.9,"Cnattingius, 1995",Myeloid Leukemia
5,"Bugaiaki-Shaked, 2022",342172,HR,1.33,0.95,1.84,0,0,Total Malignant Morbidity,342172,1.33,"Bugaiaki-Shaked, 2022",1.33,0.95,1.84,"Bugaiaki-Shaked, 2022",Total Malignant Morbidity
6,"Bugaiaki-Shaked, 2022",342172,HR,1.09,0.92,1.3,0,0,Total Bengin Tumors,342172,1.09,"Bugaiaki-Shaked, 2022",1.09,0.92,1.3,"Bugaiaki-Shaked, 2022",Total Bengin Tumors


In [10]:
# write the processed table (original and alias columns) to a new csv file
write.csv(x = data, file = "processed-data.csv")