In [5]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [46]:
#@title Concatenating input files into one tab-separated-values file (.tsv, runs in about 5-10 minutes)

%%bash

dataset="Training_SetA"
folder="gdrive/MyDrive/Etudes/GENIOMHE/M2_EFG/Machine_Learning/${dataset}/"

# This block removes potential duplicate input files in the Google Drive folder
ls $folder | grep '\ (1).psv' > ${folder}twice_aint_nice.txt
while IFS="" read -r p || [ -n "$p" ]
do
  rm "${folder}${p}"
  echo "Duplicate file removed:" ${folder}${p} 
done < ${folder}twice_aint_nice.txt

psv_list=$(ls ${folder}p*.psv)
psv_count=$(ls -alF ${folder}p*.psv | wc -l | cut -d ' ' -f 1)
echo $psv_list | cut -d ' ' -f 1-5 | tr " " "\n"

psv="${folder}All_patients_${dataset}.psv"
tsv="${folder}All_patients_${dataset}.tsv"
touch $psv
line_count="${folder}All_patients_${dataset}.lines"
touch $line_count

for file in ${psv_list}
do
cat ${file} >> $psv
wc -l $file | cut -d ' ' -f 1 >> $line_count
done

head -n 5 $psv

# Processing to remove header (Starts with HR) and switch separator from pipe to tabulation.
cat $psv | grep --invert-match -E 'HR' | tr "|" "\t" > $tsv

printf "\n"
head -n 5 $tsv

echo "Number of lines in .tsv:"
wc -l $tsv
echo "Number of files (= Number of 'header' lines removed):"
echo "${psv_count}"
echo "Sum of all lines:"
awk '{ sum += $1 } END { print sum }' $line_count
wc -l $psv_list | tail -n 1 # Retrieves only the last line of the word count: total

rm $psv $line_count

gdrive/MyDrive/Etudes/GENIOMHE/M2_EFG/Machine_Learning/Training_SetA/p000002.psv
gdrive/MyDrive/Etudes/GENIOMHE/M2_EFG/Machine_Learning/Training_SetA/p000003.psv
gdrive/MyDrive/Etudes/GENIOMHE/M2_EFG/Machine_Learning/Training_SetA/p000004.psv
gdrive/MyDrive/Etudes/GENIOMHE/M2_EFG/Machine_Learning/Training_SetA/p000005.psv
gdrive/MyDrive/Etudes/GENIOMHE/M2_EFG/Machine_Learning/Training_SetA/p000006.psv
HR|O2Sat|Temp|SBP|MAP|DBP|Resp|EtCO2|BaseExcess|HCO3|FiO2|pH|PaCO2|SaO2|AST|BUN|Alkalinephos|Calcium|Chloride|Creatinine|Bilirubin_direct|Glucose|Lactate|Magnesium|Phosphate|Potassium|Bilirubin_total|TroponinI|Hct|Hgb|PTT|WBC|Fibrinogen|Platelets|Age|Gender|Unit1|Unit2|HospAdmTime|ICULOS|SepsisLabel
NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|75.91|0|0|1|-98.6|1|0
61|99|36.44|124|65|43|17.5|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|NaN|7

In [None]:
#@title Testing length of all files against merged file # Do not execute - WIP

%%bash

## Doesn't work yet :(

date

# This script is to check proper function of earlier script

dataset="Training_SetA"
folder="drive/MyDrive/Etudes/GENIOMHE/M2_EFG/Machine_Learning/${dataset}/"

tsv="${folder}All_patients_${dataset}.tsv"

linecount=0
filecount=0

for i in $(ls ${folder}/*.psv)
do
  lines= $(wc -l $i | cut -d' ' -f1)
  # linecount= $((linecount + lines))
  # filecount= $((filecount + 1))
  echo $lines
done

total= $((linecount - filecount))

printf "All patients line:\n"
wc -l $tsv | cut -d' ' -f1

printf "Sum of counts:\n"
printf $linecount
printf "- \n"
printf $filecount
printf "= \n"
printf $total

printf "\n"
date

Process is interrupted.


In [48]:
#@title Getting medians from full datasets

%%bash

# This script is to compute median of each column from every patient's data

dataset="Training_SetA"
folder="gdrive/MyDrive/Etudes/GENIOMHE/M2_EFG/Machine_Learning/${dataset}/"

tsv="${folder}All_patients_${dataset}.tsv"

median_file="${folder}All_patients_${dataset}.median.txt"
log_file="${folder}All_patients_${dataset}.log"

n_columns=$(awk '{print NF}' $tsv | sort -nu | tail -n 1)

printf "Count of columns: $n_columns\n" > $log_file
printf "Count of lines: $(wc -l $tsv | cut -d' ' -f1)\n\n" >> $log_file


for i in $(seq $n_columns)
do
  cut -d"	" -f ${i} $tsv > tmp
  grep --invert-match -E 'NaN' tmp > tmp1 # To remove "NaN" lines in each column
  if [ -s tmp1 ]; then
        # The file is not-empty.
  
  # Computes medians

  sort -n tmp1 | awk ' { a[i++]=$1; } \
     END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }' >> $median_file
  
  nonempty=$(wc -l tmp1 | cut -d' ' -f1)
  printf "Column $i has $nonempty non-empty values\n" >> $log_file
  
  median_col=$(tail -n 1 $median_file)
  printf "Median value: ${median_col}\n\n" >> $log_file
  
  else
        # The file is empty.
  printf "Column $i is empty for all patients and should be removed from all patients\n\n" >> $log_file
  fi
done

printf "Medians (file called All_patients_${dataset}.median.txt): \n"
cat $median_file

# From https://stackoverflow.com/questions/6166375/median-of-column-with-awk

# Approximate median column 1
# sort -n file | awk ' { a[i++]=$1; } END { print a[int(i/2)]; }'

# True median column 1
# sort -n file | awk ' { a[i++]=$1; }
#    END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'

Medians (file called All_patients_Training_SetA.median): 
84
98
37.06
118
77
58.5
18
0
24
0.5
7.38
40
97
57
18
79
8.3
106
0.9
1.4
124
1.8
2
3.4
4.1
0.9
4.25
30.2
10.4
32.4
10.8
248
181
65.27
1
1
0
-2.54
21
0
84
98
37.06
118
77
58.5
18
0
24
0.5
7.38
40
97
57
18
79
8.3
106
0.9
1.4
124
1.8
2
3.4
4.1
0.9
4.25
30.2
10.4
32.4
10.8
248
181
65.27
1
1
0
-2.54
21
0


rm: cannot remove 'output': No such file or directory
rm: cannot remove 'output_bis': No such file or directory
