In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

import seaborn as sb
sb.set_style('darkgrid')

from glob import glob
import matplotlib.pyplot as plt
import os
import random
import cv2

import gzip
import nibabel as nb

In [2]:
MriFiles = glob('/Volumes/DISK_IMG/10*/*T1w.nii.gz')
SosFiles = glob('/Volumes/DISK_IMG/10*/*Vp.nii.gz')

MriIm = [] # array of full images containing header and data of MRIs
MriHdr = [] # array of only the headers of MRIs
MriData = [] # array of only the data of MRIs

# similarly, for speed of sound maps
SosIm = []
SosHdr = []
SosData = []
for i in range(len(MriFiles)):
    MriIm.append(nb.load(MriFiles[i]))
    MriHdr.append(MriIm[i].header)
    MriData.append(MriIm[i].get_fdata())

    SosIm.append(nb.load(SosFiles[i]))
    SosHdr.append(SosIm[i].header)
    SosData.append(SosIm[i].get_fdata())

In [3]:
# put all the scans of all MRIs as members of a dataframe so that each cell of each row is a whole image
MriDf = pd.DataFrame()
for scan in range(len(MriData)):
    Mri = MriData[scan]
    for slice in range(len(Mri[1,1,:])):
        temp = {'img' : [Mri[:,:,slice]]}
        tempDf = pd.DataFrame.from_records(temp)
        MriDf = MriDf.append(tempDf)
print(MriDf.shape[0])

1600


In [4]:
# same for SoS
SosDf = pd.DataFrame()
for scan in range(len(SosData)):
    Sos = SosData[scan]
    for slice in range(len(Sos[1,1,:])):
        temp = {'img' : [Sos[:,:,slice]]}
        tempDf = pd.DataFrame.from_records(temp)
        SosDf = SosDf.append(tempDf)
print(SosDf.shape[0])

1600


In [5]:
# must vectorize each cell of the df (each (n,m) matrix) so structure is:
# X has form (n.m, 2) (col of 1s and col of values)
# y has form (n.m, 1)
vec_MriDf = pd.DataFrame()
for row in range(MriDf.shape[0]):
    temp = pd.DataFrame({'' : np.array([MriDf.iloc[row][0].flatten()]).tolist()})
    vec_MriDf = pd.concat([vec_MriDf,temp])


In [6]:
# same for Sos
vec_SosDf = pd.DataFrame()
for row in range(SosDf.shape[0]):
    temp = pd.DataFrame({'' : np.array([SosDf.iloc[row][0].flatten()]).tolist()})
    vec_SosDf = pd.concat([vec_SosDf,temp])

In [7]:
vec_MriDf.head()

Unnamed: 0,Unnamed: 1
0,"[32.05524826049805, 72.65767669677734, 19.2325..."
0,"[32.053810119628906, 45.943233489990234, 26.71..."
0,"[38.461997985839844, 54.48716354370117, 16.025..."
0,"[19.22929573059082, 53.4140510559082, 41.66219..."
0,"[11.749947547912598, 24.567768096923828, 25.63..."


In [8]:
# we need to make a super dataframe with both cos we'll separate them to train the model but we want them to be divided the same way
# eg. if MRI#2 is in training, SoS#2 also has to be there
df = pd.concat([vec_MriDf, vec_SosDf], axis=1)
df.columns = ['MRI','SoS']
df.head()

Unnamed: 0,MRI,SoS
0,"[32.05524826049805, 72.65767669677734, 19.2325...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."
0,"[32.053810119628906, 45.943233489990234, 26.71...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."
0,"[38.461997985839844, 54.48716354370117, 16.025...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."
0,"[19.22929573059082, 53.4140510559082, 41.66219...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."
0,"[11.749947547912598, 24.567768096923828, 25.63...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."


In [9]:
# lets do 70% train, 15% test, 15% eval
df_train = pd.DataFrame()
train_indices = []
for i in range(int(0.7*df.shape[0])):
    # get 70% of the set for training from the whole thing randomly
    r = random.randint(0, df.shape[0]-1) 
    train_indices.append(r)
    df_train = df_train.append(df.iloc[r])
print(df_train.shape)
df_train.head()

(1120, 2)


Unnamed: 0,MRI,SoS
0,"[27.58697509765625, 29.868118286132812, 39.023...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."
0,"[7.938844680786133, 49.884742736816406, 33.995...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."
0,"[24.312788009643555, 18.2200984954834, 36.3968...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."
0,"[47.7351188659668, 50.1462287902832, 63.530162...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."
0,"[7.427075386047363, 39.602210998535156, 42.063...","[1480.0, 1480.0, 1480.0, 1480.0, 1480.0, 1480...."


In [10]:
# get the stuff that isnt in train
# dataframe1[~dataframe1.column1.isin(common.column1)]
df_test = pd.DataFrame()
test_indices = []
while df_test.shape[0] < 0.15*df.shape[0]:
    r = random.randint(0, df.shape[0]-1)

    if(r not in train_indices): 
        test_indices.append(r)
        df_test = df_test.append(df.iloc[r])
print("Number of test images: " + str(df_test.shape[0]))

df_eval = pd.DataFrame()
# for i in range(int(0.15*df.shape[0])):
while df_eval.shape[0] < 0.15*df.shape[0]:
    r = random.randint(0, df.shape[0]-1)

    if(r not in train_indices) and (r not in test_indices) :
        df_eval = df_eval.append(df.iloc[r])
print("Number of eval images: " + str(df_eval.shape[0]))


Number of test images: 240
Number of eval images: 240


In [38]:
df_train_X = df_train['MRI'].to_frame('MRI')
df_train_y = df_train['SoS'].to_frame('SoS')

In [42]:
# df_train_X.insert(0,'',1)
df_train_X.head()

Unnamed: 0,MRI
0,"[27.58697509765625, 29.868118286132812, 39.023..."
0,"[7.938844680786133, 49.884742736816406, 33.995..."
0,"[24.312788009643555, 18.2200984954834, 36.3968..."
0,"[47.7351188659668, 50.1462287902832, 63.530162..."
0,"[7.427075386047363, 39.602210998535156, 42.063..."


In [31]:
mat_length = np.zeros([df_train_X.shape[0],1])
for row in range(df_train_X.shape[0]):
    mat_length[row] = len(df_train_X.iloc[row][1])

for i in range(len(mat_length)):
    if mat_length[i]!= 81920:
        print(i)


In [39]:
"""
LinearRegression does not support list as a feature. I saw you're using one-hot, and you can use each dimension as a column of features
https://stackoverflow.com/questions/53742832/is-there-a-way-to-use-lists-as-values-in-a-dataframe

"""

(1120, 1)

In [62]:
df_train_X['MRI']

0    [27.58697509765625, 29.868118286132812, 39.023...
0    [7.938844680786133, 49.884742736816406, 33.995...
0    [24.312788009643555, 18.2200984954834, 36.3968...
0    [47.7351188659668, 50.1462287902832, 63.530162...
0    [7.427075386047363, 39.602210998535156, 42.063...
                           ...                        
0    [38.200435638427734, 23.862871170043945, 28.61...
0    [39.42176055908203, 52.90105438232422, 78.7069...
0    [37.5607795715332, 44.79332733154297, 29.01894...
0    [48.72303009033203, 50.987091064453125, 49.850...
0    [38.76922607421875, 38.75668716430664, 12.4961...
Name: MRI, Length: 1120, dtype: object

In [47]:
df = pd.DataFrame([1,2,])

df_train_X_2 = pd.get_dummies(df_train_X['MRI'])

TypeError: unhashable type: 'list'