In [1]:
import pandas as pd
import numpy as np
import json

In [9]:
class Adexi:
    def __init__(self,columns,dataframe,jsonData,instrument):
        """
    Parameters
    ----------
    columns: list that contains the names of all the columns,
    dataframe: a pandas dataframe object that contains all the csv file info,
    jsonData: a python dictionary that contains information about each instrument
    
    Attributes
    ----------
    columns: stores the list of columns
        type: list
    df: stores the dataframe
        type: datafram
    sections: stores each section from timestamp to complete
        type: list
    nullRow: stores the rows that contain empty values for a section
        type: list
    """
        self.instrument = instrument
        self.workingMemory = jsonData[instrument][0][list(jsonData[instrument][0])[0]]
        self.inhibition = jsonData[instrument][0][list(jsonData[instrument][0])[1]]
        self.columns = columns
        self.df = dataframe
        self.sections = []
        self.jdata = jsonData
        self.nullRow = []
        
    def getSections(self):
        """
        Loops through each column,
        each time a column name ends with timestamp or complete
        we store the index of that column in the sections list
        """
        for index,column in enumerate(self.columns):
            split = column.split("_")
            if split[-1]=='timestamp' or split[-1]=='complete':
                self.sections.append(index)
    
    def missingData(self, position):
        """
        Selects a starting and ending index from the sections list,
        and looks through each row checking if any there is a missing value.
        If a row has a missing value, we store the index of that row in the nullRow list.
        We do this to ignore this row for calculating the score for this section.
        """
        index = self.df.index
        start = self.sections[position]+1 #start of the section
        end = self.sections[position+1] #end of the section
        selected = self.df.columns[start:end] #selecting all the columns from this section of the dataframe
        for i in selected:  #for each column
            if len(self.df.loc[self.df[i].isnull()].index) > 0: # we want to know if the row has missing data
                # we want to know if this row that has missing that is already in the NullRow list
                if self.df.loc[self.df[i].isnull()].index[0] not in self.nullRow: 
                    #if is not, we add it to the list
                    self.nullRow.append(self.df.loc[self.df[i].isnull()].index[0])
                    
    def fixable(self, position, run):
                    
        #to get the index of the person with missing data
        #because we need the data for the row from only that section and no other
        
        for x in self.nullRow:
            index = 0
            found = False
            while not found:
                if self.df.index[index] == x:
                    found = True
                else:
                    index+=1
            missingData = pd.to_numeric(self.df.iloc[index,self.sections[position]+1:self.sections[position+1]])
            values = np.isnan(missingData) #returns boolean array

            # identify which columns have data missing
            count = 0
            columnsMissing = []
            while count<len(values):
                if values[count]:
                    columnsMissing.append(count)
                count+=1

            count = 0
            wm = True
            inh = True
            for missing in columnsMissing:
                while count<len(self.workingMemory):
                    if missing == self.workingMemory[count]:
                        wm = False
                    count+=1
                count = 0
                while count<len(self.inhibition):
                    if missing == self.inhibition[count]:
                        wm = False
                    count+=1
                count = 0
            if wm == False:
                wmColumnIndex = self.df.columns.get_loc(list(self.jdata['adexi'][0])[0]+'_'+ run)
                self.df.iloc[index,wmColumnIndex]= np.NAN
            if inh == False:
                inhColumnIndex = self.df.columns.get_loc(list(self.jdata['adexi'][0])[1]+'_'+ run)
                self.df.iloc[index,inhColumnIndex]= np.NAN
            if inh == False or wm == False:
                totalscoreColumnIndex = self.df.columns.get_loc('adexi_total-score_'+run)
                self.df.iloc[index,totalscoreColumnIndex]= np.NAN
                

    
    def addNewColumn(self,score,name,position):
        """
        adds a new column to the dataframe at the specified "position",
        the column data will be the calculated "score"
        and the name will be the "name"
        
        Parameters
        ----------
        score: a dataframe column that contains the data to be stored in the new column
        name: the name of the column
        position: the position where the column will be inserted
        """
        self.df.insert(loc = self.sections[position+1]+1,
                  column = name+"_"+self.df.columns[self.sections[position+1]].split("_",1)[1].split("_comp")[0],
                  value=score,
                  allow_duplicates=False) 
    
    def subscore(self, columns_arr,position):
        """
        Calculates the subscore by adding the specified columns,
        it uses the indexes stored in the sections list to go through every section
        for the instrument
        
        Parameters
        ----------
        columns_arr: list of the columns to be summed
        position: position in the sections list for this instrument
        """
        selected = df.columns[self.sections[position]+1:self.sections[position+1]]
        selectedCols = []
        for i in columns_arr:
            selectedCols.append(selected[i-1])
        return df[selectedCols].sum(axis=1)
    
    def addNewDataColumns(self):
        """
        Adds all the columns for the instrument for each of its sections
        by looping through the sections list
        """
        i = 0
        count = 1
        while i < len(self.sections):
            if i%2 == 0:
                self.missingData(i)
                addition = pd.to_numeric(df.iloc[:, self.sections[i]+1:self.sections[i+1]].sum(axis=1))
                self.addNewColumn(addition,list(self.jdata)[0]+'_total-score', i)
                self.addNewColumn(self.subscore(self.jdata[instrument][0][list(self.jdata[instrument][0])[0]],i),list(self.jdata[instrument][0])[0], i)
                self.addNewColumn(self.subscore(self.jdata[instrument][0][list(self.jdata[instrument][0])[1]],i),list(self.jdata[instrument][0])[1], i)
                run = df.columns[adexi.sections[i+1]].split("_",1)[1].split("_comp")[0]
                self.fixable(i,run)
                
                try:
                    self.nullRow = []
                    self.sections[i+2]+=(3*count)
                    self.sections[i+3]+=(3*count)
                    count+=1
                except IndexError:
                    break;
            i+=1

In [10]:
file = pd.read_csv("/Users/osman/OneDrive/Documents/expected_input.csv",index_col="participant")
df = pd.DataFrame(file)

In [11]:
with open('data.json','r') as infile:
    data = json.load(infile)

In [12]:
for instrument in data:
    if instrument == 'adexi':
        adexi = Adexi(df.columns,df,data,instrument)
        adexi.getSections()
        adexi.addNewDataColumns()

In [13]:
df[df.columns[11:22]]

Unnamed: 0_level_0,adexi_i11_s1_r1_e1,adexi_i12_s1_r1_e1,adexi_i13_s1_r1_e1,adexi_i14_s1_r1_e1,adexi_s1_r1_e1_complete,adexi_scored-inh_s1_r1_e1,adexi_scored-wm_s1_r1_e1,adexi_total-score_s1_r1_e1,adexi_s1_r1_e2_timestamp,adexi_i1_s1_r1_e2,adexi_i2_s1_r1_e2
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1001445,3,2,2,3,,17,25.0,42.0,,1,2
1001446,3,5,5,5,,18,37.0,55.0,,2,5
1001447,1,3,6,3,,16,33.0,49.0,,3,3
1001448,4,3,2,3,,15,25.0,40.0,,1,2
1001449,1,5,1,1,,14,,,,5,2
1001450,2,3,1,3,,15,25.0,40.0,,5,5
1001451,4,4,4,4,,19,35.0,54.0,,4,4
1001452,2,1,1,1,,13,21.0,34.0,,3,5
1001453,4,4,4,5,,18,34.0,52.0,,2,5


In [14]:
df[df.columns[32:]]

Unnamed: 0_level_0,adexi_i13_s1_r1_e2,adexi_i14_s1_r1_e2,adexi_s1_r1_e2_complete,adexi_scored-inh_s1_r1_e2,adexi_scored-wm_s1_r1_e2,adexi_total-score_s1_r1_e2
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1001445,2,3,,17.0,25.0,42.0
1001446,5,5,,18.0,37.0,55.0
1001447,6,3,,16.0,34.0,50.0
1001448,2,3,,15.0,25.0,40.0
1001449,1,1,,9.0,,
1001450,1,3,,14.0,25.0,39.0
1001451,4,4,,19.0,35.0,54.0
1001452,1,1,,13.0,21.0,34.0
1001453,4,5,,18.0,34.0,52.0
