In [1]:
from functions import *
%matplotlib inline

Import the data of 5-minute windows preceding survey responses.

In [2]:
surveyfile = '../Data (Algebra 1)/YearSurvey.csv'
featuredf = readData(surveyfile)
featuredf = dropColumns(featuredf,['survey_id','time_window','question_id'])

Import demographic file

In [3]:
demofile = '../Data (Algebra 1)/YearDemographicsTest.csv'
demodf = readData(demofile)

Filter so survey and demographic files have the same indices

In [4]:
demodf = filterByIndex(demodf,featuredf.index)
featuredfA = filterByIndex(featuredf,demodf.index)
allind = featuredfA.index

Set up some common parameters

In [5]:
nFolds = 10
kfold = GroupKFold(n_splits = nFolds)

In [5]:
nFolds = 4
kfold = GroupKFold(n_splits = nFolds)

Filter surveys according to positive or negative affective states

In [6]:
featuredfP = pd.concat([featuredfA.loc[featuredfA['survey_question']=='Happiness'],featuredfA.loc[featuredfA['survey_question']=='Hopefulness'],
                       featuredfA.loc[featuredfA['survey_question']=='Contentment'],featuredfA.loc[featuredfA['survey_question']=='Relief'],
                       featuredfA.loc[featuredfA['survey_question']=='Pride'],featuredfA.loc[featuredfA['survey_question']=='Pleasantness'],
                       featuredfA.loc[featuredfA['survey_question']=='Interest'],featuredfA.loc[featuredfA['survey_question']=='Arousal'],
                       featuredfA.loc[featuredfA['survey_question']=='Engagement']]).sort_index()
featuredfN = pd.concat([featuredfA.loc[featuredfA['survey_question']=='Frustration'],featuredfA.loc[featuredfA['survey_question']=='Confusion'],
                       featuredfA.loc[featuredfA['survey_question']=='Disappointment'],featuredfA.loc[featuredfA['survey_question']=='Anxiety'],
                       featuredfA.loc[featuredfA['survey_question']=='Sadness'],featuredfA.loc[featuredfA['survey_question']=='Mind Wandering'],
                       featuredfA.loc[featuredfA['survey_question']=='Boredom']]).sort_index()

In [7]:
labelSeriesP = featuredfP['survey_answer']
featuredfP = dropColumns(featuredfP,['survey_answer','survey_question'])

labelSeriesN = featuredfN['survey_answer']
featuredfN = dropColumns(featuredfN,['survey_answer','survey_question'])

labelSeriesA = featuredfA['survey_answer']
featuredfA = dropColumns(featuredfA,['survey_answer','survey_question'])

## Male/Female

Divide female and male students in the demographic file and filter survey file accordingly

In [None]:
femaleind = demodf.loc[demodf['Female'] == 1].index
maleind = demodf.loc[demodf['Female'] == 0].index

Train positive and negative classifier on male/female/all and predict for male/female/all

In [None]:
scoresF = np.zeros((2,3))
scoresM = scoresF.copy()
scoresA = scoresF.copy()

In [None]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [femaleind,maleind,allind]
scores = [scoresF,scoresM,scoresA]

# For each state (positive or negative)
for s in range(len(statedfs)):
    
    # For each group (Female, Male, All)
    for g1 in range(len(groupinds)):
        featuredf = filterByIndex(statedfs[s],groupinds[g1])
        groups = featuredf.index.values
        features = featuredf.values
        labels = filterByIndex(labelSeries[s],groupinds[g1]).values
        
        # Repeat for k folds
        temp = np.zeros((nFolds,len(groupinds)))
        for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
            # Train a model
            _, model = classify(features[train],labels[train],features[test])
            
            # Predict for each group
            for g2 in range(len(groupinds)):
                predFeatures = filterByIndex(statedfs[s],groupinds[g2]).values
                predLabels = filterByIndex(labelSeries[s],groupinds[g2]).values
                preds = model.predict(predFeatures)
                rho = evaluateSpearman(predLabels,preds)
                temp[i,g2] = rho
                
        # Average across folds
        avg = np.mean(temp,axis=0)
        # Store results
        scores[g1][s,:] = avg

In [None]:
columns = ['Test Female','Test Male','Test All']
ind = ['Positive','Negative']

dfF = pd.DataFrame(data=scoresF,index=ind,columns=columns)
dfM = pd.DataFrame(data=scoresM,index=ind,columns=columns)
dfA = pd.DataFrame(data=scoresA,index=ind,columns=columns)

In [None]:
dfA.round(2)

In [None]:
dfF.to_csv('../Data (Algebra 1)/small gender scores female.csv')
dfM.to_csv('../Data (Algebra 1)/small gender scores male.csv')
dfA.to_csv('../Data (Algebra 1)/small gender scores all.csv')

Run a simulation of the above process, each time using a different sized training set

In [None]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [femaleind,maleind,allind]
for s in range(len(statedfs)):
    for g1 in range(len(groupinds)):
        print(filterByIndex(statedfs[s],groupinds[g1]).values.shape)

In [None]:
#iterations = 22
iterations = 45
#iterations = 61
scoresSexP = np.zeros((iterations+1,9,nFolds))
scoresSexN = scoresSexP.copy()

In [None]:
statedfs = [featuredfP,featuredfN]
scores = [scoresSexP,scoresSexN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [femaleind,maleind,allind]

start = 5
end = start + iterations + 1

# Run simulation for each increment
for n in range(start, end):
    print(n)
    
    # For each state (positive, negative)
    for s in range(len(statedfs)):
        combo = 0
        # For each group (female, male, all)
        for g1 in range(len(groupinds)):
            features = filterByIndex(statedfs[s],groupinds[g1])
            groups = features.index.values
            features = features.values
            labels = filterByIndex(labelSeries[s],groupinds[g1]).values
            sample = np.random.choice(np.arange(len(labels)),n*100)
            features = features[sample]
            labels = labels[sample]
            groups = groups[sample]
            
            # Test on all groups (female, male, all)
            for g2 in range(len(groupinds)):
                predFeatures = filterByIndex(statedfs[s],groupinds[g2]).values
                predLabels = filterByIndex(labelSeries[s],groupinds[g2]).values
            
                # Repeat for k folds
                for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                    # Train a model
                    _, model = classify(features[train],labels[train],features[test])
                    # Predict
                    preds = model.predict(predFeatures)
                    rho = evaluateSpearman(predLabels,preds)
                    scores[s][n-start,combo,i] = rho
                combo = combo + 1

In [None]:
index = np.arange(start,end)
columns = ['F/F','F/M','F/A','M/F','M/M','M/A','A/F','A/M','A/A']
dfP = pd.DataFrame(data=np.mean(scoresSexP,axis=2),index=index,columns=columns)
dfN = pd.DataFrame(data=np.mean(scoresSexN,axis=2),index=index,columns=columns)

In [None]:
#dfP.to_csv('../Data (Algebra 1)/simulation gender positive.csv')
#dfN.to_csv('../Data (Algebra 1)/simulation gender negative.csv')
#dfP.to_csv('../Data (Algebra 1)/simulation long gender positive.csv')
#dfN.to_csv('../Data (Algebra 1)/simulation long gender negative.csv')
dfP.to_csv('../Data (Algebra 1)/simulation small gender positive.csv')
dfN.to_csv('../Data (Algebra 1)/simulation small gender negative.csv')

In [None]:
#dataP = readData('../Data (Algebra 1)/simulation gender positive.csv')
#dataN = readData('../Data (Algebra 1)/simulation gender negative.csv')
#dataP = readData('../Data (Algebra 1)/simulation long gender positive.csv')
#dataN = readData('../Data (Algebra 1)/simulation long gender negative.csv')
dataP = readData('../Data (Algebra 1)/simulation small gender positive.csv')
dataN = readData('../Data (Algebra 1)/simulation small gender negative.csv')

In [None]:
inF = (dataN['F/F'] - dataN['A/F'])/(dataN['A/F'])
inM = (dataN['M/M'] - dataN['A/M'])/(dataN['A/M'])
outF = (dataN['F/F'] - dataN['M/F'])/(dataN['M/F'])
outM = (dataN['M/M'] - dataN['F/M'])/(dataN['F/M'])
print(max(max(inF),max(inM),max(outF),max(outM)))
print(min(min(inF),min(inM),min(outF),min(outM)))

In [None]:
fig, axs = plt.subplots(1, 2, sharex=True,figsize=(10, 4))
axs[0].plot(inF,label='F')
axs[0].plot(inM,label='M')
axs[0].set_ylim(-0.57,0.62)
axs[0].grid(True, which='both')
axs[0].set_title('In-group')
axs[0].legend()
axs[0].axhline(y=0, color='k',linewidth=0.5)
axs[1].plot(outF,label='F')
axs[1].plot(outM,label='M')
axs[1].set_title('Out-group')
axs[1].grid(True, which='both')
axs[1].legend()
axs[1].axhline(y=0, color='k',linewidth=0.5)
axs[1].set_ylim(-0.57,0.62)

Run simulation 1000 times in order to get a more stable average

In [None]:
inFP = np.zeros((2,11))
inMP = inFP.copy()
outFP = inFP.copy()
outMP = inFP.copy()
inFN = inFP.copy()
inMN = inFP.copy()
outFN = inFP.copy()
outMN = inFP.copy()

In [None]:
iterations = 10
scoresSexP = np.zeros((iterations+1,9,nFolds))
scoresSexN = scoresSexP.copy()

statedfs = [featuredfP,featuredfN]
scores = [scoresSexP,scoresSexN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [femaleind,maleind,allind]

start = 5
end = start + iterations + 1

index = np.arange(start,end)
columns = ['F/F','F/M','F/A','M/F','M/M','M/A','A/F','A/M','A/A']

In [None]:
for run in range(1002):
    if run%50 == 0:
        print(run)
    # Run simulation for each increment
    for n in range(start, end):
        # For each state (positive, negative)
        for s in range(len(statedfs)):
            combo = 0
            # For each group (female, male, all)
            for g1 in range(len(groupinds)):
                features = filterByIndex(statedfs[s],groupinds[g1])
                groups = features.index.values
                features = features.values
                labels = filterByIndex(labelSeries[s],groupinds[g1]).values
                sample = np.random.choice(np.arange(len(labels)),n*100)
                features = features[sample]
                labels = labels[sample]
                groups = groups[sample]

                # Test on all groups (female, male, all)
                for g2 in range(len(groupinds)):
                    predFeatures = filterByIndex(statedfs[s],groupinds[g2]).values
                    predLabels = filterByIndex(labelSeries[s],groupinds[g2]).values

                    # Repeat for k folds
                    for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                        # Train a model
                        _, model = classify(features[train],labels[train],features[test])
                        # Predict
                        preds = model.predict(predFeatures)
                        rho = evaluateSpearman(predLabels,preds)
                        scores[s][n-start,combo,i] = rho
                    combo = combo + 1
    dfP = pd.DataFrame(data=np.mean(scoresSexP,axis=2),index=index,columns=columns)
    dfN = pd.DataFrame(data=np.mean(scoresSexN,axis=2),index=index,columns=columns)
    
    inFP[run%2,:] = (dfP['F/F'] - dfP['A/F'])/(dfP['A/F']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Gender/inFP.csv", "a")
        np.savetxt(f, inFP,delimiter=',')
        f.close()
        print('saved')
    
    inMP[run%2,:] = (dfP['M/M'] - dfP['A/M'])/(dfP['A/M']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Gender/inMP.csv", "a")
        np.savetxt(f, inMP,delimiter=',')
        f.close()
    
    outFP[run%2,:] = (dfP['M/F'] - dfP['F/F'])/(dfP['F/F']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Gender/outFP.csv", "a")
        np.savetxt(f, outFP,delimiter=',')
        f.close()
    
    outMP[run%2,:] = (dfP['F/M'] - dfP['M/M'])/(dfP['M/M']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Gender/outMP.csv", "a")
        np.savetxt(f, outMP,delimiter=',')
        f.close()
    
    inFN[run%2,:] = (dfN['F/F'] - dfN['A/F'])/(dfN['A/F']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Gender/inFN.csv", "a")
        np.savetxt(f, inFN,delimiter=',')
        f.close()
    
    inMN[run%2,:] = (dfN['M/M'] - dfN['A/M'])/(dfN['A/M']).values
    if run%2 == 1:    
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Gender/inMN.csv", "a")
        np.savetxt(f, inMN,delimiter=',')
        f.close()
    
    outFN[run%2,:] = (dfN['M/F'] - dfN['F/F'])/(dfN['F/F']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Gender/outFN.csv", "a")
        np.savetxt(f, outFN,delimiter=',')
        f.close()
    
    outMN[run%2,:] = (dfN['F/M'] - dfN['M/M'])/(dfN['M/M']).values
    if run%2 == 1:    
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Gender/outMN.csv", "a")
        np.savetxt(f, outMN,delimiter=',')
        f.close()

## Lunch status

Divide students by lunch status in the demographic file and filter survey file accordingly

In [None]:
yesind = pd.concat([demodf.loc[demodf['Free Lunch'] == 1],
                    demodf.loc[demodf['Reduced Lunch'] == 1]]).sort_index().index
noind = demodf.loc[demodf['Other Lunch'] == 1].index

Train positive and negative classifier on yes/no/all and predict for yes/no/all

In [None]:
scoresY = np.zeros((2,3))
scoresN = scoresY.copy()
scoresA = scoresY.copy()

In [None]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [yesind,noind,allind]
scores = [scoresY,scoresN,scoresA]

# For each state (positive or negative)
for s in range(len(statedfs)):
    
    # For each group (Yes, No, All)
    for g1 in range(len(groupinds)):
        featuredf = filterByIndex(statedfs[s],groupinds[g1])
        groups = featuredf.index.values
        features = featuredf.values
        labels = filterByIndex(labelSeries[s],groupinds[g1]).values
        
        # Repeat for k folds
        temp = np.zeros((nFolds,len(groupinds)))
        for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
            # Train a model
            _, model = classify(features[train],labels[train],features[test])
            
            # Predict for each group
            for g2 in range(len(groupinds)):
                predFeatures = filterByIndex(statedfs[s],groupinds[g2]).values
                predLabels = filterByIndex(labelSeries[s],groupinds[g2]).values
                preds = model.predict(predFeatures)
                rho = evaluateSpearman(predLabels,preds)
                temp[i,g2] = rho
                
        # Average across folds
        avg = np.mean(temp,axis=0)
        # Store results
        scores[g1][s,:] = avg

In [None]:
columns = ['Test F/R','Test Other','Test All']
ind = ['Positive','Negative']

dfY = pd.DataFrame(data=scoresY,index=ind,columns=columns)
dfN = pd.DataFrame(data=scoresN,index=ind,columns=columns)
dfA = pd.DataFrame(data=scoresA,index=ind,columns=columns)

In [None]:
dfA.round(2)

In [None]:
dfY.to_csv('../Data (Algebra 1)/small lunch scores free reduced.csv')
dfN.to_csv('../Data (Algebra 1)/small lunch scores other.csv')
dfA.to_csv('../Data (Algebra 1)/small lunch scores all.csv')

Run a simulation of the above process, each time using a different sized training set

In [None]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [yesind,noind,allind]
for s in range(len(statedfs)):
    for g1 in range(len(groupinds)):
        print(filterByIndex(statedfs[s],groupinds[g1]).values.shape)

In [None]:
#iterations = 21
iterations = 45
#iterations = 61
scoresLunchP = np.zeros((iterations+1,9,nFolds))
scoresLunchN = scoresLunchP.copy()

In [None]:
statedfs = [featuredfP,featuredfN]
scores = [scoresLunchP,scoresLunchN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [yesind,noind,allind]

start = 5
end = start + iterations + 1

# Run simulation for each increment
for n in range(start,end):
    print(n)
    
    # For each state (positive, negative)
    for s in range(len(statedfs)):
        combo = 0
        # For each group (female, male, all)
        for g1 in range(len(groupinds)):
            features = filterByIndex(statedfs[s],groupinds[g1])
            groups = features.index.values
            features = features.values
            labels = filterByIndex(labelSeries[s],groupinds[g1]).values
            sample = np.random.choice(np.arange(len(labels)),n*100)
            features = features[sample]
            labels = labels[sample]
            groups = groups[sample]
            
            # Test on all groups (female, male, all)
            for g2 in range(len(groupinds)):
                predFeatures = filterByIndex(statedfs[s],groupinds[g2]).values
                predLabels = filterByIndex(labelSeries[s],groupinds[g2]).values
            
                # Repeat for k folds
                for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                    # Train a model
                    _, model = classify(features[train],labels[train],features[test])
                    # Predict
                    preds = model.predict(predFeatures)
                    rho = evaluateSpearman(predLabels,preds)
                    scores[s][n-start,combo,i] = rho
                combo = combo + 1

In [None]:
index = np.arange(start,end)
columns = ['Y/Y','Y/N','Y/A','N/Y','N/N','N/A','A/Y','A/N','A/A']
dfP = pd.DataFrame(data=np.mean(scoresLunchP,axis=2),index=index,columns=columns)
dfN = pd.DataFrame(data=np.mean(scoresLunchN,axis=2),index=index,columns=columns)

In [None]:
#dfP.to_csv('../Data (Algebra 1)/simulation lunch positive.csv')
#dfN.to_csv('../Data (Algebra 1)/simulation lunch negative.csv')
#dfP.to_csv('../Data (Algebra 1)/simulation long lunch positive.csv')
#dfN.to_csv('../Data (Algebra 1)/simulation long lunch negative.csv')
dfP.to_csv('../Data (Algebra 1)/simulation small lunch positive.csv')
dfN.to_csv('../Data (Algebra 1)/simulation small lunch negative.csv')

In [None]:
#dataP = readData('../Data (Algebra 1)/simulation lunch positive.csv')
#dataN = readData('../Data (Algebra 1)/simulation lunch negative.csv')
#dataP = readData('../Data (Algebra 1)/simulation long lunch positive.csv')
#dataN = readData('../Data (Algebra 1)/simulation long lunch negative.csv')
dataP = readData('../Data (Algebra 1)/simulation small lunch positive.csv')
dataN = readData('../Data (Algebra 1)/simulation small lunch negative.csv')

In [None]:
inY = (dataN['Y/Y'] - dataN['A/Y'])/(dataN['A/Y'])
inN = (dataN['N/N'] - dataN['A/N'])/(dataN['A/N'])
outY = (dataN['Y/Y'] - dataN['N/Y'])/(dataN['N/Y'])
outN = (dataN['N/N'] - dataN['Y/N'])/(dataN['Y/N'])
print(max(max(inY),max(inN),max(outY),max(outN)))
print(min(min(inY),min(inN),min(outY),min(outN)))

In [None]:
fig, axs = plt.subplots(1, 2, sharex=True,figsize=(10, 4))
axs[0].plot(inY,label='F/R')
axs[0].plot(inN,label='Other')
axs[0].set_ylim(-0.75,0.5)
axs[0].grid(True, which='both')
axs[0].set_title('In-group')
axs[0].legend()
axs[0].axhline(y=0, color='k',linewidth=0.5)
axs[1].plot(outY,label='F/R')
axs[1].plot(outN,label='Other')
axs[1].set_title('Out-group')
axs[1].grid(True, which='both')
axs[1].legend()
axs[1].axhline(y=0, color='k',linewidth=0.5)
axs[1].set_ylim(-0.75,0.5)

Run simulation 1000 times in order to get a more stable average

In [None]:
inYP = np.zeros((2,11))
inNP = inYP.copy()
outYP = inYP.copy()
outNP = inYP.copy()
inYN = inYP.copy()
inNN = inYP.copy()
outYN = inYP.copy()
outNN = inYP.copy()

In [None]:
iterations = 10
scoresLunchP = np.zeros((iterations+1,9,nFolds))
scoresLunchN = scoresLunchP.copy()

statedfs = [featuredfP,featuredfN]
scores = [scoresLunchP,scoresLunchN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [yesind,noind,allind]

start = 8
end = start + iterations + 1

index = np.arange(start,end)
columns = ['Y/Y','Y/N','Y/A','N/Y','N/N','N/A','A/Y','A/N','A/A']

In [None]:
for run in range(425):
    if run%25 == 0:
        print(run)
    # Run simulation for each increment
    for n in range(start, end):
        # For each state (positive, negative)
        for s in range(len(statedfs)):
            combo = 0
            # For each group (female, male, all)
            for g1 in range(len(groupinds)):
                features = filterByIndex(statedfs[s],groupinds[g1])
                groups = features.index.values
                features = features.values
                labels = filterByIndex(labelSeries[s],groupinds[g1]).values
                sample = np.random.choice(np.arange(len(labels)),n*100)
                features = features[sample]
                labels = labels[sample]
                groups = groups[sample]

                # Test on all groups (female, male, all)
                for g2 in range(len(groupinds)):
                    predFeatures = filterByIndex(statedfs[s],groupinds[g2]).values
                    predLabels = filterByIndex(labelSeries[s],groupinds[g2]).values

                    # Repeat for k folds
                    for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                        # Train a model
                        _, model = classify(features[train],labels[train],features[test])
                        # Predict
                        preds = model.predict(predFeatures)
                        rho = evaluateSpearman(predLabels,preds)
                        scores[s][n-start,combo,i] = rho
                    combo = combo + 1
    dfP = pd.DataFrame(data=np.mean(scoresLunchP,axis=2),index=index,columns=columns)
    dfN = pd.DataFrame(data=np.mean(scoresLunchN,axis=2),index=index,columns=columns)
    
    inYP[run%2,:] = (dfP['Y/Y'] - dfP['A/Y'])/(dfP['A/Y']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Lunch Status/inYP.csv", "a")
        np.savetxt(f, inYP,delimiter=',')
        f.close()
    
    inNP[run%2,:] = (dfP['N/N'] - dfP['A/N'])/(dfP['A/N']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Lunch Status/inNP.csv", "a")
        np.savetxt(f, inNP,delimiter=',')
        f.close()
    
    outYP[run%2,:] = (dfP['Y/Y'] - dfP['N/Y'])/(dfP['N/Y']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Lunch Status/outYP.csv", "a")
        np.savetxt(f, outYP,delimiter=',')
        f.close()
    
    outNP[run%2,:] = (dfP['N/N'] - dfP['Y/N'])/(dfP['Y/N']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Lunch Status/outNP.csv", "a")
        np.savetxt(f, outNP,delimiter=',')
        f.close()
    
    inYN[run%2,:] = (dfN['Y/Y'] - dfN['A/Y'])/(dfN['A/Y']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Lunch Status/inYN.csv", "a")
        np.savetxt(f, inYN,delimiter=',')
        f.close()
    
    inNN[run%2,:] = (dfN['N/N'] - dfN['A/N'])/(dfN['A/N']).values
    if run%2 == 1:    
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Lunch Status/inNN.csv", "a")
        np.savetxt(f, inNN,delimiter=',')
        f.close()
    
    outYN[run%2,:] = (dfN['Y/Y'] - dfN['N/Y'])/(dfN['N/Y']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Lunch Status/outYN.csv", "a")
        np.savetxt(f, outYN,delimiter=',')
        f.close()
    
    outNN[run%2,:] = (dfN['N/N'] - dfN['Y/N'])/(dfN['Y/N']).values
    if run%2 == 1:    
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Lunch Status/outNN.csv", "a")
        np.savetxt(f, outNN,delimiter=',')
        f.close()

## Grade

Divide students by grade level in the demographic file and filter survey file accordingly

In [None]:
middleind = pd.concat([demodf.loc[demodf['Grade 6'] == 1],demodf.loc[demodf['Grade 7'] == 1],
                       demodf.loc[demodf['Grade 8'] == 1]]).sort_index().index
freshind = demodf.loc[demodf['Grade 9'] == 1].index
highind = pd.concat([demodf.loc[demodf['Grade 10'] == 1],demodf.loc[demodf['Grade 11'] == 1],
                               demodf.loc[demodf['Grade 12'] == 1]]).sort_index().index
# combined groups
MFind = middleind.union(freshind)
MHind = middleind.union(highind)
FHind = freshind.union(highind)

Train positive and negative classifier on middle/ninth/high/all and predict for middle/ninth/high/all

In [None]:
scoresM = np.zeros((2,4))
scoresF = scoresM.copy()
scoresH = scoresM.copy()
scoresA = scoresM.copy()

In [None]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [middleind,freshind,highind,allind]
scores = [scoresM,scoresF,scoresH,scoresA]

# For each state (positive or negative)
for s in range(len(statedfs)):
    
    # For each group (Middle, Ninth, High, All)
    for g1 in range(len(groupinds)):
        featuredf = filterByIndex(statedfs[s],groupinds[g1])
        groups = featuredf.index.values
        features = featuredf.values
        labels = filterByIndex(labelSeries[s],groupinds[g1]).values
        
        # Repeat for k folds
        temp = np.zeros((nFolds,len(groupinds)))
        for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
            # Train a model
            _, model = classify(features[train],labels[train],features[test])
            
            # Predict for each group
            for g2 in range(len(groupinds)):
                predFeatures = filterByIndex(statedfs[s],groupinds[g2]).values
                predLabels = filterByIndex(labelSeries[s],groupinds[g2]).values
                preds = model.predict(predFeatures)
                rho = evaluateSpearman(predLabels,preds)
                temp[i,g2] = rho
                
        # Average across folds
        avg = np.mean(temp,axis=0)
        # Store results
        scores[g1][s,:] = avg

In [None]:
columns = ['Test Middle','Test Ninth','Test High','Test All']
ind = ['Positive','Negative']

dfM = pd.DataFrame(data=scoresM,index=ind,columns=columns)
dfF = pd.DataFrame(data=scoresF,index=ind,columns=columns)
dfH = pd.DataFrame(data=scoresH,index=ind,columns=columns)
dfA = pd.DataFrame(data=scoresA,index=ind,columns=columns)

In [None]:
dfA.round(2)

In [None]:
dfM.to_csv('../Data (Algebra 1)/small grade scores middle.csv')
dfF.to_csv('../Data (Algebra 1)/small grade scores ninth.csv')
dfH.to_csv('../Data (Algebra 1)/small grade scores high.csv')
dfA.to_csv('../Data (Algebra 1)/small grade scores all.csv')

Run a simulation of the above process, each time using a different sized training set

In [None]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [middleind,freshind,highind,allind,MFind,MHind,FHind]
for s in range(len(statedfs)):
    for g1 in range(len(groupinds)):
        print(filterByIndex(statedfs[s],groupinds[g1]).values.shape)

In [None]:
#iterations = 12
iterations = 45
#iterations = 61
scoresGradeP = np.zeros((iterations+1,9,nFolds))
scoresGradeN = scoresGradeP.copy()

In [None]:
statedfs = [featuredfP,featuredfN]
scores = [scoresGradeP,scoresGradeN]
labelSeries = [labelSeriesP,labelSeriesN]
traininds = [middleind,freshind,highind,allind,allind,allind,MFind,MHind,FHind]
testinds = [middleind,freshind,highind,middleind,freshind,highind,highind,freshind,middleind]

start = 5
end = start + iterations + 1

# Run simulation for each increment of 1,000
for n in range(start,end):
    print(n)
    
    # For each state (positive, negative)
    for s in range(len(statedfs)):
        
        # For each combination
        for g in range(len(traininds)):
            # get training data
            features = filterByIndex(statedfs[s],traininds[g])
            groups = features.index.values
            features = features.values
            labels = filterByIndex(labelSeries[s],traininds[g]).values
            sample = np.random.choice(np.arange(len(labels)),n*100)
            features = features[sample]
            labels = labels[sample]
            groups = groups[sample]
            
            # get testing data
            predFeatures = filterByIndex(statedfs[s],testinds[g]).values
            predLabels = filterByIndex(labelSeries[s],testinds[g]).values
            
            # Repeat for k folds
            for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                # Train a model
                _, model = classify(features[train],labels[train],features[test])
                # Predict
                preds = model.predict(predFeatures)
                rho = evaluateSpearman(predLabels,preds)
                scores[s][n-start,g,i] = rho

In [None]:
index = np.arange(start,end)
columns = ['M/M','F/F','H/H','A/M','A/F','A/H','MF/H','MH/F','FH/M']
dfP = pd.DataFrame(data=np.mean(scoresGradeP,axis=2),index=index,columns=columns)
dfN = pd.DataFrame(data=np.mean(scoresGradeN,axis=2),index=index,columns=columns)

In [None]:
#dfP.to_csv('../Data (Algebra 1)/simulation grade positive.csv')
#dfN.to_csv('../Data (Algebra 1)/simulation grade negative.csv')
#dfP.to_csv('../Data (Algebra 1)/simulation long grade positive.csv')
#dfN.to_csv('../Data (Algebra 1)/simulation long grade negative.csv')
dfP.to_csv('../Data (Algebra 1)/simulation small grade positive.csv')
dfN.to_csv('../Data (Algebra 1)/simulation small grade negative.csv')

In [None]:
#dataP = readData('../Data (Algebra 1)/simulation grade positive.csv')
#dataN = readData('../Data (Algebra 1)/simulation grade negative.csv')
#dataP = readData('../Data (Algebra 1)/simulation long grade positive.csv')
#dataN = readData('../Data (Algebra 1)/simulation long grade negative.csv')
dataP = readData('../Data (Algebra 1)/simulation small grade positive.csv')
dataN = readData('../Data (Algebra 1)/simulation small grade negative.csv')

In [None]:
inM = (dataN['M/M'] - dataN['A/M'])/(dataN['A/M'])
inF = (dataN['F/F'] - dataN['A/F'])/(dataN['A/F'])
inH = (dataN['H/H'] - dataN['A/H'])/(dataN['A/H'])

outF = (dataN['F/F'] - dataN['MH/F'])/(dataN['MH/F'])
outM = (dataN['M/M'] - dataN['FH/M'])/(dataN['FH/M'])
outH = (dataN['H/H'] - dataN['MF/H'])/(dataN['MF/H'])

print(max(max(inM),max(inF),max(outM),max(outF),max(inH),max(outH)))
print(min(min(inF),min(inM),min(outF),min(outM),min(inH),max(outH)))

In [None]:
fig, axs = plt.subplots(1, 2, sharex=True,figsize=(10, 4))
axs[0].plot(inM,label='Middle')
axs[0].plot(inF,label='9th')
axs[0].plot(inH[:23],label='High')
axs[0].set_ylim(-0.5,1.5)
axs[0].grid(True, which='both')
axs[0].set_title('In-group')
axs[0].legend()
axs[0].axhline(y=0, color='k',linewidth=0.5)
axs[1].plot(outM,label='Middle')
axs[1].plot(outF,label='9th')
axs[1].plot(outH[:23],label='High')
axs[1].set_title('Out-group')
axs[1].grid(True, which='both')
axs[1].legend()
axs[1].axhline(y=0, color='k',linewidth=0.5)
axs[1].set_ylim(-0.5,1.5)

Run simulation 100 times in order to get a more stable average

In [None]:
inMP = np.zeros((2,11))
inFP = inMP.copy()
inHP = inMP.copy()
outMP = inMP.copy()
outFP = inMP.copy()
outHP = inMP.copy()
inMN = inMP.copy()
inFN = inMP.copy()
inHN = inMP.copy()
outMN = inMP.copy()
outFN = inMP.copy()
outHN = inMP.copy()

In [None]:
iterations = 10
scoresGradeP = np.zeros((iterations+1,9,nFolds))
scoresGradeN = scoresGradeP.copy()

statedfs = [featuredfP,featuredfN]
scores = [scoresGradeP,scoresGradeN]
labelSeries = [labelSeriesP,labelSeriesN]
traininds = [middleind,freshind,highind,allind,allind,allind,MFind,MHind,FHind]
testinds = [middleind,freshind,highind,middleind,freshind,highind,highind,freshind,middleind]

start = 8
end = start + iterations + 1

index = np.arange(start,end)
columns = ['M/M','F/F','H/H','A/M','A/F','A/H','MF/H','MH/F','FH/M']

In [None]:
for run in range(1002):
    if run%25 == 0:
        print(run)
    # Run simulation for each increment
    for n in range(start,end):
        # For each state (positive, negative)
        for s in range(len(statedfs)):
            # For each combination
            for g in range(len(traininds)):
                # get training data
                features = filterByIndex(statedfs[s],traininds[g])
                groups = features.index.values
                features = features.values
                labels = filterByIndex(labelSeries[s],traininds[g]).values
                sample = np.random.choice(np.arange(len(labels)),n*100)
                features = features[sample]
                labels = labels[sample]
                groups = groups[sample]

                # get testing data
                predFeatures = filterByIndex(statedfs[s],testinds[g]).values
                predLabels = filterByIndex(labelSeries[s],testinds[g]).values

                # Repeat for k folds
                for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                    # Train a model
                    _, model = classify(features[train],labels[train],features[test])
                    # Predict
                    preds = model.predict(predFeatures)
                    rho = evaluateSpearman(predLabels,preds)
                    scores[s][n-start,g,i] = rho
    dfP = pd.DataFrame(data=np.mean(scoresGradeP,axis=2),index=index,columns=columns)
    dfN = pd.DataFrame(data=np.mean(scoresGradeN,axis=2),index=index,columns=columns)
    
    outHP[run%2,:] = (dfP['MF/H'] - dfP['H/H'])/(dfP['H/H']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/outHP.csv", "a")
        np.savetxt(f, outHP,delimiter=',')
        f.close()
        
    outHN[run%2,:] = (dfN['MF/H'] - dfN['H/H'])/(dfN['H/H']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/outHN.csv", "a")
        np.savetxt(f, outHN,delimiter=',')
        f.close()
    
    outFP[run%2,:] = (dfP['MH/F'] - dfP['F/F'])/(dfP['F/F']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/outFP.csv", "a")
        np.savetxt(f, outFP,delimiter=',')
        f.close()
        
    outFN[run%2,:] = (dfN['MH/F'] - dfN['F/F'])/(dfN['F/F']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/outFN.csv", "a")
        np.savetxt(f, outFN,delimiter=',')
        f.close()
    
    outMP[run%2,:] = (dfP['FH/M'] - dfP['M/M'])/(dfP['M/M']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/outMP.csv", "a")
        np.savetxt(f, outMP,delimiter=',')
        f.close()
        
    outMN[run%2,:] = (dfN['FH/M'] - dfN['M/M'])/(dfN['M/M']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/outMN.csv", "a")
        np.savetxt(f, outMN,delimiter=',')
        f.close()
    
    inMP[run%2,:] = (dfP['M/M'] - dfP['A/M'])/(dfP['A/M']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/inMP.csv", "a")
        np.savetxt(f, inMP,delimiter=',')
        f.close()
        
    inMN[run%2,:] = (dfN['M/M'] - dfN['A/M'])/(dfN['A/M']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/inMN.csv", "a")
        np.savetxt(f, inMN,delimiter=',')
        f.close()
    
    inFP[run%2,:] = (dfP['F/F'] - dfP['A/F'])/(dfP['A/F']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/inFP.csv", "a")
        np.savetxt(f, inFP,delimiter=',')
        f.close()
        
    inFN[run%2,:] = (dfN['F/F'] - dfN['A/F'])/(dfN['A/F']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/inFN.csv", "a")
        np.savetxt(f, inFN,delimiter=',')
        f.close()
        
    inHP[run%2,:] = (dfP['H/H'] - dfP['A/H'])/(dfP['A/H']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/inHP.csv", "a")
        np.savetxt(f, inHP,delimiter=',')
        f.close()
        
    inHN[run%2,:] = (dfN['H/H'] - dfN['A/H'])/(dfN['A/H']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Grade Level/inHN.csv", "a")
        np.savetxt(f, inHN,delimiter=',')
        f.close()

## Race/Ethnicity

Divide students by race/ethnicity in the demographic file and filter survey file accordingly. Due to small sample sizes, we will only consider white/black/hispanic independently

In [None]:
whiteind = demodf.loc[demodf['White'] == 1].index
blackind = demodf.loc[demodf['Black'] == 1].index
hispind = demodf.loc[demodf['Hispanic'] == 1].index
otherind = pd.concat([demodf.loc[demodf['Native American'] == 1],demodf.loc[demodf['Asian'] == 1],
                      demodf.loc[demodf['Pacific Islander'] == 1]]).sort_index().index

# combinations for simulation
BHOind = blackind.union(hispind).union(otherind)
WHOind = whiteind.union(hispind).union(otherind)
WBOind = whiteind.union(blackind).union(otherind)
WBHind = whiteind.union(blackind).union(hispind)

Train positive and negative classifier on white/black/hispanic/other/all and predict for white/black/hispanic/other/all

In [None]:
scoresW = np.zeros((2,5))
scoresB = scoresW.copy()
scoresH = scoresW.copy()
scoresO = scoresW.copy()
scoresA = scoresW.copy()

In [None]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [whiteind,blackind,hispind,otherind,allind]
scores = [scoresW,scoresB,scoresH,scoresO,scoresA]

# For each state (positive or negative)
for s in range(len(statedfs)):
    
    # For each group (Middle, Ninth, High, All)
    for g1 in range(len(groupinds)):
        featuredf = filterByIndex(statedfs[s],groupinds[g1])
        groups = featuredf.index.values
        features = featuredf.values
        labels = filterByIndex(labelSeries[s],groupinds[g1]).values
        
        # Repeat for k folds
        temp = np.zeros((nFolds,len(groupinds)))
        for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
            # Train a model
            _, model = classify(features[train],labels[train],features[test])
            
            # Predict for each group
            for g2 in range(len(groupinds)):
                predFeatures = filterByIndex(statedfs[s],groupinds[g2]).values
                predLabels = filterByIndex(labelSeries[s],groupinds[g2]).values
                preds = model.predict(predFeatures)
                rho = evaluateSpearman(predLabels,preds)
                temp[i,g2] = rho
                
        # Average across folds
        avg = np.mean(temp,axis=0)
        # Store results
        scores[g1][s,:] = avg

In [None]:
columns = ['Test White','Test Black','Test Hispanic','Test Other','Test All']
ind = ['Positive','Negative']

dfW = pd.DataFrame(data=scoresW,index=ind,columns=columns)
dfB = pd.DataFrame(data=scoresB,index=ind,columns=columns)
dfH = pd.DataFrame(data=scoresH,index=ind,columns=columns)
dfO = pd.DataFrame(data=scoresO,index=ind,columns=columns)
dfA = pd.DataFrame(data=scoresA,index=ind,columns=columns)

In [None]:
dfA.round(2)

In [None]:
dfW.to_csv('../Data (Algebra 1)/small race scores white.csv')
dfB.to_csv('../Data (Algebra 1)/small race scores black.csv')
dfH.to_csv('../Data (Algebra 1)/small race scores hispanic.csv')
dfO.to_csv('../Data (Algebra 1)/small race scores other.csv')
dfA.to_csv('../Data (Algebra 1)/small race scores all.csv')

Run a simulation of the above process, each time using a different sized training set

In [None]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [whiteind,blackind,hispind,otherind,allind,BHOind,WHOind,WBOind,WBHind]
for s in range(len(statedfs)):
    for g1 in range(len(groupinds)):
        print(filterByIndex(statedfs[s],groupinds[g1]).values.shape)

In [None]:
#iterations = 5
iterations = 45
#iterations = 61
scoresRaceP = np.zeros((iterations+1,12,nFolds))
scoresRaceN = scoresRaceP.copy()

In [None]:
statedfs = [featuredfP,featuredfN]
scores = [scoresRaceP,scoresRaceN]
labelSeries = [labelSeriesP,labelSeriesN]
traininds = [whiteind,blackind,hispind,otherind,allind,allind,allind,allind,BHOind,WHOind,WBOind,WBHind]
testinds = [whiteind,blackind,hispind,otherind,whiteind,blackind,hispind,otherind,whiteind,blackind,hispind,otherind]

start = 5
end = start + iterations + 1

# Run simulation for each increment of 1,000
for n in range(start,end):
    print(n)
    
    # For each state (positive, negative)
    for s in range(len(statedfs)):
        
        # For each combination
        for g in range(len(traininds)):
            # get training data
            features = filterByIndex(statedfs[s],traininds[g])
            groups = features.index.values
            features = features.values
            labels = filterByIndex(labelSeries[s],traininds[g]).values
            sample = np.random.choice(np.arange(len(labels)),n*100)
            features = features[sample]
            labels = labels[sample]
            groups = groups[sample]
            
            # get testing data
            predFeatures = filterByIndex(statedfs[s],testinds[g]).values
            predLabels = filterByIndex(labelSeries[s],testinds[g]).values
            
            # Repeat for k folds
            for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                # Train a model
                _, model = classify(features[train],labels[train],features[test])
                # Predict
                preds = model.predict(predFeatures)
                rho = evaluateSpearman(predLabels,preds)
                scores[s][n-start,g,i] = rho

In [None]:
index = np.arange(start,end)
columns = ['W/W','B/B','H/H','O/O','A/W','A/B','A/H','A/O','BHO/W','WHO/B','WBO/H','WBH/O']
dfP = pd.DataFrame(data=np.mean(scoresRaceP,axis=2),index=index,columns=columns)
dfN = pd.DataFrame(data=np.mean(scoresRaceN,axis=2),index=index,columns=columns)

In [None]:
#dfP.to_csv('../Data (Algebra 1)/simulation race positive.csv')
#dfN.to_csv('../Data (Algebra 1)/simulation race negative.csv')
#dfP.to_csv('../Data (Algebra 1)/simulation long race positive.csv')
#dfN.to_csv('../Data (Algebra 1)/simulation long race negative.csv')
dfP.to_csv('../Data (Algebra 1)/simulation small race positive.csv')
dfN.to_csv('../Data (Algebra 1)/simulation small race negative.csv')

In [None]:
#dataP = readData('../Data (Algebra 1)/simulation race positive.csv')
#dataN = readData('../Data (Algebra 1)/simulation race negative.csv')
#dataP = readData('../Data (Algebra 1)/simulation long race positive.csv')
#dataN = readData('../Data (Algebra 1)/simulation long race negative.csv')
dataP = readData('../Data (Algebra 1)/simulation small race positive.csv')
dataN = readData('../Data (Algebra 1)/simulation small race negative.csv')

In [None]:
inW = (dataP['W/W'] - dataP['A/W'])/(dataP['A/W'])
inB = (dataP['B/B'] - dataP['A/B'])/(dataP['A/B'])
inH = (dataP['H/H'] - dataP['A/H'])/(dataP['A/H'])
inO = (dataP['O/O'] - dataP['A/O'])/(dataP['A/O'])

outW = (dataP['W/W'] - dataP['BHO/W'])/(dataP['BHO/W'])
outB = (dataP['B/B'] - dataP['WHO/B'])/(dataP['WHO/B'])
outH = (dataP['H/H'] - dataP['WBO/H'])/(dataP['WBO/H'])
outO = (dataP['O/O'] - dataP['WBH/O'])/(dataP['WBH/O'])

print(max(max(inW),max(inB),max(inH),max(outW),max(outB),max(outH),max(inO),max(outO)))
print(min(min(inW),min(inB),min(inH),min(outW),min(outB),min(outH),min(inO),min(outO)))

In [None]:
fig, axs = plt.subplots(1, 2, sharex=True,figsize=(10, 4))
axs[0].plot(inW,label='W')
axs[0].plot(inB,label='B')
axs[0].plot(inH,label='H')
axs[0].plot(inO,label='O')
axs[0].set_ylim(-1,1)
axs[0].grid(True, which='both')
axs[0].set_title('In-group')
axs[0].legend()
axs[0].axhline(y=0, color='k',linewidth=0.5)
axs[1].plot(outW,label='W')
axs[1].plot(outB,label='B')
axs[1].plot(outH,label='H')
axs[1].plot(outO,label='O')
axs[1].set_title('Out-group')
axs[1].grid(True, which='both')
axs[1].legend()
axs[1].axhline(y=0, color='k',linewidth=0.5)
axs[1].set_ylim(-1,1)

Run simulation 1000 times in order to get a more stable average

In [None]:
inWP = np.zeros((2,11))
inBP = inWP.copy()
inHP = inWP.copy()
inOP = inWP.copy()
outWP = inWP.copy()
outBP = inWP.copy()
outHP = inWP.copy()
outOP = inWP.copy()
inWN = inWP.copy()
inBN = inWP.copy()
inHN = inWP.copy()
inON = inWP.copy()
outWN = inWP.copy()
outBN = inWP.copy()
outHN = inWP.copy()
outON = inWP.copy()

In [None]:
iterations = 10
scoresRaceP = np.zeros((iterations+1,12,nFolds))
scoresRaceN = scoresRaceP.copy()

statedfs = [featuredfP,featuredfN]
scores = [scoresRaceP,scoresRaceN]
labelSeries = [labelSeriesP,labelSeriesN]
traininds = [whiteind,blackind,hispind,otherind,allind,allind,allind,allind,BHOind,WHOind,WBOind,WBHind]
testinds = [whiteind,blackind,hispind,otherind,whiteind,blackind,hispind,otherind,whiteind,blackind,hispind,otherind]

start = 8
end = start + iterations + 1

index = np.arange(start,end)
columns = ['W/W','B/B','H/H','O/O','A/W','A/B','A/H','A/O','BHO/W','WHO/B','WBO/H','WBH/O']

In [None]:
for run in range(1002):
    if run%25 == 0:
        print(run)
    # Run simulation for each increment
    for n in range(start,end):
        # For each state (positive, negative)
        for s in range(len(statedfs)):
            # For each combination
            for g in range(len(traininds)):
                # get training data
                features = filterByIndex(statedfs[s],traininds[g])
                groups = features.index.values
                features = features.values
                labels = filterByIndex(labelSeries[s],traininds[g]).values
                sample = np.random.choice(np.arange(len(labels)),n*100)
                features = features[sample]
                labels = labels[sample]
                groups = groups[sample]

                # get testing data
                predFeatures = filterByIndex(statedfs[s],testinds[g]).values
                predLabels = filterByIndex(labelSeries[s],testinds[g]).values

                # Repeat for k folds
                for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                    # Train a model
                    _, model = classify(features[train],labels[train],features[test])
                    # Predict
                    preds = model.predict(predFeatures)
                    rho = evaluateSpearman(predLabels,preds)
                    scores[s][n-start,g,i] = rho
    dfP = pd.DataFrame(data=np.mean(scoresRaceP,axis=2),index=index,columns=columns)
    dfN = pd.DataFrame(data=np.mean(scoresRaceN,axis=2),index=index,columns=columns)
    
    outOP[run%2,:] = (dfP['WBH/O'] - dfP['O/O'])/(dfP['O/O']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/outOP.csv", "a")
        np.savetxt(f, outOP,delimiter=',')
        f.close()
        
    outON[run%2,:] = (dfN['WBH/O'] - dfN['O/O'])/(dfN['O/O']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/outON.csv", "a")
        np.savetxt(f, outON,delimiter=',')
        f.close()
    
    outHP[run%2,:] = (dfP['WBO/H'] - dfP['H/H'])/(dfP['H/H']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/outHP.csv", "a")
        np.savetxt(f, outHP,delimiter=',')
        f.close()
        
    outHN[run%2,:] = (dfN['WBO/H'] - dfN['H/H'])/(dfN['H/H']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/outHN.csv", "a")
        np.savetxt(f, outHN,delimiter=',')
        f.close()
    
    outWP[run%2,:] = (dfP['BHO/W'] - dfP['W/W'])/(dfP['W/W']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/outWP.csv", "a")
        np.savetxt(f, outWP,delimiter=',')
        f.close()
        
    outWN[run%2,:] = (dfN['BHO/W'] - dfN['W/W'])/(dfN['W/W']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/outWN.csv", "a")
        np.savetxt(f, outWN,delimiter=',')
        f.close()
        
    outBP[run%2,:] = (dfP['WHO/B'] - dfP['B/B'])/(dfP['B/B']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/outBP.csv", "a")
        np.savetxt(f, outBP,delimiter=',')
        f.close()
        
    outBN[run%2,:] = (dfN['WHO/B'] - dfN['B/B'])/(dfN['B/B']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/outBN.csv", "a")
        np.savetxt(f, outBN,delimiter=',')
        f.close()
    
    inWP[run%2,:] = (dfP['W/W'] - dfP['A/W'])/(dfP['A/W']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/inWP.csv", "a")
        np.savetxt(f, inWP,delimiter=',')
        f.close()
        
    inWN[run%2,:] = (dfN['W/W'] - dfN['A/W'])/(dfN['A/W']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/inWN.csv", "a")
        np.savetxt(f, inWN,delimiter=',')
        f.close()
        
    inBP[run%2,:] = (dfP['B/B'] - dfP['A/B'])/(dfP['A/B']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/inBP.csv", "a")
        np.savetxt(f, inBP,delimiter=',')
        f.close()
        
    inBN[run%2,:] = (dfN['B/B'] - dfN['A/B'])/(dfN['A/B']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/inBN.csv", "a")
        np.savetxt(f, inBN,delimiter=',')
        f.close()
        
    inHP[run%2,:] = (dfP['H/H'] - dfP['A/H'])/(dfP['A/H']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/inHP.csv", "a")
        np.savetxt(f, inHP,delimiter=',')
        f.close()
        
    inHN[run%2,:] = (dfN['H/H'] - dfN['A/H'])/(dfN['A/H']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/inHN.csv", "a")
        np.savetxt(f, inHN,delimiter=',')
        f.close()
        
    inOP[run%2,:] = (dfP['O/O'] - dfP['A/O'])/(dfP['A/O']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/inOP.csv", "a")
        np.savetxt(f, inOP,delimiter=',')
        f.close()
        
    inON[run%2,:] = (dfN['O/O'] - dfN['A/O'])/(dfN['A/O']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Race/inON.csv", "a")
        np.savetxt(f, inON,delimiter=',')
        f.close()

# Usage clusters

Do a 100-run simulation using the clusters as the groups.

In [8]:
# load activity data for clusters
activityFile = '../Data (Algebra 1)/YearUsageActivity.csv'
activitydf = readData(activityFile)
# normalize activity data
activitydf = replaceDataFrameOutliers(activitydf,['sessionTime','NumSessions'])
activitydf, scalers = scaleDataframe(activitydf)

In [9]:
# get usage clusters
model = joblib.load('kmeans.joblib')
clusterSeries = pd.Series(model.predict(activitydf.values),index=activitydf.index)



In [10]:
# get indices for clusters
c0ind = clusterSeries.loc[clusterSeries == 0].index
c1ind = clusterSeries.loc[clusterSeries == 1].index
c2ind = clusterSeries.loc[clusterSeries == 2].index
c3ind = clusterSeries.loc[clusterSeries == 3].index
c4ind = clusterSeries.loc[clusterSeries == 4].index

In [11]:
# combinations for simulation
c0123ind = c0ind.union(c1ind).union(c2ind).union(c3ind)
c0234ind = c0ind.union(c4ind).union(c2ind).union(c3ind)
c0134ind = c0ind.union(c1ind).union(c4ind).union(c3ind)
c0124ind = c0ind.union(c1ind).union(c2ind).union(c4ind)
c1234ind = c4ind.union(c1ind).union(c2ind).union(c3ind)

In [12]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [c0ind,c1ind,c2ind,c3ind,c4ind,c0123ind,c0234ind,c0134ind,c0124ind,c1234ind]
for s in range(len(statedfs)):
    for g1 in range(len(groupinds)):
        print(filterByIndex(statedfs[s],groupinds[g1]).values.shape)

(4181, 22)
(9170, 22)
(39705, 22)
(6512, 22)
(2124, 22)
(59568, 22)
(52522, 22)
(21987, 22)
(55180, 22)
(57511, 22)
(3043, 22)
(6596, 22)
(28984, 22)
(4779, 22)
(1552, 22)
(43402, 22)
(38358, 22)
(15970, 22)
(40175, 22)
(41911, 22)


Run simulation 100 times in order to get a more stable average

In [12]:
in0P = np.zeros((2,11))
in1P = in0P.copy()
in2P = in0P.copy()
in3P = in0P.copy()
in4P = in0P.copy()
in0N = in0P.copy()
in1N = in0P.copy()
in2N = in0P.copy()
in3N = in0P.copy()
in4N = in0P.copy()
out0P = in0P.copy()
out1P = in0P.copy()
out2P = in0P.copy()
out3P = in0P.copy()
out4P = in0P.copy()
out0N = in0P.copy()
out1N = in0P.copy()
out2N = in0P.copy()
out3N = in0P.copy()
out4N = in0P.copy()

In [26]:
iterations = 17
#scoresUseP = np.zeros((iterations+1,15,nFolds))
scoresUseP = np.zeros((iterations,15,nFolds))
scoresUseN = scoresUseP.copy()

statedfs = [featuredfP,featuredfN]
scores = [scoresUseP,scoresUseN]
labelSeries = [labelSeriesP,labelSeriesN]
traininds = [c0ind,c1ind,c2ind,c3ind,c4ind,allind,allind,allind,allind,allind,c0123ind,c0124ind,c0134ind,c0234ind,c1234ind]
testinds = [c0ind,c1ind,c2ind,c3ind,c4ind,c0ind,c1ind,c2ind,c3ind,c4ind,c4ind,c3ind,c2ind,c1ind,c0ind]

start = 20
end = start + iterations + 1

index = np.arange(start*100,end*100,500)
columns = ['0/0','1/1','2/2','3/3','4/4','A/0','A/1','A/2','A/3','A/4','0123/4','0124/3','0134/2','0234/1','1234/0']

In [27]:
iterations = 17
start = 2000
end = 10500
index = np.arange(start,end,500)

In [29]:
for run in range(11):
    #if run%1 == 0:
    print(run)
    # Run simulation for each increment
    for n in index:
        # For each state (positive, negative)
        for s in range(len(statedfs)):
            # For each combination
            for g in range(len(traininds)):
                # get training data
                features = filterByIndex(statedfs[s],traininds[g])
                groups = features.index.values
                features = features.values
                labels = filterByIndex(labelSeries[s],traininds[g]).values
                sample = np.random.choice(np.arange(len(labels)),n)
                features = features[sample]
                labels = labels[sample]
                groups = groups[sample]

                # get testing data
                predFeatures = filterByIndex(statedfs[s],testinds[g]).values
                predLabels = filterByIndex(labelSeries[s],testinds[g]).values

                # Repeat for k folds
                for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                    # Train a model
                    _, model = classify(features[train],labels[train],features[test])
                    # Predict
                    preds = model.predict(predFeatures)
                    rho = evaluateSpearman(predLabels,preds)
                    scores[s][int((n-start)/500),g,i] = rho
    dfP = pd.DataFrame(data=np.mean(scoresUseP,axis=2),index=index,columns=columns)
    dfN = pd.DataFrame(data=np.mean(scoresUseN,axis=2),index=index,columns=columns)
    
    in0P[run%2,:] = (dfP['0/0'] - dfP['A/0'])/(dfP['A/0']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in0P.csv", "a")
        np.savetxt(f, in0P,delimiter=',')
        f.close()
    in1P[run%2,:] = (dfP['1/1'] - dfP['A/1'])/(dfP['A/1']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in1P.csv", "a")
        np.savetxt(f, in1P,delimiter=',')
        f.close()
    in2P[run%2,:] = (dfP['2/2'] - dfP['A/2'])/(dfP['A/2']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in2P.csv", "a")
        np.savetxt(f, in2P,delimiter=',')
        f.close()
    in3P[run%2,:] = (dfP['3/3'] - dfP['A/3'])/(dfP['A/3']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in3P.csv", "a")
        np.savetxt(f, in3P,delimiter=',')
        f.close()
    in4P[run%2,:] = (dfP['4/4'] - dfP['A/4'])/(dfP['A/4']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in4P.csv", "a")
        np.savetxt(f, in4P,delimiter=',')
        f.close()
    in0N[run%2,:] = (dfN['0/0'] - dfN['A/0'])/(dfN['A/0']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in0N.csv", "a")
        np.savetxt(f, in0N,delimiter=',')
        f.close()
    in1N[run%2,:] = (dfN['1/1'] - dfN['A/1'])/(dfN['A/1']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in1N.csv", "a")
        np.savetxt(f, in1N,delimiter=',')
        f.close()
    in2N[run%2,:] = (dfN['2/2'] - dfN['A/2'])/(dfN['A/2']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in2N.csv", "a")
        np.savetxt(f, in2N,delimiter=',')
        f.close()
    in3N[run%2,:] = (dfN['3/3'] - dfN['A/3'])/(dfN['A/3']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in3N.csv", "a")
        np.savetxt(f, in3N,delimiter=',')
        f.close()
    in4N[run%2,:] = (dfN['4/4'] - dfN['A/4'])/(dfN['A/4']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/in4N.csv", "a")
        np.savetxt(f, in4N,delimiter=',')
        f.close()
    out0P[run%2,:] = (dfP['1234/0'] - dfP['0/0'])/(dfP['0/0']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out0P.csv", "a")
        np.savetxt(f, out0P,delimiter=',')
        f.close()
    out1P[run%2,:] = (dfP['0234/1'] - dfP['1/1'])/(dfP['1/1']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out1P.csv", "a")
        np.savetxt(f, out1P,delimiter=',')
        f.close()
    out2P[run%2,:] = (dfP['0134/2'] - dfP['2/2'])/(dfP['2/2']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out2P.csv", "a")
        np.savetxt(f, out2P,delimiter=',')
        f.close()
    out3P[run%2,:] = (dfP['0124/3'] - dfP['3/3'])/(dfP['3/3']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out3P.csv", "a")
        np.savetxt(f, out3P,delimiter=',')
        f.close()
    out4P[run%2,:] = (dfP['0123/4'] - dfP['4/4'])/(dfP['4/4']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out4P.csv", "a")
        np.savetxt(f, out4P,delimiter=',')
        f.close()
    out0N[run%2,:] = (dfN['1234/0'] - dfN['0/0'])/(dfN['0/0']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out0N.csv", "a")
        np.savetxt(f, out0N,delimiter=',')
        f.close()
    out1N[run%2,:] = (dfN['0234/1'] - dfN['1/1'])/(dfN['1/1']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out1N.csv", "a")
        np.savetxt(f, out1N,delimiter=',')
        f.close()
    out2N[run%2,:] = (dfN['0134/2'] - dfN['2/2'])/(dfN['2/2']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out2N.csv", "a")
        np.savetxt(f, out2N,delimiter=',')
        f.close()
    out3N[run%2,:] = (dfN['0124/3'] - dfN['3/3'])/(dfN['3/3']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out3N.csv", "a")
        np.savetxt(f, out3N,delimiter=',')
        f.close()
    out4N[run%2,:] = (dfN['0123/4'] - dfN['4/4'])/(dfN['4/4']).values
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Use big/out4N.csv", "a")
        np.savetxt(f, out4N,delimiter=',')
        f.close()

0


ValueError: could not broadcast input array from shape (17) into shape (11)

In [24]:
(n-start)/500

0.0

# Demographics clusters

Do a 100-run simulation using the clusters as the groups.

In [13]:
# get usage clusters
model = joblib.load('kmeans demo only.joblib')
clusterSeries = pd.Series(model.predict(demodf.values),index=demodf.index)

In [14]:
# get indices for clusters
c0ind = clusterSeries.loc[clusterSeries == 0].index
c1ind = clusterSeries.loc[clusterSeries == 1].index
c2ind = clusterSeries.loc[clusterSeries == 2].index
c3ind = clusterSeries.loc[clusterSeries == 3].index
c4ind = clusterSeries.loc[clusterSeries == 4].index
c5ind = clusterSeries.loc[clusterSeries == 5].index
c6ind = clusterSeries.loc[clusterSeries == 6].index

In [15]:
# combinations for simulation
c012345ind = c0ind.union(c1ind).union(c2ind).union(c3ind).union(c4ind).union(c5ind)
c012346ind = c0ind.union(c1ind).union(c2ind).union(c3ind).union(c4ind).union(c6ind)
c012356ind = c0ind.union(c1ind).union(c2ind).union(c3ind).union(c6ind).union(c5ind)
c012456ind = c0ind.union(c1ind).union(c2ind).union(c6ind).union(c4ind).union(c5ind)
c013456ind = c0ind.union(c1ind).union(c6ind).union(c3ind).union(c4ind).union(c5ind)
c023456ind = c0ind.union(c6ind).union(c2ind).union(c3ind).union(c4ind).union(c5ind)
c123456ind = c6ind.union(c1ind).union(c2ind).union(c3ind).union(c4ind).union(c5ind)

In [17]:
statedfs = [featuredfP,featuredfN]
labelSeries = [labelSeriesP,labelSeriesN]
groupinds = [c0ind,c1ind,c2ind,c3ind,c4ind,c5ind,c6ind,c012345ind,c023456ind,c013456ind,c012456ind,c123456ind,c012356ind,c012346ind]
for s in range(len(statedfs)):
    for g1 in range(len(groupinds)):
        print(filterByIndex(statedfs[s],groupinds[g1]).values.shape)

(1932, 22)
(10060, 22)
(8985, 22)
(16503, 22)
(7841, 22)
(9091, 22)
(7280, 22)
(54412, 22)
(51632, 22)
(52707, 22)
(45189, 22)
(59760, 22)
(53851, 22)
(52601, 22)
(1339, 22)
(7453, 22)
(6609, 22)
(11963, 22)
(5631, 22)
(6685, 22)
(5274, 22)
(39680, 22)
(37501, 22)
(38345, 22)
(32991, 22)
(43615, 22)
(39323, 22)
(38269, 22)


Run simulation 100 times in order to get a more stable average

In [28]:
in0P = np.zeros((2,11))
in1P = in0P.copy()
in2P = in0P.copy()
in3P = in0P.copy()
in4P = in0P.copy()
in5P = in0P.copy()
in6P = in0P.copy()
in0N = in0P.copy()
in1N = in0P.copy()
in2N = in0P.copy()
in3N = in0P.copy()
in4N = in0P.copy()
in5N = in0P.copy()
in6N = in0P.copy()
out0P = in0P.copy()
out1P = in0P.copy()
out2P = in0P.copy()
out3P = in0P.copy()
out4P = in0P.copy()
out5P = in0P.copy()
out6P = in0P.copy()
out0N = in0P.copy()
out1N = in0P.copy()
out2N = in0P.copy()
out3N = in0P.copy()
out4N = in0P.copy()
out5N = in0P.copy()
out6N = in0P.copy()

In [29]:
iterations = 10
scoresUseP = np.zeros((iterations+1,21,nFolds))
scoresUseN = scoresUseP.copy()

statedfs = [featuredfP,featuredfN]
scores = [scoresUseP,scoresUseN]
labelSeries = [labelSeriesP,labelSeriesN]
traininds = [c0ind,c1ind,c2ind,c3ind,c4ind,c5ind,c6ind,allind,allind,allind,allind,allind,allind,allind,c012345ind,c012346ind,c012356ind,c012456ind,c013456ind,c023456ind,c123456ind]
testinds = [c0ind,c1ind,c2ind,c3ind,c4ind,c5ind,c6ind,c0ind,c1ind,c2ind,c3ind,c4ind,c5ind,c6ind,c6ind,c5ind,c4ind,c3ind,c2ind,c1ind,c0ind]

start = 5
end = start + iterations + 1

index = np.arange(start*100,end*100,100)
columns = ['0/0','1/1','2/2','3/3','4/4','5/5','6/6','A/0','A/1','A/2','A/3','A/4','A/5','A/6','012345/6','012346/5','012356/4','012456/3','013456/2','023456/1','123456/0']

In [None]:
iterations = 17
start = 2000
end = 10500
index = np.arange(start,end,500)

In [31]:
for run in range(11):
    if run%1 == 0:
        print(run)
    # Run simulation for each increment
    for n in index:
        # For each state (positive, negative)
        for s in range(len(statedfs)):
            # For each combination
            for g in range(len(traininds)):
                # get training data
                features = filterByIndex(statedfs[s],traininds[g])
                groups = features.index.values
                features = features.values
                labels = filterByIndex(labelSeries[s],traininds[g]).values
                sample = np.random.choice(np.arange(len(labels)),n)
                features = features[sample]
                labels = labels[sample]
                groups = groups[sample]

                # get testing data
                predFeatures = filterByIndex(statedfs[s],testinds[g]).values
                predLabels = filterByIndex(labelSeries[s],testinds[g]).values

                # Repeat for k folds
                for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
                    # Train a model
                    _, model = classify(features[train],labels[train],features[test])
                    # Predict
                    preds = model.predict(predFeatures)
                    rho = evaluateSpearman(predLabels,preds)
                    scores[s][int((n-start)/100),g,i] = rho
    dfP = pd.DataFrame(data=np.mean(scoresUseP,axis=2),index=index,columns=columns)
    dfN = pd.DataFrame(data=np.mean(scoresUseN,axis=2),index=index,columns=columns)
    
    in0P[run%2,:] = (dfP['0/0'] - dfP['A/0'])/(dfP['A/0']).values
    in1P[run%2,:] = (dfP['1/1'] - dfP['A/1'])/(dfP['A/1']).values
    in2P[run%2,:] = (dfP['2/2'] - dfP['A/2'])/(dfP['A/2']).values
    in3P[run%2,:] = (dfP['3/3'] - dfP['A/3'])/(dfP['A/3']).values
    in4P[run%2,:] = (dfP['4/4'] - dfP['A/4'])/(dfP['A/4']).values
    in5P[run%2,:] = (dfP['5/5'] - dfP['A/5'])/(dfP['A/5']).values
    in6P[run%2,:] = (dfP['6/6'] - dfP['A/6'])/(dfP['A/6']).values
    in0N[run%2,:] = (dfN['0/0'] - dfN['A/0'])/(dfN['A/0']).values
    in1N[run%2,:] = (dfN['1/1'] - dfN['A/1'])/(dfN['A/1']).values
    in2N[run%2,:] = (dfN['2/2'] - dfN['A/2'])/(dfN['A/2']).values
    in3N[run%2,:] = (dfN['3/3'] - dfN['A/3'])/(dfN['A/3']).values
    in4N[run%2,:] = (dfN['4/4'] - dfN['A/4'])/(dfN['A/4']).values
    in5N[run%2,:] = (dfN['5/5'] - dfN['A/5'])/(dfN['A/5']).values
    in6N[run%2,:] = (dfN['6/6'] - dfN['A/6'])/(dfN['A/6']).values
    
    out0P[run%2,:] = (dfP['123456/0'] - dfP['0/0'])/(dfP['0/0']).values
    out1P[run%2,:] = (dfP['023456/1'] - dfP['1/1'])/(dfP['1/1']).values
    out2P[run%2,:] = (dfP['013456/2'] - dfP['2/2'])/(dfP['2/2']).values
    out3P[run%2,:] = (dfP['012456/3'] - dfP['3/3'])/(dfP['3/3']).values
    out4P[run%2,:] = (dfP['012356/4'] - dfP['4/4'])/(dfP['4/4']).values
    out5P[run%2,:] = (dfP['012346/5'] - dfP['5/5'])/(dfP['5/5']).values
    out6P[run%2,:] = (dfP['012345/6'] - dfP['6/6'])/(dfP['6/6']).values
    out0N[run%2,:] = (dfN['123456/0'] - dfN['0/0'])/(dfN['0/0']).values
    out1N[run%2,:] = (dfN['023456/1'] - dfN['1/1'])/(dfN['1/1']).values
    out2N[run%2,:] = (dfN['013456/2'] - dfN['2/2'])/(dfN['2/2']).values
    out3N[run%2,:] = (dfN['012456/3'] - dfN['3/3'])/(dfN['3/3']).values
    out4N[run%2,:] = (dfN['012356/4'] - dfN['4/4'])/(dfN['4/4']).values
    out5N[run%2,:] = (dfN['012346/5'] - dfN['5/5'])/(dfN['5/5']).values
    out6N[run%2,:] = (dfN['012345/6'] - dfN['6/6'])/(dfN['6/6']).values
    
    if run%2 == 1:
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in0P.csv", "a")
        np.savetxt(f, in0P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out0P.csv", "a")
        np.savetxt(f, out0P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in0N.csv", "a")
        np.savetxt(f, in0N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out0N.csv", "a")
        np.savetxt(f, out0N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in1P.csv", "a")
        np.savetxt(f, in1P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out1P.csv", "a")
        np.savetxt(f, out1P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in1N.csv", "a")
        np.savetxt(f, in1N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out1N.csv", "a")
        np.savetxt(f, out1N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in2P.csv", "a")
        np.savetxt(f, in2P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out2P.csv", "a")
        np.savetxt(f, out2P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in2N.csv", "a")
        np.savetxt(f, in2N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out2N.csv", "a")
        np.savetxt(f, out2N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in3P.csv", "a")
        np.savetxt(f, in3P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out3P.csv", "a")
        np.savetxt(f, out3P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in3N.csv", "a")
        np.savetxt(f, in3N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out3N.csv", "a")
        np.savetxt(f, out3N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in4P.csv", "a")
        np.savetxt(f, in4P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out4P.csv", "a")
        np.savetxt(f, out4P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in4N.csv", "a")
        np.savetxt(f, in4N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out4N.csv", "a")
        np.savetxt(f, out4N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in5P.csv", "a")
        np.savetxt(f, in5P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out5P.csv", "a")
        np.savetxt(f, out5P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in5N.csv", "a")
        np.savetxt(f, in5N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out5N.csv", "a")
        np.savetxt(f, out5N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in6P.csv", "a")
        np.savetxt(f, in6P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out6P.csv", "a")
        np.savetxt(f, out6P,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/in6N.csv", "a")
        np.savetxt(f, in6N,delimiter=',')
        f.close()
        f = open("../Data (Algebra 1)/Demographics/Simulations (new out)/Demographics/out6N.csv", "a")
        np.savetxt(f, out6N,delimiter=',')
        f.close()
        
    

0
25
50
75
100
