In [2]:
import numpy as np
import random
import pandas as pd
#import bnlearn as bn
import random
#mod = bn.import_DAG("fai_mod.bif")


# Set up Experimental Parameters

We outline here what the simulation parameters are, focusing on:
1. Sparsity of the $\beta_j$s (the main effects) and $\beta_z$s (the interaction effects), defined as $sp_j$ and $sp_z$ respectively
2. Magnitude of the $\beta_j$s and $\beta_z$s (as a multiplier defining the maximum value, defined as $m_j$ and $m_z$ respectively
3. What the context conditions are
4. What the conjoint conditions are

For now, 3. and 4. are fixed to the conditions in our current experiment, and we also assume we're not drawing from a real-world distribution (as we are in the experiment we are running).

Given 1. and 2. above, worth now an example to make explicit how we sample the coefficients. Let's assume we have a conjoint variable **age** with 3 levels: "Under 18", "18-20", and "20+". We have 3 coefficients to set (for main effects), one for each of the levels (at least in our current assumed true model). Then we would sample as follows:
- If we allow all 3 of them to potentially be non-zero, then we will end up with something that is not identifiable. So, the first step is to fix the first level (here, "Under 18") to 0.
- We then, for the remaining two coefficients:
    - Draw $b~\mathrm{Bern}(sp_j)$
    - If $b=0$, set the coefficient to 0, otherwise, set the coefficient to $m_j*\mathrm{Unif(0,1)}$


## Outstanding Questions
- Are we messing anything up by fixing the first coefficient?
- Does the sampling of $\mathbf{b_z}$ make sense? Does it have to somehow be conditioned on $\mathbf{\beta_j}$?

In [17]:
# sp_j: % of b_js (main effects) that are exactly 0
BJ_SPARSITY_OPTIONS = [0.1, 0.5]

# sp_z: % of b_zs (interaction effects) that are exactly 0
BZ_SPARSITY_OPTIONS = [0.1, 0.5, .9]

# m_j
BJ_MAGNITUDE_OPTIONS = [1,5]

# m_z
BZ_MAGNITUDE_OPTIONS = [1,5]

SIGMA_MULTIPLIER = [.01, .1, .5]

N_RESPONDENTS = [400] #[400,1000]

# Context effects - the context effect name and then the number of levels
context_factors = {
    "context_1" : 2,
    "context_2" : 4
}


# Conjoint variables
conjoint_factors={
    'gender':2,
    'case_note':3,
    'race':3,
    'length':4,
    'age':4,
    'setting':4,
    'goal':5,
    'n_removal':3,
    'reason':5
}

# Utility code

## Utilities for Simulation

In [4]:
### Randomly select and one-hot-encode contexts

def gen_clist(con_dict,n_response): #generates a random choice of contexts for an experiment
    empty=[]
    count=0
    for i in con_dict.values():
        empty=np.concatenate([empty,np.random.choice(i,n_response)])
        count+=1
    empty=np.array(empty)
    return empty.reshape(count,n_response).T

def oh_c(gen_c,con_dict): #one hot encode context list
    final=[]
    con_v=[x for x in con_dict.values()]
    #print(con_v)
    for i in gen_c:
        temp=[]
        for j in range(len(con_v)):
            temp=np.concatenate([temp,np.eye(con_v[j])[int(i[j])]])
        final.append(temp)
    return final

In [5]:
### Ensure identifiable models by setting the first level coefs of b_j to 0

def fix_first_level_bjs(base_bj, test_att): #setting the first level of bjs to 0
    count=0
    for i in [x for x in test_att.values()]:
        base_bj[count]=0
        count+=i
    return base_bj

def fix_first_level_context(base_bz,test_con): #setting the first level of context list to 0
    for i in base_bz:
        count=0
        for j in [x for x in test_con.values()]:
            i[count]=0
            count +=j
    return base_bz

In [6]:
### Conjoint Question construction
def gen_one_profile(c_list): #generate one profile
    final=[]
    og=[]
    for i in c_list: #attribute list, so for each attribute generate a random number that is within the constraint of it
        random_int=np.random.randint(i)
        #print(random_int)
        final=np.concatenate([final,np.eye(i)[random_int]])
        og.append(random_int)
    return final, og

def gen_conjoint_questions(n,n_trial,count_list):
    final_n_dim_array=[] 
    for i in range(n):
        ten_questions=[]
        og=[]
        for j in range(n_trial):
            first_prof=gen_one_profile(count_list)
            second_prof=gen_one_profile(count_list)
            ten_questions.append([first_prof[0],second_prof[0]])
            #print(ten_questions)
            og.append([first_prof[1],second_prof[1]])
        final_n_dim_array.append([ten_questions,og])
    return final_n_dim_array


In [7]:

def gen_sparsity(n,sparsity): #generate sparse lists/matrcies
    return np.random.choice(n,int(n*sparsity),replace=False)#generate sparse indices

def gen_base_betas(n): #generate the baseline level of bjs (since we have different sparsity and multiplers in a sim)
    return np.random.uniform(-1, 1,n)

def gen_bjs(unsparse,sum_count,sparsity,multiplier): #take the base_bjs then alter it base on spar and mult
    spar=gen_sparsity(sum_count,sparsity)
    for i in spar:
        unsparse[i]=0
    return unsparse*multiplier


def gen_base_bzs(v_list, sum_count, con_sum): #generate the baseline level of bzs
    return np.random.choice(v_list, sum_count*con_sum)

def gen_bzs(unsparse,sum_count,con_sum,sparsity,multiplier): #alter bzs
    #unsparse=np.random.choice(v_list, sum_count*con_sum)
    spar=gen_sparsity(sum_count*con_sum,sparsity)
    for i in spar:
        unsparse[i]=0
    return unsparse.reshape(sum_count,con_sum)*multiplier




## Utilities for transforming simulation output to CSVs

In [8]:
from functools import partial


def get_col(pu, name, col_count):
    return pu[name][col_count]

def unwrap_orig(df,att_dict,field_name):
    for col_count, col_name in enumerate(att_dict.keys()):        
        df[col_name]=df.apply(partial(get_col,
                                      name=field_name,
                                      col_count=col_count), 
                              axis=1)
    return df

#unwrap columns with arrays into one element per column
def unwrap_onehot(df,att_dict,field_name): 
    col_count=0
    for col_name,n_vals in att_dict.items():
        for j in range(n_vals):
            df[col_name+str(j)]=df.apply(partial(get_col,
                                      name=field_name,
                                      col_count=col_count), 
                                  axis=1)
            col_count +=1
    return df

In [67]:
col_count=0
for col_name,n_vals in context_factors.items():
        for j in range(n_vals):
            print(col_name + str(j))

context_10
context_11
context_20
context_21
context_22
context_23


In [68]:
def get_col_names(factors):
    temp=[]
    col_count=0
    for col_name,n_vals in factors.items():
            for j in range(n_vals):
                temp.append(col_name + str(j))
    return temp

In [103]:
def get_bzs_names(context, conjoint):
    temp=[]
    for conj_names, conj_vals in conjoint.items():
        for j in range(conj_vals):
            for cont_names, cont_vals in context.items():
                for k in range(cont_vals):
                    temp.append(conj_names+str(j)+(':')+cont_names+str(k))
    return temp

In [104]:
get_bzs_names(context_factors, conjoint_factors)

['gender0:context_10',
 'gender0:context_11',
 'gender0:context_20',
 'gender0:context_21',
 'gender0:context_22',
 'gender0:context_23',
 'gender1:context_10',
 'gender1:context_11',
 'gender1:context_20',
 'gender1:context_21',
 'gender1:context_22',
 'gender1:context_23',
 'case_note0:context_10',
 'case_note0:context_11',
 'case_note0:context_20',
 'case_note0:context_21',
 'case_note0:context_22',
 'case_note0:context_23',
 'case_note1:context_10',
 'case_note1:context_11',
 'case_note1:context_20',
 'case_note1:context_21',
 'case_note1:context_22',
 'case_note1:context_23',
 'case_note2:context_10',
 'case_note2:context_11',
 'case_note2:context_20',
 'case_note2:context_21',
 'case_note2:context_22',
 'case_note2:context_23',
 'race0:context_10',
 'race0:context_11',
 'race0:context_20',
 'race0:context_21',
 'race0:context_22',
 'race0:context_23',
 'race1:context_10',
 'race1:context_11',
 'race1:context_20',
 'race1:context_21',
 'race1:context_22',
 'race1:context_23',
 'ra

In [105]:
#get_col_names(conjoint_factors)

# Perform the simulation

In [9]:
def compute_respondent_profile(profile,bjs, bzs,context, sigma_r):
    # Compute model value for first profile in question
    mu0=np.dot(profile,bjs)+np.dot(np.dot(profile,bzs),context)
    # Noise it up with respondent sigma
    return np.random.normal(mu0,sigma_r)

def generate_simulations_for_condition(
                            sim_prefix,
                            n_respondents, 
                            n_trials_per_respondent, 
                            context_dict, 
                            conjoint_dict, #generate simulations
                            bjs_sparsity, 
                            bjs_multiplier,
                            bzs_sparsity, 
                            bzs_multiplier, 
                            sigma_multiplier,
                            n_sim_dataset):
    
    sim_values = []
    data=[]
    for sim_id in range(n_sim_dataset):
        
        full_sim_id = f"{sim_prefix}_{sim_id}"
        
        # generate one-hot encoded context assigned to each respondent
        context_list = gen_clist(context_dict,n_respondents)
        one_hot_con_list = oh_c(context_list,context_dict)
        

        # generate all conjoint questions
        conjoint_questions = gen_conjoint_questions(n_respondents,
                                                  n_trials_per_respondent,
                                                  conjoint_dict.values())
        
        # Sample the main effects
        base_bjs = gen_base_betas(sum(conjoint_dict.values()))
        base_bjs = fix_first_level_bjs(base_bjs,conjoint_dict)#fixing first levels of bjs
        bjs = gen_bjs(base_bjs,sum(conjoint_dict.values()),bjs_sparsity,bjs_multiplier)

        # Sample the interaction effects
        base_bzs = gen_base_betas(sum(conjoint_dict.values())*sum(context_dict.values()))
        bzs = gen_bzs(base_bzs, 
                      sum(conjoint_dict.values()),
                      sum(context_dict.values()),
                      bzs_sparsity,
                      bzs_multiplier)
        bzs = fix_first_level_context(bzs,context_dict)
        
        sim_values.append({"sim_id": full_sim_id,
                           "bjs" : bjs,
                           "bzs" : bzs,
                           "sp_j": bjs_sparsity,
                           "sp_z": bzs_sparsity,
                           "m_j" : bjs_multiplier,
                           "m_z" : bzs_multiplier,
                           "sigma" : sigma_multiplier,
                           "n_respondents": n_respondents
                          })

        # for each respondent
        for respondent_id in range(n_respondents):
            
            # get their context assignment
            context=one_hot_con_list[respondent_id]
            
            # get their variability around the truth
            sigma_r=np.random.uniform(0.01,bjs_multiplier*sigma_multiplier)
            
            # get the conjoint questions
            respondent_qs =conjoint_questions[respondent_id][0]
            
            for trial in range(n_trials_per_respondent):
                
                # this question
                cj_question=respondent_qs[trial]
                y0 = compute_respondent_profile(cj_question[0], bjs,bzs,context,sigma_r)
                y1 = compute_respondent_profile(cj_question[1], bjs,bzs,context,sigma_r)

                #pretty sure we concluded the meeting w isys stating it should be deterministic
                respondent_answer = int(y1 >= y0)
                
                data.append({"sim_id" : full_sim_id, 
                             "respondent_id" : respondent_id,
                             "context" : context,
                             "context_orig" : context_list[respondent_id],
                             "profile" : cj_question[0], 
                             "profile_orig" : np.array(conjoint_questions[respondent_id][1][trial][0]),
                             "response" : 1-respondent_answer,
                             "respondent_sigma" : sigma_r,
                             "y" : y0,
                            })
                data.append({"sim_id" : full_sim_id, 
                             "respondent_id" : respondent_id,
                             "context" : context,
                             "context_orig" : context_list[respondent_id],
                             "profile" : cj_question[1], 
                             "profile_orig" : np.array(conjoint_questions[respondent_id][1][trial][1]),
                             "response" : respondent_answer,
                             "respondent_sigma" : sigma_r,
                             "y" :y1
                            })
    return data, sim_values


In [197]:
# sp_j: % of b_js (main effects) that are exactly 0
for sp_j in BJ_SPARSITY_OPTIONS:
    for sp_z in BJ_SPARSITY_OPTIONS:
        for m_j in BJ_MAGNITUDE_OPTIONS:
            for m_z in BZ_MAGNITUDE_OPTIONS:
                for sigma in SIGMA_MULTIPLIER:
                    for n_respondents in N_RESPONDENTS:
                        print("itr")
                        prefix = f"cjsim_{sp_j}_{sp_z}_{m_j}_{m_z}_{sigma}_{n_respondents}"
                        cj_responses, sim_data= generate_simulations_for_condition(
                                                        sim_prefix=prefix,
                                                        n_respondents=n_respondents, 
                                                        n_trials_per_respondent=10, 
                                                        context_dict=context_factors, 
                                                        conjoint_dict=conjoint_factors, 
                                                        bjs_sparsity=sp_j, 
                                                        bjs_multiplier=m_j,
                                                        bzs_sparsity=sp_z, 
                                                        bzs_multiplier=m_z, 
                                                        sigma_multiplier=sigma,
                                                        n_sim_dataset=25)
                        
                        df =pd.DataFrame(cj_responses)
                        dff=pd.DataFrame(data=df['profile_orig'].tolist(),columns=conjoint_factors.keys())
                        dfc=pd.DataFrame(data=df['context_orig'].tolist(),columns=context_factors.keys())
                        dffp=pd.DataFrame(data=df['profile'].tolist(),columns=get_col_names(conjoint_factors))
                        dffc=pd.DataFrame(data=df['context'].tolist(),columns=get_col_names(context_factors))

                        #df = unwrap_orig(df, conjoint_factors,"profile_orig") done
                        #df = unwrap_orig(df, context_factors,"context_orig") done
                        df.drop(columns=["context_orig","profile_orig"],inplace=True) #done
                        #df = unwrap_onehot(df, conjoint_factors,"profile") done
                        #df = unwrap_onehot(df, context_factors,"context") done
                        df.drop(columns=["context","profile"],inplace=True)# done
                        df=pd.concat([df,dff,dfc,dffp,dffc],axis=1)
                        df.to_csv(f"simdata/{prefix}.csv",index=False)

                        sim_df = pd.DataFrame(sim_data)
                        dfsimj=pd.DataFrame(data=sim_df['bjs'].tolist(),columns=get_col_names(conjoint_factors))
                        sim_df=pd.concat([sim_df, dfsimj],axis=1)
                        #sim_df = unwrap_onehot(sim_df, conjoint_factors,"bjs")
                        sim_df.drop(columns='bjs', inplace=True)
                        # deal with bz later
                        sim_df['flat_bz']=sim_df.apply(flatten_bz, axis=1)
                        dfsimz=pd.DataFrame(data=sim_df['flat_bz'].tolist(),columns=get_bzs_names(context_factors, conjoint_factors))
                        sim_df.drop(columns='bzs', inplace=True)
                        sim_df=pd.concat([sim_df,dfsimz],axis=1)
                        sim_df.to_csv(f"sim_params/{prefix}.csv", index=False)
                        
                    #break


itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr
itr


In [137]:
sim_df['bzs'].to_numpy().flatten()[0][0]

array([ 0.        ,  0.22021127,  0.        , -0.93484246, -0.23765094,
       -0.84731623])

In [200]:
sim_df.to_csv('why',index=True)

In [198]:
sim_df

Unnamed: 0,sim_id,sp_j,sp_z,m_j,m_z,sigma,n_respondents,gender0,gender1,case_note0,...,reason3:context_20,reason3:context_21,reason3:context_22,reason3:context_23,reason4:context_10,reason4:context_11,reason4:context_20,reason4:context_21,reason4:context_22,reason4:context_23
0,cjsim_0.5_0.5_5_5_0.5_400_0,0.5,0.5,5,5,0.5,400,0.0,0.0,0.0,...,0.0,-2.221329,0.0,-4.301741,0.0,-4.830788,0.0,4.809397,-1.199827,-3.767102
1,cjsim_0.5_0.5_5_5_0.5_400_1,0.5,0.5,5,5,0.5,400,0.0,-1.662641,0.0,...,0.0,0.0,-2.427798,-3.847197,0.0,0.0,0.0,-0.82497,-0.547948,0.0
2,cjsim_0.5_0.5_5_5_0.5_400_2,0.5,0.5,5,5,0.5,400,0.0,0.866951,0.0,...,0.0,0.0,0.0,-0.953343,0.0,0.0,0.0,-0.294045,1.79878,0.0
3,cjsim_0.5_0.5_5_5_0.5_400_3,0.5,0.5,5,5,0.5,400,0.0,0.0,0.0,...,0.0,2.373125,-4.750167,1.499782,0.0,-0.35854,0.0,0.0,1.405172,0.0
4,cjsim_0.5_0.5_5_5_0.5_400_4,0.5,0.5,5,5,0.5,400,0.0,0.0,0.0,...,0.0,0.0,0.0,0.926724,0.0,4.543865,0.0,0.0,0.0,0.0
5,cjsim_0.5_0.5_5_5_0.5_400_5,0.5,0.5,5,5,0.5,400,0.0,-2.991545,0.0,...,0.0,0.096117,-1.936518,0.0,0.0,0.0,0.0,0.903584,2.051902,2.829849
6,cjsim_0.5_0.5_5_5_0.5_400_6,0.5,0.5,5,5,0.5,400,0.0,3.057367,0.0,...,0.0,-0.844307,0.0,3.582446,0.0,0.0,0.0,0.0,0.0,-0.268754
7,cjsim_0.5_0.5_5_5_0.5_400_7,0.5,0.5,5,5,0.5,400,0.0,0.0,0.0,...,0.0,-2.276995,0.0,0.0,0.0,0.558209,0.0,0.0,3.502909,0.0
8,cjsim_0.5_0.5_5_5_0.5_400_8,0.5,0.5,5,5,0.5,400,0.0,0.0,0.0,...,0.0,-3.4898,-1.217494,0.0,0.0,0.0,0.0,2.968252,-0.647569,0.0
9,cjsim_0.5_0.5_5_5_0.5_400_9,0.5,0.5,5,5,0.5,400,0.0,0.0,0.0,...,0.0,-3.398221,-1.290108,0.0,0.0,0.0,0.0,0.0,0.959121,-2.942275


In [147]:
np.ravel(sim_df['bzs']).tolist()

[array([[ 0.        ,  0.22021127,  0.        , -0.93484246, -0.23765094,
         -0.84731623],
        [ 0.        ,  0.55474428,  0.        ,  0.40087369,  0.87706849,
          0.96440009],
        [ 0.        , -0.36801269,  0.        ,  0.52679772,  0.30778899,
         -0.49784838],
        [ 0.        ,  0.        ,  0.        ,  0.16035871,  0.5592266 ,
         -0.03947584],
        [ 0.        , -0.61757248,  0.        ,  0.19816373, -0.21546623,
          0.98954889],
        [ 0.        , -0.41905719,  0.        , -0.47506777,  0.05720036,
          0.5220443 ],
        [ 0.        , -0.63423414,  0.        ,  0.        ,  0.        ,
         -0.55998382],
        [ 0.        ,  0.        ,  0.        ,  0.46338756, -0.3193283 ,
         -0.20091054],
        [ 0.        ,  0.95969992,  0.        , -0.01910322, -0.14510846,
         -0.98275042],
        [ 0.        , -0.8921119 ,  0.        ,  0.26600483, -0.67435909,
          0.9192043 ],
        [ 0.        , -0.98930

In [191]:
dfsimz=pd.DataFrame(data=sim_df['flat_bz'].tolist(),columns=get_bzs_names(context_factors, conjoint_factors))

In [192]:
dfsimz

Unnamed: 0,gender0:context_10,gender0:context_11,gender0:context_20,gender0:context_21,gender0:context_22,gender0:context_23,gender1:context_10,gender1:context_11,gender1:context_20,gender1:context_21,...,reason3:context_20,reason3:context_21,reason3:context_22,reason3:context_23,reason4:context_10,reason4:context_11,reason4:context_20,reason4:context_21,reason4:context_22,reason4:context_23
0,0.0,0.220211,0.0,-0.934842,-0.237651,-0.847316,0.0,0.554744,0.0,0.400874,...,0.0,-0.271877,-0.968211,-0.218965,0.0,0.93466,0.0,0.302763,-0.274959,0.0
1,0.0,0.535552,0.0,-0.497134,0.0,-0.472906,0.0,0.0,0.0,0.73426,...,0.0,-0.685719,-0.039179,0.918127,0.0,0.287729,0.0,-0.280487,-0.787591,0.034225
2,0.0,0.0,0.0,0.981934,-0.992506,0.624139,0.0,0.929611,0.0,-0.986008,...,0.0,-0.988658,0.0,-0.525687,0.0,0.913928,0.0,-0.786135,-0.009472,0.368158
3,0.0,0.038141,0.0,0.980903,0.362979,-0.762143,0.0,-0.029819,0.0,0.625293,...,0.0,0.0,-0.33937,-0.194438,0.0,-0.058919,0.0,0.909125,0.944487,0.285465
4,0.0,0.021708,0.0,-0.188347,0.214355,-0.681849,0.0,0.315819,0.0,-0.41912,...,0.0,0.37841,0.43372,-0.511099,0.0,-0.457403,0.0,0.941107,0.152121,0.0
5,0.0,0.237285,0.0,-0.614927,0.97203,0.014353,0.0,0.508324,0.0,0.0,...,0.0,0.449707,-0.133285,0.0,0.0,-0.223074,0.0,-0.551438,0.722434,0.850376
6,0.0,0.0,0.0,0.16835,0.0,-0.734001,0.0,0.0,0.0,0.696755,...,0.0,-0.269022,0.516823,0.946274,0.0,-0.48035,0.0,-0.389849,0.0,0.647478
7,0.0,0.039528,0.0,0.730618,-0.174243,-0.168124,0.0,0.007383,0.0,-0.46074,...,0.0,-0.439592,-0.957651,0.194664,0.0,-0.200472,0.0,0.806652,-0.612992,-0.137589
8,0.0,0.243122,0.0,-0.950043,0.06208,-0.9666,0.0,0.430225,0.0,0.334216,...,0.0,0.525234,-0.698272,-0.200041,0.0,-0.596193,0.0,0.715076,-0.279209,-0.451072
9,0.0,0.426635,0.0,-0.449191,0.0,-0.975945,0.0,0.149247,0.0,-0.367331,...,0.0,0.185819,0.0,-0.221789,0.0,0.751468,0.0,0.097519,-0.965789,0.143651


In [161]:
#dfsimz[0][0]

In [159]:
#sim_df['bzs'][0]

In [160]:
#sim_df['bzs'][0]

In [59]:
co=df['context_orig']
co

0         [0.0, 0.0]
1         [0.0, 0.0]
2         [0.0, 0.0]
3         [0.0, 0.0]
4         [0.0, 0.0]
             ...    
199995    [0.0, 2.0]
199996    [0.0, 2.0]
199997    [0.0, 2.0]
199998    [0.0, 2.0]
199999    [0.0, 2.0]
Name: context_orig, Length: 200000, dtype: object

In [60]:
col=co.tolist()

In [164]:
sim_df

Unnamed: 0,sim_id,bzs,sp_j,sp_z,m_j,m_z,sigma,n_respondents,gender0,gender1,...,goal3,goal4,n_removal0,n_removal1,n_removal2,reason0,reason1,reason2,reason3,reason4
0,cjsim_0.1_0.1_1_1_0.01_400_0,"[[0.0, 0.22021127182001266, 0.0, -0.9348424618...",0.1,0.1,1,1,0.01,400,0.0,0.0,...,-0.37144,0.0,0.0,-0.231801,0.279402,0.0,0.0,0.834077,-0.100121,0.68541
1,cjsim_0.1_0.1_1_1_0.01_400_1,"[[0.0, 0.5355521738308759, 0.0, -0.49713361264...",0.1,0.1,1,1,0.01,400,0.0,-0.015514,...,0.899837,-0.253187,0.0,-0.727058,0.323919,0.0,0.670545,-0.36632,-0.881403,-0.877225
2,cjsim_0.1_0.1_1_1_0.01_400_2,"[[0.0, 0.0, 0.0, 0.981933671360466, -0.9925061...",0.1,0.1,1,1,0.01,400,0.0,-0.707867,...,0.382598,0.827022,0.0,0.928021,-0.238889,0.0,0.086393,0.494649,0.79901,0.192317
3,cjsim_0.1_0.1_1_1_0.01_400_3,"[[0.0, 0.03814070493567567, 0.0, 0.98090334960...",0.1,0.1,1,1,0.01,400,0.0,0.725841,...,0.006694,-0.565552,0.0,-0.889688,-0.030586,0.0,0.0,-0.257476,0.40097,0.279027
4,cjsim_0.1_0.1_1_1_0.01_400_4,"[[0.0, 0.021707800661640242, 0.0, -0.188347493...",0.1,0.1,1,1,0.01,400,0.0,-0.32696,...,0.457409,0.101826,0.0,0.183339,-0.234951,0.0,0.347515,0.072041,-0.991602,0.516995
5,cjsim_0.1_0.1_1_1_0.01_400_5,"[[0.0, 0.23728525065172623, 0.0, -0.6149268201...",0.1,0.1,1,1,0.01,400,0.0,0.310525,...,0.121038,-0.305203,0.0,-0.432686,-0.706508,0.0,-0.515563,-0.479262,0.091335,0.417596
6,cjsim_0.1_0.1_1_1_0.01_400_6,"[[0.0, 0.0, 0.0, 0.16835009843509896, 0.0, -0....",0.1,0.1,1,1,0.01,400,0.0,-0.288048,...,-0.405793,0.998452,0.0,-0.803961,0.899541,0.0,-0.361486,0.0,-0.611996,-0.390097
7,cjsim_0.1_0.1_1_1_0.01_400_7,"[[0.0, 0.0395281948791979, 0.0, 0.730617523520...",0.1,0.1,1,1,0.01,400,0.0,-0.504368,...,0.183718,0.346607,0.0,-0.775486,0.0,0.0,0.736319,-0.053378,0.926529,-0.544062
8,cjsim_0.1_0.1_1_1_0.01_400_8,"[[0.0, 0.24312171182550002, 0.0, -0.9500426159...",0.1,0.1,1,1,0.01,400,0.0,-0.755365,...,-0.051414,0.0,0.0,-0.986125,-0.472384,0.0,0.0,-0.150405,0.357813,-0.631606
9,cjsim_0.1_0.1_1_1_0.01_400_9,"[[0.0, 0.42663526718663625, 0.0, -0.4491911385...",0.1,0.1,1,1,0.01,400,0.0,-0.02292,...,0.93116,0.224533,0.0,0.931451,0.961386,0.0,-0.267155,-0.862886,0.760485,-0.113266


In [29]:
pop=po.to_numpy()
pop

array([array([0, 0, 2, 1, 1, 3, 1, 2, 1]),
       array([0, 2, 2, 0, 3, 3, 1, 0, 0]),
       array([1, 0, 2, 0, 3, 0, 1, 0, 3]), ...,
       array([1, 1, 1, 1, 3, 3, 3, 1, 2]),
       array([0, 0, 1, 2, 3, 1, 3, 1, 3]),
       array([0, 1, 1, 1, 2, 0, 0, 1, 3])], dtype=object)

In [55]:
np.array(pol).shape()

TypeError: 'tuple' object is not callable

In [165]:
po

0         [0, 0, 2, 1, 1, 3, 1, 2, 1]
1         [0, 2, 2, 0, 3, 3, 1, 0, 0]
2         [1, 0, 2, 0, 3, 0, 1, 0, 3]
3         [0, 0, 2, 1, 2, 2, 3, 1, 1]
4         [0, 1, 1, 3, 3, 0, 4, 2, 1]
                     ...             
199995    [0, 0, 1, 1, 0, 1, 0, 1, 0]
199996    [0, 0, 1, 0, 0, 0, 1, 1, 3]
199997    [1, 1, 1, 1, 3, 3, 3, 1, 2]
199998    [0, 0, 1, 2, 3, 1, 3, 1, 3]
199999    [0, 1, 1, 1, 2, 0, 0, 1, 3]
Name: profile_orig, Length: 200000, dtype: object

In [37]:
np.squeeze(pop)

array([array([0, 0, 2, 1, 1, 3, 1, 2, 1]),
       array([0, 2, 2, 0, 3, 3, 1, 0, 0]),
       array([1, 0, 2, 0, 3, 0, 1, 0, 3]), ...,
       array([1, 1, 1, 1, 3, 3, 3, 1, 2]),
       array([0, 0, 1, 2, 3, 1, 3, 1, 3]),
       array([0, 1, 1, 1, 2, 0, 0, 1, 3])], dtype=object)

In [76]:
dff=pd.DataFrame(data=df['profile_orig'].tolist(),columns=conjoint_factors.keys())
dff

Unnamed: 0,gender,case_note,race,length,age,setting,goal,n_removal,reason
0,0,0,2,1,1,3,1,2,1
1,0,2,2,0,3,3,1,0,0
2,1,0,2,0,3,0,1,0,3
3,0,0,2,1,2,2,3,1,1
4,0,1,1,3,3,0,4,2,1
...,...,...,...,...,...,...,...,...,...
199995,0,0,1,1,0,1,0,1,0
199996,0,0,1,0,0,0,1,1,3
199997,1,1,1,1,3,3,3,1,2
199998,0,0,1,2,3,1,3,1,3


In [168]:
zz=sim_df['bzs']

In [182]:
#zz[0].flatten()

In [172]:
dfc=pd.DataFrame(data=zz.reshape(1,).tolist())
dfc

AttributeError: 'Series' object has no attribute 'reshape'

In [185]:
def flatten_bz(pu):
    return pu['bzs'].flatten()

In [186]:
sim_df['flat_bz']=sim_df.apply(flatten_bz, axis=1)

In [188]:
sim_df['flat_bz'][0]

array([ 0.        ,  0.22021127,  0.        , -0.93484246, -0.23765094,
       -0.84731623,  0.        ,  0.55474428,  0.        ,  0.40087369,
        0.87706849,  0.96440009,  0.        , -0.36801269,  0.        ,
        0.52679772,  0.30778899, -0.49784838,  0.        ,  0.        ,
        0.        ,  0.16035871,  0.5592266 , -0.03947584,  0.        ,
       -0.61757248,  0.        ,  0.19816373, -0.21546623,  0.98954889,
        0.        , -0.41905719,  0.        , -0.47506777,  0.05720036,
        0.5220443 ,  0.        , -0.63423414,  0.        ,  0.        ,
        0.        , -0.55998382,  0.        ,  0.        ,  0.        ,
        0.46338756, -0.3193283 , -0.20091054,  0.        ,  0.95969992,
        0.        , -0.01910322, -0.14510846, -0.98275042,  0.        ,
       -0.8921119 ,  0.        ,  0.26600483, -0.67435909,  0.9192043 ,
        0.        , -0.98930517,  0.        ,  0.5501941 , -0.48137834,
        0.02390063,  0.        ,  0.        ,  0.        ,  0.  

In [79]:
dff=pd.DataFrame(data=df['profile_orig'].tolist(),columns=conjoint_factors.keys())
dfc=pd.DataFrame(data=df['context_orig'].tolist(),columns=context_factors.keys())
pd.concat([dff, dfc], axis=1)

Unnamed: 0,gender,case_note,race,length,age,setting,goal,n_removal,reason,context_1,context_2
0,0,0,2,1,1,3,1,2,1,0.0,0.0
1,0,2,2,0,3,3,1,0,0,0.0,0.0
2,1,0,2,0,3,0,1,0,3,0.0,0.0
3,0,0,2,1,2,2,3,1,1,0.0,0.0
4,0,1,1,3,3,0,4,2,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0,1,1,0,1,0,1,0,0.0,2.0
199996,0,0,1,0,0,0,1,1,3,0.0,2.0
199997,1,1,1,1,3,3,3,1,2,0.0,2.0
199998,0,0,1,2,3,1,3,1,3,0.0,2.0


In [85]:
dffp=pd.DataFrame(data=df['profile'].tolist(),columns=get_col_names(conjoint_factors))
dffp

Unnamed: 0,gender0,gender1,case_note0,case_note1,case_note2,race0,race1,race2,length0,length1,...,goal3,goal4,n_removal0,n_removal1,n_removal2,reason0,reason1,reason2,reason3,reason4
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
199996,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
199997,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
199998,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [86]:
dffc=pd.DataFrame(data=df['context'].tolist(),columns=get_col_names(context_factors))
dffc

Unnamed: 0,context_10,context_11,context_20,context_21,context_22,context_23
0,1.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
199995,1.0,0.0,0.0,0.0,1.0,0.0
199996,1.0,0.0,0.0,0.0,1.0,0.0
199997,1.0,0.0,0.0,0.0,1.0,0.0
199998,1.0,0.0,0.0,0.0,1.0,0.0


In [80]:
dfc2=df.copy()

In [83]:
dffc=pd.concat([dff, dfc], axis=1)

In [84]:
dfc2=pd.concat([dfc2, dffc], axis=1)
dfc2

Unnamed: 0,sim_id,respondent_id,context,context_orig,profile,profile_orig,response,respondent_sigma,y,gender,case_note,race,length,age,setting,goal,n_removal,reason,context_1,context_2
0,cjsim_0.1_0.1_1_5_0.01_400_0,0,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0, 0, 2, 1, 1, 3, 1, 2, 1]",1,0.01,2.145378,0,0,2,1,1,3,1,2,1,0.0,0.0
1,cjsim_0.1_0.1_1_5_0.01_400_0,0,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, ...","[0, 2, 2, 0, 3, 3, 1, 0, 0]",0,0.01,0.984981,0,2,2,0,3,3,1,0,0,0.0,0.0
2,cjsim_0.1_0.1_1_5_0.01_400_0,0,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0]","[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[1, 0, 2, 0, 3, 0, 1, 0, 3]",0,0.01,0.554826,1,0,2,0,3,0,1,0,3,0.0,0.0
3,cjsim_0.1_0.1_1_5_0.01_400_0,0,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0, 0, 2, 1, 2, 2, 3, 1, 1]",1,0.01,2.467168,0,0,2,1,2,2,3,1,1,0.0,0.0
4,cjsim_0.1_0.1_1_5_0.01_400_0,0,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0]","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0, 1, 1, 3, 3, 0, 4, 2, 1]",0,0.01,0.069436,0,1,1,3,3,0,4,2,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,cjsim_0.1_0.1_1_5_0.01_400_24,399,"[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 2.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0, 0, 1, 1, 0, 1, 0, 1, 0]",0,0.01,-2.024655,0,0,1,1,0,1,0,1,0,0.0,2.0
199996,cjsim_0.1_0.1_1_5_0.01_400_24,399,"[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 2.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ...","[0, 0, 1, 0, 0, 0, 1, 1, 3]",1,0.01,5.519726,0,0,1,0,0,0,1,1,3,0.0,2.0
199997,cjsim_0.1_0.1_1_5_0.01_400_24,399,"[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 2.0]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[1, 1, 1, 1, 3, 3, 3, 1, 2]",0,0.01,-3.389967,1,1,1,1,3,3,3,1,2,0.0,2.0
199998,cjsim_0.1_0.1_1_5_0.01_400_24,399,"[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 2.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0, 0, 1, 2, 3, 1, 3, 1, 3]",1,0.01,3.005895,0,0,1,2,3,1,3,1,3,0.0,2.0


In [43]:
#take in profile_orig, context_orig, and bjs to create three dataframes then join them together
frame = [[0, 1, 0], [0, 0, 0], [1, 3, 3], [2, 4, 4]]
numpy_data= np.array(frame)
df_fr = pd.DataFrame(data=numpy_data)
df_fr

#df = pd.DataFrame(data=numpy_data, columns=["column1", "column2", "column3"])
#df = pd.DataFrame(data=numpy_data)
#df

Unnamed: 0,0,1,2
0,0,1,0
1,0,0,0
2,1,3,3
3,2,4,4


In [48]:
numpy_data[0]

array([0, 1, 0])

In [56]:
numpy_data.shape

(4, 3)

In [16]:
t=gen_one_profile(conjoint_factors.values())[1]
t

[0, 0, 2, 1, 3, 3, 0, 2, 1]

# BELOW HERE IS STUFF I DID NOT TOUCH

In [78]:
tp=np.array(test_all,dtype=object)
tp=tp.transpose()
df=pd.DataFrame({'sim_id':tp[0],
                'respondent_id':tp[1],
                 'context':tp[2],
                 'context_orig':tp[3],
                 'profile':tp[4],
                 'profile_orig':tp[5],
                 'selected':tp[6],
                 'bjs_spar':tp[7],
                 'bjs_mul':tp[8],
                 'bzs_spar':tp[9],
                 'bzs_mul':tp[10],
                 'sigma_r':tp[11],
                 'bjs':tp[12],
                 'bzs':tp[13]
                })

In [79]:
df.head()

Unnamed: 0,sim_id,respondent_id,context,context_og,profile,profile_og,selected,bjs_spar,bjs_mul,bzs_spar,bzs_mul,sigma_r,bjs,bzs
0,0,0,"[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0, 1, 0, 3, 1, 2, 0, 0, 1]",1,0.4,3,0.4,3,0.174313,"[0.0, 2.38385493608702, 0.0, 0.0, 2.9137835874...","[[0.0, 2.105092872766182, 0.0, 0.5433197261177..."
1,0,0,"[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0, 0, 2, 3, 1, 0, 2, 1, 3]",1,0.4,3,0.4,3,0.174313,"[0.0, 2.38385493608702, 0.0, 0.0, 2.9137835874...","[[0.0, 2.105092872766182, 0.0, 0.5433197261177..."
2,0,0,"[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","[0, 2, 1, 3, 2, 3, 1, 0, 1]",1,0.4,3,0.4,3,0.174313,"[0.0, 2.38385493608702, 0.0, 0.0, 2.9137835874...","[[0.0, 2.105092872766182, 0.0, 0.5433197261177..."
3,0,0,"[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0, 0, 1, 2, 1, 3, 3, 0, 0]",1,0.4,3,0.4,3,0.174313,"[0.0, 2.38385493608702, 0.0, 0.0, 2.9137835874...","[[0.0, 2.105092872766182, 0.0, 0.5433197261177..."
4,0,0,"[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0, 0, 2, 3, 1, 2, 2, 0, 3]",0,0.4,3,0.4,3,0.174313,"[0.0, 2.38385493608702, 0.0, 0.0, 2.9137835874...","[[0.0, 2.105092872766182, 0.0, 0.5433197261177..."


In [17]:
unwrap_prof(df, test_att)
print(1)
unwrap_prof_og(df, test_att)
print(2)
unwrap_context(df, test_con_list)
print(3)
unwrap_context_og(df, test_con_list)

1
2
3


Unnamed: 0,sim_id,respondent_id,context,context_og,profile,profile_og,selected,bjs_spar,bjs_mul,bzs_spar,...,n_removal,reason,has_social0,has_social1,question_bucket0,question_bucket1,question_bucket2,question_bucket3,has_social,question_bucket
0,0,0,"[0.0, 1.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 2.0]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[1, 1, 2, 0, 2, 1, 0, 1, 4]",0,0.1,1,0.1,...,1,4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0
1,0,0,"[0.0, 1.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 2.0]","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[0, 0, 2, 0, 1, 2, 0, 0, 0]",1,0.1,1,0.1,...,0,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0
2,0,0,"[0.0, 1.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 2.0]","[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, ...","[1, 2, 1, 0, 2, 2, 0, 2, 2]",0,0.1,1,0.1,...,2,2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0
3,0,0,"[0.0, 1.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 2.0]","[0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1, 0, 0, 2, 1, 1, 3, 1, 1]",1,0.1,1,0.1,...,1,1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0
4,0,0,"[0.0, 1.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 2.0]","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[1, 2, 0, 1, 1, 2, 0, 0, 2]",0,0.1,1,0.1,...,0,2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8639995,29,399,"[1.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 3.0]","[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1, 0, 2, 2, 2, 0, 4, 0, 1]",0,0.9,5,0.9,...,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
8639996,29,399,"[1.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 3.0]","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","[0, 2, 1, 3, 1, 0, 2, 0, 0]",1,0.9,5,0.9,...,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
8639997,29,399,"[1.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 3.0]","[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","[1, 2, 1, 3, 3, 1, 2, 2, 0]",0,0.9,5,0.9,...,2,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
8639998,29,399,"[1.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 3.0]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1, 1, 2, 2, 0, 3, 2, 1, 4]",1,0.9,5,0.9,...,1,4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0


In [18]:
list(df.columns)

['sim_id',
 'respondent_id',
 'context',
 'context_og',
 'profile',
 'profile_og',
 'selected',
 'bjs_spar',
 'bjs_mul',
 'bzs_spar',
 'bzs_mul',
 'sigma_r',
 'bjs',
 'bzs',
 'gender0',
 'gender1',
 'case_note0',
 'case_note1',
 'case_note2',
 'race0',
 'race1',
 'race2',
 'length0',
 'length1',
 'length2',
 'length3',
 'age0',
 'age1',
 'age2',
 'age3',
 'setting0',
 'setting1',
 'setting2',
 'setting3',
 'goal0',
 'goal1',
 'goal2',
 'goal3',
 'goal4',
 'n_removal0',
 'n_removal1',
 'n_removal2',
 'reason0',
 'reason1',
 'reason2',
 'reason3',
 'reason4',
 'gender',
 'case_note',
 'race',
 'length',
 'age',
 'setting',
 'goal',
 'n_removal',
 'reason',
 'has_social0',
 'has_social1',
 'question_bucket0',
 'question_bucket1',
 'question_bucket2',
 'question_bucket3',
 'has_social',
 'question_bucket']

In [71]:
df_to_json=df[['bjs_spar','bjs_mul','bzs_spar','bzs_mul','sigma_r','bjs','bzs']]
df_to_json

Unnamed: 0,bjs_spar,bjs_mul,bzs_spar,bzs_mul,sigma_r,bjs,bzs
0,0.1,1,0.1,1,0.748342,"[0, -2, 0, 0, -1, 0, 1, -2, 0, 0, 2, 1, 0, 2, ...","[[0, 1, 0, 2, -2, 0], [0, -1, 0, 2, -1, -1], [..."
1,0.1,1,0.1,1,0.748342,"[0, -2, 0, 0, -1, 0, 1, -2, 0, 0, 2, 1, 0, 2, ...","[[0, 1, 0, 2, -2, 0], [0, -1, 0, 2, -1, -1], [..."
2,0.1,1,0.1,1,0.748342,"[0, -2, 0, 0, -1, 0, 1, -2, 0, 0, 2, 1, 0, 2, ...","[[0, 1, 0, 2, -2, 0], [0, -1, 0, 2, -1, -1], [..."
3,0.1,1,0.1,1,0.748342,"[0, -2, 0, 0, -1, 0, 1, -2, 0, 0, 2, 1, 0, 2, ...","[[0, 1, 0, 2, -2, 0], [0, -1, 0, 2, -1, -1], [..."
4,0.1,1,0.1,1,0.748342,"[0, -2, 0, 0, -1, 0, 1, -2, 0, 0, 2, 1, 0, 2, ...","[[0, 1, 0, 2, -2, 0], [0, -1, 0, 2, -1, -1], [..."
...,...,...,...,...,...,...,...
8639995,0.9,5,0.9,5,4.923496,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0..."
8639996,0.9,5,0.9,5,4.923496,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0..."
8639997,0.9,5,0.9,5,4.923496,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0..."
8639998,0.9,5,0.9,5,4.923496,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0..."


In [73]:
def remove_first_level_bjs(base_bj, test_att):
    acc=[]
    count=0
    for i in [x for x in test_att.values()]:
        acc.append(count)
        
        count+=i
    base_bj=np.delete(base_bj,acc)
    return base_bj
def aflb(pu):
    return remove_first_level_bjs(pu['bjs'],test_att)
df_to_json['bjs_24']=df_to_json.apply(aflb,axis=1)

KeyboardInterrupt: 

In [74]:
df_to_json['bjs_24']

0          [-2, 0, -1, 1, -2, 0, 2, 1, 2, -2, 1, 2, -2, 1...
1          [-2, 0, -1, 1, -2, 0, 2, 1, 2, -2, 1, 2, -2, 1...
2          [-2, 0, -1, 1, -2, 0, 2, 1, 2, -2, 1, 2, -2, 1...
3          [-2, 0, -1, 1, -2, 0, 2, 1, 2, -2, 1, 2, -2, 1...
4          [-2, 0, -1, 1, -2, 0, 2, 1, 2, -2, 1, 2, -2, 1...
                                 ...                        
8639995    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8639996    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8639997    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8639998    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8639999    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: bjs_24, Length: 8640000, dtype: object

In [21]:
df_trim=df.drop(['bjs_spar','bjs_mul','bzs_spar','bzs_mul','sigma_r','bjs','bzs'],axis=1) #df w/o repetitive info

In [131]:
one_sim=df_trim[:72*4000]
one_sim.to_csv('one_trim.csv')

In [122]:
df_trim.to_csv('trim.csv')

In [75]:
bjs24=df_to_json[::72*4000]['bjs_24']
df_bjs24= pd.DataFrame(bjs24.to_numpy())
t=[]
for i in df_bjs24[0]: #df to series
    t.append(i)
pd.DataFrame(t)
df_bjss=pd.DataFrame(t,columns=['gender1','case_note1','case_note_2','race1','race2','length1','length2','length3','age1','age2','age3',
                                                 'setting1','setting2','setting3','goal1','goal2','goal3','goal4','n_removal1','n_removal2','reason1','reason2','reason3','reason4'])
df_bjss.to_csv('bjss_new.csv')

In [77]:
#df_bjss

In [41]:
def remove_first_level_context(bz,test_con):
    acc=[]
    count =0
    temp=[]
    for i in [x for x in test_con.values()]:
        acc.append(count)
        count+=i
    #bz=np.delete(bz,acc)
    for i in bz:
        temp.append(np.delete(i,acc))
    return temp

In [104]:
def aflbz(pu):
    return remove_first_level_context(pu['bzs_96_t'],test_con_list)
#df_to_json['bzs_96']=df_to_json.apply(aflbz,axis=1)

In [57]:
def flat_bzs(pu):
    #pu=np.array(pu)
    return np.array(pu['bzs_96']).flatten()

In [111]:
df_to_json_s=df_to_json[::72*4000]
df_to_json_s

Unnamed: 0,bjs_spar,bjs_mul,bzs_spar,bzs_mul,sigma_r,bjs,bzs,bjs_24
0,0.1,1,0.1,1,0.748342,"[0, -2, 0, 0, -1, 0, 1, -2, 0, 0, 2, 1, 0, 2, ...","[[0, 1, 0, 2, -2, 0], [0, -1, 0, 2, -1, -1], [...","[-2, 0, -1, 1, -2, 0, 2, 1, 2, -2, 1, 2, -2, 1..."
288000,0.1,1,0.1,1,0.17896,"[0, 1, 0, -1, -2, 0, 2, -1, 0, -1, -1, -1, 0, ...","[[0, 2, 0, -1, 2, 2], [0, -2, 0, 2, 1, 2], [0,...","[1, -1, -2, 2, -1, -1, -1, -1, -2, 1, 0, 0, -1..."
576000,0.1,1,0.1,1,0.146496,"[0, -1, 0, -1, 2, 0, -2, 2, 0, 1, 1, 2, 0, 0, ...","[[0, 2, 0, 2, 1, 0], [0, -2, 0, 1, 1, 2], [0, ...","[-1, -1, 2, -2, 2, 1, 1, 2, 0, -1, -1, -1, 1, ..."
864000,0.1,1,0.1,1,0.507031,"[0, -1, 0, 2, -1, 0, -2, 2, 0, -2, -2, 2, 0, -...","[[0, 2, 0, 0, -2, -1], [0, 2, 0, 2, 2, 0], [0,...","[-1, 2, -1, -2, 2, -2, -2, 2, -2, -1, 2, 2, -1..."
1152000,0.1,1,0.1,1,0.41562,"[0, 0, 0, -2, -2, 0, 1, -1, 0, -2, 2, -1, 0, 1...","[[0, 1, 0, -2, -1, 1], [0, -1, 0, -1, 2, 2], [...","[0, -2, -2, 1, -1, -2, 2, -1, 1, -2, -2, -2, -..."
1440000,0.1,1,0.1,1,0.171856,"[0, -2, 0, -1, -2, 0, 1, 0, 0, -1, 1, -2, 0, -...","[[0, 1, 0, 0, -2, -1], [0, 2, 0, 0, -2, -1], [...","[-2, -1, -2, 1, 0, -1, 1, -2, -2, 2, -1, -2, 2..."
1728000,0.1,1,0.1,1,0.185276,"[0, 1, 0, 1, 2, 0, 1, -1, 0, 1, -1, -2, 0, 1, ...","[[0, 2, 0, -1, 0, -2], [0, 2, 0, 2, 0, 0], [0,...","[1, 1, 2, 1, -1, 1, -1, -2, 1, 1, 2, 2, 2, -2,..."
2016000,0.1,1,0.1,1,0.076368,"[0, 2, 0, 1, -1, 0, -2, 1, 0, 1, 1, 2, 0, 0, 1...","[[0, -2, 0, 2, 0, 2], [0, 2, 0, -1, 2, -1], [0...","[2, 1, -1, -2, 1, 1, 1, 2, 0, 1, 2, 2, -2, -2,..."
2304000,0.1,1,0.1,1,0.635129,"[0, 1, 0, 0, 2, 0, 0, -1, 0, -2, -1, 2, 0, -1,...","[[0, 2, 0, 0, 2, -1], [0, -1, 0, 2, 0, 2], [0,...","[1, 0, 2, 0, -1, -2, -1, 2, -1, -2, -1, -1, -2..."
2592000,0.1,1,0.1,1,0.906625,"[0, 1, 0, 2, -2, 0, -1, -2, 0, -1, 0, -2, 0, 1...","[[0, -1, 0, -1, 1, -1], [0, -1, 0, 0, 2, 1], [...","[1, 2, -2, -1, -2, -1, 0, -2, 1, -2, -2, -2, 1..."


In [113]:
df_to_json_s['bzs_96']=df_to_json_s.apply(aflbz,axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_json_s['bzs_96']=df_to_json_s.apply(aflbz,axis=1)


In [99]:
def aflbt(pu):
    return remove_first_level_bzs(pu['bzs'],test_att)

In [97]:
#not neccesary since the first levels of bjs are zeros already, but it helps with matching coefficients in R
def remove_first_level_bzs(base_bz, test_att): 
    acc=[]
    count=0
    for i in [x for x in test_att.values()]:
        acc.append(count)
        
        count+=i
    base_bz=np.delete(base_bz,acc,axis=0)
    return base_bz

In [98]:
#np.delete(df_to_json_s['bzs'][0],0,axis=0)

In [112]:
df_to_json_s['bzs_96_t']=df_to_json_s.apply(aflbt,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_json_s['bzs_96_t']=df_to_json_s.apply(aflbt,axis=1)


In [118]:
#df_to_json_s['bzs'][0]

In [116]:
df_to_json_s['bzs_96_t'][0]

array([[ 0, -1,  0,  2, -1, -1],
       [ 0,  2,  0, -2,  0,  1],
       [ 0, -2,  0, -1, -1,  1],
       [ 0,  0,  0, -1,  0,  1],
       [ 0,  2,  0,  2, -2,  1],
       [ 0, -2,  0,  1, -1, -1],
       [ 0,  2,  0, -2, -2,  1],
       [ 0,  2,  0, -1, -2,  1],
       [ 0, -1,  0,  1,  2, -1],
       [ 0,  2,  0,  2, -2, -2],
       [ 0,  2,  0, -1,  2,  2],
       [ 0, -2,  0,  0,  2, -1],
       [ 0, -2,  0,  2,  0,  2],
       [ 0,  1,  0,  1,  1, -2],
       [ 0,  2,  0, -1, -2,  1],
       [ 0,  1,  0,  2,  2, -1],
       [ 0,  1,  0,  1, -2, -1],
       [ 0, -1,  0,  1,  0,  2],
       [ 0, -2,  0,  2,  2, -2],
       [ 0,  2,  0, -2,  1,  1],
       [ 0,  2,  0,  1, -2,  1],
       [ 0, -1,  0, -2,  2, -2],
       [ 0, -2,  0,  2, -2,  2],
       [ 0, -2,  0, -2,  0,  2]])

In [114]:
df_to_json_s['bzs_96_f']=df_to_json_s.apply(flat_bzs,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_json_s['bzs_96_f']=df_to_json_s.apply(flat_bzs,axis=1)


In [119]:
df_to_json_s['bzs_96_f'][0]

array([-1,  2, -1, -1,  2, -2,  0,  1, -2, -1, -1,  1,  0, -1,  0,  1,  2,
        2, -2,  1, -2,  1, -1, -1,  2, -2, -2,  1,  2, -1, -2,  1, -1,  1,
        2, -1,  2,  2, -2, -2,  2, -1,  2,  2, -2,  0,  2, -1, -2,  2,  0,
        2,  1,  1,  1, -2,  2, -1, -2,  1,  1,  2,  2, -1,  1,  1, -2, -1,
       -1,  1,  0,  2, -2,  2,  2, -2,  2, -2,  1,  1,  2,  1, -2,  1, -1,
       -2,  2, -2, -2,  2, -2,  2, -2, -2,  0,  2])

In [44]:
df_to_json_s['bzs'][0]

array([[ 0,  1,  0,  2, -2,  0],
       [ 0, -1,  0,  2, -1, -1],
       [ 0,  0,  0, -2,  0,  1],
       [ 0,  2,  0, -2,  0,  1],
       [ 0, -2,  0, -1, -1,  1],
       [ 0, -1,  0,  1, -1,  2],
       [ 0,  0,  0, -1,  0,  1],
       [ 0,  2,  0,  2, -2,  1],
       [ 0, -2,  0,  1,  2,  1],
       [ 0, -2,  0,  1, -1, -1],
       [ 0,  2,  0, -2, -2,  1],
       [ 0,  2,  0, -1, -2,  1],
       [ 0,  0,  0,  2,  1, -2],
       [ 0, -1,  0,  1,  2, -1],
       [ 0,  2,  0,  2, -2, -2],
       [ 0,  2,  0, -1,  2,  2],
       [ 0, -2,  0,  1,  0, -1],
       [ 0, -2,  0,  0,  2, -1],
       [ 0, -2,  0,  2,  0,  2],
       [ 0,  1,  0,  1,  1, -2],
       [ 0, -2,  0,  2, -2,  1],
       [ 0,  2,  0, -1, -2,  1],
       [ 0,  1,  0,  2,  2, -1],
       [ 0,  1,  0,  1, -2, -1],
       [ 0, -1,  0,  1,  0,  2],
       [ 0,  0,  0,  1, -1, -2],
       [ 0, -2,  0,  2,  2, -2],
       [ 0,  2,  0, -2,  1,  1],
       [ 0, -1,  0, -2,  0,  1],
       [ 0,  2,  0,  1, -2,  1],
       [ 0

In [45]:
df_to_json_s['bzs_96'][0]

[array([ 1,  2, -2,  0]),
 array([-1,  2, -1, -1]),
 array([ 0, -2,  0,  1]),
 array([ 2, -2,  0,  1]),
 array([-2, -1, -1,  1]),
 array([-1,  1, -1,  2]),
 array([ 0, -1,  0,  1]),
 array([ 2,  2, -2,  1]),
 array([-2,  1,  2,  1]),
 array([-2,  1, -1, -1]),
 array([ 2, -2, -2,  1]),
 array([ 2, -1, -2,  1]),
 array([ 0,  2,  1, -2]),
 array([-1,  1,  2, -1]),
 array([ 2,  2, -2, -2]),
 array([ 2, -1,  2,  2]),
 array([-2,  1,  0, -1]),
 array([-2,  0,  2, -1]),
 array([-2,  2,  0,  2]),
 array([ 1,  1,  1, -2]),
 array([-2,  2, -2,  1]),
 array([ 2, -1, -2,  1]),
 array([ 1,  2,  2, -1]),
 array([ 1,  1, -2, -1]),
 array([-1,  1,  0,  2]),
 array([ 0,  1, -1, -2]),
 array([-2,  2,  2, -2]),
 array([ 2, -2,  1,  1]),
 array([-1, -2,  0,  1]),
 array([ 2,  1, -2,  1]),
 array([-1, -2,  2, -2]),
 array([-2,  2, -2,  2]),
 array([-2, -2,  0,  2])]

In [47]:
#df_bjss

In [66]:
len(df_to_json_s['bzs_96_f'][0])

132

In [62]:
df_bjs24

Unnamed: 0,0
0,"[-2, 0, -1, 1, -2, 0, 2, 1, 2, -2, 1, 2, -2, 1..."
1,"[1, -1, -2, 2, -1, -1, -1, -1, -2, 1, 0, 0, -1..."
2,"[-1, -1, 2, -2, 2, 1, 1, 2, 0, -1, -1, -1, 1, ..."
3,"[-1, 2, -1, -2, 2, -2, -2, 2, -2, -1, 2, 2, -1..."
4,"[0, -2, -2, 1, -1, -2, 2, -1, 1, -2, -2, -2, -..."
5,"[-2, -1, -2, 1, 0, -1, 1, -2, -2, 2, -1, -2, 2..."
6,"[1, 1, 2, 1, -1, 1, -1, -2, 1, 1, 2, 2, 2, -2,..."
7,"[2, 1, -1, -2, 1, 1, 1, 2, 0, 1, 2, 2, -2, -2,..."
8,"[1, 0, 2, 0, -1, -2, -1, 2, -1, -2, -1, -1, -2..."
9,"[1, 2, -2, -1, -2, -1, 0, -2, 1, -2, -2, -2, 1..."


In [120]:
bzs96_f=df_to_json_s['bzs_96_f']
df_bzs96_f= pd.DataFrame(bzs96_f.to_numpy())
t=[]
for i in df_bzs96_f[0]: #df to series
    t.append(i)
pd.DataFrame(t)
df_bzss=pd.DataFrame(t)
#df_bjss.to_csv('bjss_new.csv')

In [123]:
df_bzss.to_csv('bzss.csv')

In [121]:
df_bzss

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-1,2,-1,-1,2,-2,0,1,-2,-1,...,2,-2,-2,2,-2,2,-2,-2,0,2
1,-2,2,1,2,-2,-2,0,2,-1,-1,...,-2,0,-2,0,-2,2,2,-2,2,1
2,-2,1,1,2,0,2,1,-1,2,0,...,0,2,0,-1,1,-1,2,-2,1,1
3,2,2,2,0,1,1,-2,-2,-2,-2,...,1,1,1,1,1,-1,-2,1,-2,1
4,-1,-1,2,2,-1,2,2,1,-2,-2,...,2,-1,-1,0,-2,-1,0,1,1,2
5,2,0,-2,-1,2,-1,-2,-1,1,0,...,2,-1,2,-2,2,-1,-1,-2,0,0
6,2,2,0,0,0,1,-1,2,1,-2,...,0,1,0,2,1,1,1,1,-1,-2
7,2,-1,2,-1,-1,2,-1,1,-1,-2,...,-1,1,1,-1,-2,-1,1,-2,-1,-1
8,-1,2,0,2,-2,2,-2,-1,-1,1,...,2,-2,0,2,-2,1,-2,1,-1,-2
9,-1,0,2,1,-2,-2,2,2,-1,1,...,-1,-1,-2,-1,1,2,2,2,2,2
