The goal of this notebook is to fit the continuous variables for all combinations of the categorical variables and then draw
from the distributions new continuous variables values. These new values will be used later to create a new population.

In [1]:
import os
import copy
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import multivariate_normal

microsimDir = "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM"
dataDir = microsimDir+"/NOTEBOOKS/DATA"

#contains person-year information from a Microsim NHANES simulation
df = pd.read_csv(dataDir+"/nhanes-normality-test-4.csv")

In [2]:
#df.columns.to_list()

In [3]:
categoricalVars = ["gender", "smokingStatus", "raceEthnicity", "statin",'education',
                  'alcoholPerWeek','anyPhysicalActivity','antiHypertensiveCount']
continuousVars = ['age', 'hdl', 'bmi', 'totChol', 'trig', 'a1c', 'ldl', 'waist', 'creatinine', 'sbp', 'dbp']

In [4]:
#for col in categoricalVars:
#    print(df[col].value_counts())

In [6]:
%%time
#not all combinations of categorical variables include 1 or more nhanes persons...so find the combinations that actually
#include 1 or more persons
nCategories = 0
sizesList = list()
for gender in set(df["gender"].tolist()):
    for smoking in set(df["smokingStatus"].tolist()):
        print(gender,smoking)
        for raceEthnicity in set(df["raceEthnicity"].tolist()):
            for statin in set(df["statin"].tolist()):
                for education in set(df["education"].tolist()):
                    for alcoholPerWeek in set(df["alcoholPerWeek"].tolist()):
                        for anyPhysicalActivity in set(df["anyPhysicalActivity"].tolist()):
                            for antiHypertensiveCount in set(df["antiHypertensiveCount"].tolist()):
                                
                                nCategories += 1
                                
                                dfForGroup = df.loc[(df["gender"]==gender) & 
                                                        (df["smokingStatus"]==smoking) &
                                                        (df["raceEthnicity"]==raceEthnicity) &
                                                        (df["statin"]==statin) &
                                                        (df["education"]==education) &
                                                        (df["alcoholPerWeek"]==alcoholPerWeek) &
                                                        (df["anyPhysicalActivity"]==anyPhysicalActivity) &
                                                        (df["antiHypertensiveCount"]==antiHypertensiveCount), :].copy()
                                
                                size = dfForGroup.shape[0]
                                
                                if size>0:
                                    group = [gender, smoking, raceEthnicity, statin, education, alcoholPerWeek, 
                                         anyPhysicalActivity, antiHypertensiveCount]
                                    names = dfForGroup["name"].tolist()
                                    sizesList += [[group, size, names]]
                                                    

1 0
1 1
1 2
2 0
2 1
2 2
CPU times: user 6.05 s, sys: 6.9 ms, total: 6.06 s
Wall time: 6.07 s


In [7]:
groupsList = sizesList
totalPeople = sum(list(map( lambda x: x[1], sizesList)))

In [None]:
#trying this out...for now....but this would apply only on nhanes not on other cases...

with (open(dataDir+"/nhanesMeans-4.csv", "w") as meansFile, open(dataDir+"/nhanesCovs-4.csv","w") as covsFile,
      open(dataDir+"/nhanesMin-4.csv", "w") as minFile, open(dataDir+"/nhanesMax-4.csv", "w") as maxFile):
    pd.DataFrame(categoricalVars+["weight"]+continuousVars).T.to_csv(meansFile, header=False, index=False)
    pd.DataFrame(categoricalVars+continuousVars).T.to_csv(covsFile, header=False, index=False)
    pd.DataFrame(categoricalVars+continuousVars).T.to_csv(minFile, header=False, index=False)
    pd.DataFrame(categoricalVars+continuousVars).T.to_csv(maxFile, header=False, index=False)
    for i,groupWithSize in enumerate([groupsList[0]]):
        if i%500==0:
            print(i)
        group = groupWithSize[0]
        size = groupWithSize[1]
    
        gender = group[0]
        smoking = group[1]
        raceEthnicity = group[2]
        statin = group[3]
        education = group[4]
        alcoholPerWeek = group[5]
        anyPhysicalActivity = group[6] 
        antiHypertensiveCount = group[7]
    
        dataDf = df.loc[(df["gender"]==gender) & 
                                (df["smokingStatus"]==smoking) &
                                (df["raceEthnicity"]==raceEthnicity) &
                                (df["statin"]==statin) &
                                (df["education"]==education) &
                                (df["alcoholPerWeek"]==alcoholPerWeek) &
                                (df["anyPhysicalActivity"]==anyPhysicalActivity) &
                                (df["antiHypertensiveCount"]==antiHypertensiveCount), 
                                                        continuousVars+["name"]].copy()
        data = np.array(dataDf)
        
        

In [None]:
dataDf.head()

In [None]:
nhanesDf = pd.read_stata(microsimDir + "/CODE/microsim/microsim/data/fullyImputedDataset.dta")
nhanesDf = nhanesDf.rename(columns={"level_0":"name"})
dataDf = dataDf.merge(nhanesDf[["name","WTINT2YR"]], on="name", how="inner").copy()

In [None]:
dataDf["weight"] = dataDf["WTINT2YR"].astype('int')
dataDf.head()

In [None]:
for row in range(dataDf.shape[0]):
    
    copied_df = pd.DataFrame(dataDf.iloc[row].repeat(2)).reset_index(drop=True)

    # Concatenate the original DataFrame with the copied rows
    dataDf = pd.concat([dataDf, copied_df], ignore_index=True)

In [None]:
copied_df.T

In [None]:
dataDf.head()

In [None]:
dataDf.applymap(lambda x: np.random.default_rng().uniform(0.8*x, 1.2*x))

In [8]:
#fit the continuous variables for all combinations of categorical variables
#some fits will result in singular covariance matrices, that is ok for now
#I am writing the fit parameters to files because I want to see what process I need to use
#if others give us distributions of their populations and we need to create Person objects from those
#distributions
with (open(dataDir+"/nhanesMeans-4.csv", "w") as meansFile, open(dataDir+"/nhanesCovs-4.csv","w") as covsFile,
      open(dataDir+"/nhanesMin-4.csv", "w") as minFile, open(dataDir+"/nhanesMax-4.csv", "w") as maxFile):
    pd.DataFrame(categoricalVars+["weight"]+continuousVars).T.to_csv(meansFile, header=False, index=False)
    pd.DataFrame(categoricalVars+continuousVars).T.to_csv(covsFile, header=False, index=False)
    pd.DataFrame(categoricalVars+continuousVars).T.to_csv(minFile, header=False, index=False)
    pd.DataFrame(categoricalVars+continuousVars).T.to_csv(maxFile, header=False, index=False)
    for i,groupWithSize in enumerate(groupsList):
        if i%500==0:
            print(i)
        group = groupWithSize[0]
        size = groupWithSize[1]
    
        gender = group[0]
        smoking = group[1]
        raceEthnicity = group[2]
        statin = group[3]
        education = group[4]
        alcoholPerWeek = group[5]
        anyPhysicalActivity = group[6] 
        antiHypertensiveCount = group[7]
    
        data = np.array(df.loc[(df["gender"]==gender) & 
                                (df["smokingStatus"]==smoking) &
                                (df["raceEthnicity"]==raceEthnicity) &
                                (df["statin"]==statin) &
                                (df["education"]==education) &
                                (df["alcoholPerWeek"]==alcoholPerWeek) &
                                (df["anyPhysicalActivity"]==anyPhysicalActivity) &
                                (df["antiHypertensiveCount"]==antiHypertensiveCount), 
                                                        continuousVars].copy())
    
        distMean, distCov = multivariate_normal.fit(data)
        
        distMinDf = pd.DataFrame( np.min(data,axis=0) )
        pd.concat(
                [ pd.DataFrame([gender, smoking, raceEthnicity, statin, 
                                education, alcoholPerWeek, anyPhysicalActivity, antiHypertensiveCount]).T,
                      distMinDf.T],axis=1, ignore_index=True).to_csv(minFile, header=False, index=False)
        minFile.write("\n\n")
        
        distMaxDf = pd.DataFrame( np.max(data,axis=0) )
        pd.concat(
                [ pd.DataFrame([gender, smoking, raceEthnicity, statin, 
                                education, alcoholPerWeek, anyPhysicalActivity, antiHypertensiveCount]).T,
                      distMaxDf.T],axis=1, ignore_index=True).to_csv(maxFile, header=False, index=False)
        maxFile.write("\n\n")
        
        #1 row per block because means are of shape (1,4)
        distMeanDf = pd.DataFrame(distMean)
        pd.concat(
                    [ pd.DataFrame([gender, smoking, raceEthnicity, statin, 
                                    education, alcoholPerWeek, anyPhysicalActivity, antiHypertensiveCount, size/totalPeople]).T,
                      distMeanDf.T],axis=1, ignore_index=True).to_csv(meansFile, header=False, index=False)
        meansFile.write("\n\n")
                    
        #4 rows per block because covs are of shape (4,4)
        distCovDf = pd.DataFrame(distCov)
        pd.concat(
                [ pd.concat([pd.DataFrame([gender, smoking, raceEthnicity, statin,
                                           education, alcoholPerWeek, anyPhysicalActivity, antiHypertensiveCount]).T]*len(continuousVars),ignore_index=True),
                  distCovDf.T],axis=1, ignore_index=True).to_csv(covsFile, header=False, index=False)
        covsFile.write("\n\n")

0
500
1000
1500


In [9]:
meansDf = pd.read_csv(dataDir+"/nhanesMeans-4.csv", header=0)
covsDf = pd.read_csv(dataDir+"/nhanesCovs-4.csv", header=0)
minDf = pd.read_csv(dataDir+"/nhanesMin-4.csv", header=0)
maxDf = pd.read_csv(dataDir+"/nhanesMax-4.csv", header=0)

In [10]:
meansDf[continuousVars].describe()

Unnamed: 0,age,hdl,bmi,totChol,trig,a1c,ldl,waist,creatinine,sbp,dbp
count,1611.0,1611.0,1611.0,1611.0,1611.0,1611.0,1611.0,1611.0,1611.0,1611.0,1611.0
mean,55.985339,50.595477,28.897159,203.552362,160.532213,5.821042,123.069421,98.957847,0.875937,132.117713,72.145019
std,17.100626,13.462886,5.577041,36.584872,110.356597,1.195587,31.883402,13.076731,0.772782,20.498374,10.290928
min,18.0,19.0,16.09,76.0,29.0,3.9,30.0,60.8,0.2,85.333333,41.333333
25%,42.881944,41.0,25.4925,181.0,102.0,5.2,103.376471,90.672222,0.6,117.704545,66.160494
50%,57.6,49.0,28.011053,202.0,137.277778,5.5,121.0,97.833333,0.771667,128.666667,71.407407
75%,70.0,57.333333,31.32,222.0,186.75,5.9,140.0,106.233333,0.9,143.166667,77.916667
max,85.0,138.0,67.83,480.0,1538.0,15.1,354.0,162.2,10.1,220.0,120.666667


In [11]:
#covsDf.head()
minDf.head()
#meansDf.head()

Unnamed: 0,gender,smokingStatus,raceEthnicity,statin,education,alcoholPerWeek,anyPhysicalActivity,antiHypertensiveCount,age,hdl,bmi,totChol,trig,a1c,ldl,waist,creatinine,sbp,dbp
0,1,0,1,False,1,0,0,0.0,22.0,35.0,21.76,146.0,67.0,4.8,83.0,78.3,0.6,100.666667,48.666667
1,1,0,1,False,1,0,0,1.0,62.0,35.0,25.58,187.0,69.0,4.8,96.0,95.0,0.6,139.333333,73.333333
2,1,0,1,False,1,0,0,2.0,71.0,42.0,23.78,150.0,130.0,6.9,59.0,92.3,8.5,177.333333,101.333333
3,1,0,1,False,1,0,0,3.0,59.0,47.0,22.5,243.0,178.0,5.0,160.0,86.1,0.7,126.666667,78.0
4,1,0,1,False,1,0,1,0.0,24.0,33.0,23.44,190.0,100.0,5.2,111.0,90.4,0.4,98.0,67.333333


In [12]:
#flag the distributions that have a singular covariance matrix
for groupWithSize in groupsList:
    group = groupWithSize[0]
    size = groupWithSize[1]
    
    gender = group[0]
    smoking = group[1]
    raceEthnicity = group[2]
    statin = group[3]
    education = group[4]
    alcoholPerWeek = group[5]
    anyPhysicalActivity = group[6] 
    antiHypertensiveCount = group[7]
    
    gCov = covsDf.loc[ (covsDf["gender"]==gender) &
             (covsDf["smokingStatus"]==smoking) &
             (covsDf["raceEthnicity"]==raceEthnicity) &
             (covsDf["statin"]==statin) &
             (covsDf["education"]==education) &
             (covsDf["alcoholPerWeek"]==alcoholPerWeek) &
             (covsDf["anyPhysicalActivity"]==anyPhysicalActivity) &
             (covsDf["antiHypertensiveCount"]==antiHypertensiveCount), continuousVars]
    
    #if np.linalg.matrix_rank(gCov) < gCov.shape[0]:
    #    groupWithSize += [True]
    #else:
    #    groupWithSize += [False]
        
    #if (np.linalg.matrix_rank(gCov) < gCov.shape[0]) | (not np.all(np.linalg.eig(gCov)[0]>0)) | (not np.allclose(gCov, gCov.T)):
    if not np.all(np.linalg.eig(gCov)[0]>10**(-3)):
        groupWithSize += [True]
    else:
        groupWithSize += [False]

In [13]:
sum(list(map(lambda x: x[3], groupsList)))

1531

In [14]:
#for the groups that have a singular covariance matrix find a group with a non-singular covariance matrix
#that can be used as a replacement
for groupWithSize in groupsList:
    group = groupWithSize[0]
    size = groupWithSize[1]
    singular = groupWithSize[3]
    
    if singular:
        #print(group)
        gender = group[0]
        smoking = group[1]
        raceEthnicity = group[2]
        statin = group[3]
        education = group[4]
        alcoholPerWeek = group[5]
        anyPhysicalActivity = group[6] 
        antiHypertensiveCount = group[7]
        
        gMean = meansDf.loc[ (meansDf["gender"]==gender) &
             (meansDf["smokingStatus"]==smoking) &
             (meansDf["raceEthnicity"]==raceEthnicity) &
             (meansDf["statin"]==statin) &
             (meansDf["education"]==education) &
             (meansDf["alcoholPerWeek"]==alcoholPerWeek) &
             (meansDf["anyPhysicalActivity"]==anyPhysicalActivity) &
             (meansDf["antiHypertensiveCount"]==antiHypertensiveCount), continuousVars]
        
        altGroupsList = list()
        for altGroupWithSize in groupsList:
            altGroup = altGroupWithSize[0]
            altSize = altGroupWithSize[1]
            altSingular = altGroupWithSize[3]
            
            if not altSingular:
                altGender = altGroup[0]
                altSmoking = altGroup[1]
                altRaceEthnicity = altGroup[2]
                altStatin = altGroup[3]
                altEducation = altGroup[4]
                altAlcoholPerWeek = altGroup[5]
                altAnyPhysicalActivity = altGroup[6] 
                altAntiHypertensiveCount = altGroup[7]
    
                altGMean = meansDf.loc[ (meansDf["gender"]==altGender) &
                                     (meansDf["smokingStatus"]==altSmoking) &
                                     (meansDf["raceEthnicity"]==altRaceEthnicity) &
                                     (meansDf["statin"]==altStatin) &
                                     (meansDf["education"]==altEducation) &
                                     (meansDf["alcoholPerWeek"]==altAlcoholPerWeek) &
                                     (meansDf["anyPhysicalActivity"]==altAnyPhysicalActivity) &
                                     (meansDf["antiHypertensiveCount"]==altAntiHypertensiveCount), continuousVars]
    
                altGCov = covsDf.loc[ (covsDf["gender"]==altGender) &
                                   (covsDf["smokingStatus"]==altSmoking) &
                                   (covsDf["raceEthnicity"]==altRaceEthnicity) &
                                   (covsDf["statin"]==altStatin) &
                                   (covsDf["education"]==altEducation) &
                                   (covsDf["alcoholPerWeek"]==altAlcoholPerWeek) &
                                   (covsDf["anyPhysicalActivity"]==altAnyPhysicalActivity) &
                                   (covsDf["antiHypertensiveCount"]==altAntiHypertensiveCount), continuousVars]
    
                gDist = multivariate_normal(np.array(altGMean)[0], np.array(altGCov), allow_singular=False)
        
                altGroupsList += [[altGroup, gDist.pdf(gMean)]] #[altGroup, gDist.pdf(group)]
    
        probabilities = list(map(lambda x: x[1], altGroupsList))
        maxProb = max(probabilities)
        altGroupWithMaxProb = altGroupsList[probabilities.index(maxProb)][0]
        groupWithSize += [altGroupWithMaxProb]
    else:
        groupWithSize += [None]

In [19]:
#draw from the distributions to get all continuous variables for all persons
#if a draw happens to be outside reasonable bounds, then re-draw until the draw is within reasonable bounds

personsDf = pd.DataFrame(data=None, columns= ["name"]+categoricalVars+continuousVars)

for iii, groupWithSize in enumerate(groupsList):
    if iii%100==0:
        print(iii)
    group = groupWithSize[0]
    size = groupWithSize[1]
    names = groupWithSize[2]
    singular = groupWithSize[3]
    altGroup = groupWithSize[4]
    
    if singular:
        gender = altGroup[0]
        smoking = altGroup[1]
        raceEthnicity = altGroup[2]
        statin = altGroup[3]
        education = altGroup[4]
        alcoholPerWeek = altGroup[5]
        anyPhysicalActivity = altGroup[6] 
        antiHypertensiveCount = altGroup[7]
    else:
        gender = group[0]
        smoking = group[1]
        raceEthnicity = group[2]
        statin = group[3]
        education = group[4]
        alcoholPerWeek = group[5]
        anyPhysicalActivity = group[6] 
        antiHypertensiveCount = group[7]
    
    gMean = meansDf.loc[ (meansDf["gender"]==gender) &
             (meansDf["smokingStatus"]==smoking) &
             (meansDf["raceEthnicity"]==raceEthnicity) &
             (meansDf["statin"]==statin) &
             (meansDf["education"]==education) &
             (meansDf["alcoholPerWeek"]==alcoholPerWeek) &
             (meansDf["anyPhysicalActivity"]==anyPhysicalActivity) &
             (meansDf["antiHypertensiveCount"]==antiHypertensiveCount), continuousVars]
    
    gCov = covsDf.loc[ (covsDf["gender"]==gender) &
             (covsDf["smokingStatus"]==smoking) &
             (covsDf["raceEthnicity"]==raceEthnicity) &
             (covsDf["statin"]==statin) &
             (covsDf["education"]==education) &
             (covsDf["alcoholPerWeek"]==alcoholPerWeek) &
             (covsDf["anyPhysicalActivity"]==anyPhysicalActivity) &
             (covsDf["antiHypertensiveCount"]==antiHypertensiveCount), continuousVars]
    
    gDist = multivariate_normal(np.array(gMean)[0], np.array(gCov), allow_singular=True)
    
    if size>4:
        gMin = minDf.loc[ (minDf["gender"]==group[0]) &
                 (minDf["smokingStatus"]==group[1]) &
                 (minDf["raceEthnicity"]==group[2]) &
                 (minDf["statin"]==group[3]) &
                 (minDf["education"]==group[4]) &
                 (minDf["alcoholPerWeek"]==group[5]) &
                 (minDf["anyPhysicalActivity"]==group[6]) &
                 (minDf["antiHypertensiveCount"]==group[7]), continuousVars]
    
        gMax = maxDf.loc[ (maxDf["gender"]==group[0]) &
                 (maxDf["smokingStatus"]==group[1]) &
                 (maxDf["raceEthnicity"]==group[2]) &
                 (maxDf["statin"]==group[3]) &
                 (maxDf["education"]==group[4]) &
                 (maxDf["alcoholPerWeek"]==group[5]) &
                 (maxDf["anyPhysicalActivity"]==group[6]) &
                 (maxDf["antiHypertensiveCount"]==group[7]), continuousVars]
    else:
        gMin = minDf.loc[ (minDf["gender"]==gender) &
                 (minDf["smokingStatus"]==smoking) &
                 (minDf["raceEthnicity"]==raceEthnicity) &
                 (minDf["statin"]==statin) &
                 (minDf["education"]==education) &
                 (minDf["alcoholPerWeek"]==alcoholPerWeek) &
                 (minDf["anyPhysicalActivity"]==anyPhysicalActivity) &
                 (minDf["antiHypertensiveCount"]==antiHypertensiveCount), continuousVars]
    
        gMax = maxDf.loc[ (maxDf["gender"]==gender) &
                 (maxDf["smokingStatus"]==smoking) &
                 (maxDf["raceEthnicity"]==raceEthnicity) &
                 (maxDf["statin"]==statin) &
                 (maxDf["education"]==education) &
                 (maxDf["alcoholPerWeek"]==alcoholPerWeek) &
                 (maxDf["anyPhysicalActivity"]==anyPhysicalActivity) &
                 (maxDf["antiHypertensiveCount"]==antiHypertensiveCount), continuousVars]
    
    #nDraws = int(popSize*size/totalPointsInGroups)
    nDraws = size
    nDrawsNeeded = nDraws
    draws=None
    while nDrawsNeeded!=0:
        if draws is None:
            draws = gDist.rvs(size=nDrawsNeeded)
        else:
            #print(f"{i} redrawing")
            if len(draws.shape)==1:
                draws = draws.reshape((1,len(continuousVars)))
            if (nDrawsNeeded==1):
                draws = np.concatenate( (draws, gDist.rvs(size=nDrawsNeeded).reshape((1,len(continuousVars)))), axis=0 )
            else:
                draws = np.concatenate( (draws, gDist.rvs(size=nDrawsNeeded)), axis=0 )
           
        if nDraws==1:
            draws = draws.reshape((1,len(continuousVars)))
            
        rowsOutOfBounds = np.array([False]*nDraws)
        for i, bound in enumerate(np.array(gMin)[0]):
            rowsOutOfBounds = rowsOutOfBounds | (draws[:,i]<0.9*bound)
        for i, bound in enumerate(np.array(gMax)[0]):
            rowsOutOfBounds = rowsOutOfBounds | (draws[:,i]>1.1*bound)
        nDrawsNeeded = nDraws - np.sum(~rowsOutOfBounds)
        draws = draws[~rowsOutOfBounds,:]    
    
    if nDraws == 0:
        pass
    elif nDraws==1:
        #print(nDraws)
        #personRowCont = pd.DataFrame(draws).T
        personRowCont = pd.DataFrame(draws)
    else:
        #print(nDraws)
        personRowCont = pd.DataFrame(draws)
    if nDraws>0:
        personRowCont.columns=continuousVars
        personRowCat = pd.concat([pd.DataFrame(group).T]*nDraws, ignore_index=True)
        personRowCat.columns = categoricalVars
        personRow = pd.concat( [pd.Series(names), personRowCat, personRowCont], axis=1).rename(columns={0:"name"})
        personsDf = pd.concat([personsDf,personRow])
    

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600


In [20]:
personsDf["age"] = round(personsDf["age"]).astype('int')

In [21]:
personsDf.head()

Unnamed: 0,name,gender,smokingStatus,raceEthnicity,statin,education,alcoholPerWeek,anyPhysicalActivity,antiHypertensiveCount,age,hdl,bmi,totChol,trig,a1c,ldl,waist,creatinine,sbp,dbp
0,41909,1,0,1,False,1,0,0,0.0,39,39.332549,32.093633,289.420901,202.594404,6.356258,190.692873,108.778183,1.151077,147.131136,81.747572
1,42688,1,0,1,False,1,0,0,0.0,30,57.942825,27.667434,199.841558,119.231151,5.769845,90.95666,92.577832,0.879097,128.707967,61.416109
2,43025,1,0,1,False,1,0,0,0.0,71,50.824894,23.164605,179.578231,85.613429,5.24512,117.216043,92.101304,0.556529,128.105878,67.220136
3,43390,1,0,1,False,1,0,0,0.0,49,42.460672,22.419467,203.219952,192.760294,5.032002,122.120623,90.043771,0.794121,137.643906,84.633473
4,44501,1,0,1,False,1,0,0,0.0,26,57.765745,27.510509,241.259803,137.484893,5.19422,136.485802,88.475559,0.977545,150.56988,87.679994


In [22]:
personsDf.to_csv(dataDir+"/nhanes-persons-from-Gaussians.csv", index=False)