# Exploratory Data Analysis
This notebook contains all of our preliminary analysis on the American Gut Microbiome data.

Libraries set-up

In [1]:
#%pip install matplotlib
#%pip install pandas
#%pip install numpy
#%pip install Bio

In [2]:
import pandas as pd
import numpy as np

Importing the American gut microbiome CSV

In [3]:
data = pd.read_csv('sample_information_American_gut_microbiome.csv.txt', sep="\t", dtype=object)
data.head()

Unnamed: 0,sample_name,acid_reflux,acne_medication,acne_medication_otc,add_adhd,age_cat,age_corrected,age_years,alcohol_consumption,alcohol_frequency,...,vioscreen_zinc,vitamin_b_supplement_frequency,vitamin_d_supplement_frequency,vivid_dreams,weight_cat,weight_change,weight_kg,weight_units,whole_eggs,whole_grain_frequency
0,10317.000001,Not provided,False,False,"Diagnosed by a medical professional (doctor, p...",60s,64.0,64.0,True,Daily,...,Not provided,Never,Regularly (3-5 times/week),Not provided,,Remained stable,52.0,kilograms,Never,Occasionally (1-2 times/week)
1,10317.000001001,Not provided,False,False,Not provided,50s,53.0,53.0,True,Rarely (a few times/month),...,Not provided,Not provided,Not provided,Not provided,,Remained stable,110.0,kilograms,Not provided,Not provided
2,10317.000001002,Not provided,False,False,Not provided,50s,53.0,53.0,True,Regularly (3-5 times/week),...,Not provided,Not provided,Not provided,Not provided,,Not provided,56.0,kilograms,Not provided,Not provided
3,10317.000001004,Not provided,False,False,Not provided,40s,44.0,44.0,True,Rarely (a few times/month),...,Not provided,Not provided,Not provided,Not provided,,Remained stable,86.0,kilograms,Not provided,Not provided
4,10317.000001008,Not provided,False,False,Not provided,60s,66.0,66.0,False,Never,...,Not provided,Not provided,Not provided,Not provided,,Increased more than 10 pounds,74.0,kilograms,Not provided,Not provided


Change all the true/false values to Yes/No

In [4]:
data[data=='true']='Yes'
data[data=='false']='No'
data[data=='I do not have this condition']='No'

Selecting only the samples from feces

In [6]:
data = data.loc[data.body_site == "UBERON:feces",:]

Removing all the subjects with special diseases and conditions

In [7]:
cond_no_disease = ( (data['alcohol_frequency']!='Daily') &
                    ((data['autoimmune']=='No') | (data['autoimmune']=='Not provided')) &
                    ((data['cancer']=='No') | (data['cancer']=='Not provided')) &
                    ((data['cdiff']=='No') | (data['cdiff']=='Not provided')) & #Clostridioides difficile
                    ((data['diabetes']=='No') | (data['diabetes']=='Not provided')) &
                    ((data['ibd']=='No') | (data['ibd']=='Not provided')) & #inflammatory bowel disease
                    ((data['ibs']=='No') | (data['ibs']=='Not provided')) & #inflamatiry bowel syndrome
                    ((data['kidney_disease']=='No') | (data['kidney_disease']=='Not provided')) &
                    ((data['liver_disease']=='No') | (data['liver_disease']=='Not provided')) &
                    ((data['mental_illness_type_anorexia_nervosa']=='No') | (data['mental_illness_type_anorexia_nervosa']=='Not provided')) &
                    ((data['mental_illness_type_bulimia_nervosa']=='No') | (data['mental_illness_type_bulimia_nervosa']=='Not provided')) &
                    ((data['sibo']=='No') | (data['sibo']=='Not provided'))) #small intestinal bacterial overgrowth

data_no_disease = data.loc[cond_no_disease, :]

Number of subjects that removed refined sugars from their diet:
- 633 subjects consume refined sugars and don't have any of the conditions listed before

In [8]:
data_no_disease.specialized_diet_exclude_refined_sugars.value_counts()

Not provided    7417
No              4633
Yes              633
Unspecified       22
Name: specialized_diet_exclude_refined_sugars, dtype: int64

Selecting only Male/Females and subject with age between 20s and 60s

In [9]:
data_no_disease.age_cat.value_counts()

30s             2359
40s             2288
50s             2221
60s             1735
20s             1352
Not provided     748
child            733
70+              594
teen             382
Unspecified      287
baby               6
Name: age_cat, dtype: int64

In [10]:
data_no_disease.sex.value_counts()

female          6178
male            5976
Not provided     526
unspecified       18
other              7
Name: sex, dtype: int64

In [11]:
data_f = data_no_disease.loc[((data_no_disease['age_cat'] == '20s') |
                          (data_no_disease['age_cat'] == '30s') |
                          (data_no_disease['age_cat'] == '40s') |
                          (data_no_disease['age_cat'] == '50s')|
                          (data_no_disease['age_cat'] == '60s'))  &
                          ((data_no_disease.sex == 'female') | (data_no_disease.sex == 'male')) ,:]

Table for age and sugars consumption

In [12]:
pd.crosstab(data_f.age_cat,
            data_f.specialized_diet_exclude_refined_sugars, 
            values=data_f.sample_name,
            aggfunc=len, 
            margins = True)

specialized_diet_exclude_refined_sugars,No,Not provided,Unspecified,Yes,All
age_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20s,573.0,698.0,,59.0,1330
30s,869.0,1295.0,3.0,128.0,2295
40s,817.0,1330.0,3.0,105.0,2255
50s,771.0,1284.0,6.0,101.0,2162
60s,557.0,1026.0,4.0,92.0,1679
All,3587.0,5633.0,16.0,485.0,9721


Checking if there are any discrepancies in what patients declared

In [13]:
pd.crosstab(data_f.specialized_diet_i_do_not_eat_a_specialized_diet,
            data_f.specialized_diet_exclude_refined_sugars, 
            values=data_f.sample_name,
            aggfunc=len, 
            margins = True)

specialized_diet_exclude_refined_sugars,No,Not provided,Unspecified,Yes,All
specialized_diet_i_do_not_eat_a_specialized_diet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,1110.0,,,455.0,1565
Not provided,,5633.0,,,5633
Unspecified,,,16.0,,16
Yes,2477.0,,,30.0,2507
All,3587.0,5633.0,16.0,485.0,9721


Creation of the Control/Treatment groups:
- Control: subjects without a particular health condition, that don't follow any specilized diet

- Treatment: subjects without a paritcular health condition, that exclude refined sugars from their diet. This group can include also people that exclude sugar and at the same time follow an other specialized diet that doesn't allow refined sugar (raw diet, paleo diet)

In [14]:
[x for x in list(data_f.columns) if x.lower().startswith("specialized_diet_")]

['specialized_diet_exclude_dairy',
 'specialized_diet_exclude_nightshades',
 'specialized_diet_exclude_refined_sugars',
 'specialized_diet_fodmap',
 'specialized_diet_halaal',
 'specialized_diet_i_do_not_eat_a_specialized_diet',
 'specialized_diet_kosher',
 'specialized_diet_modified_paleo_diet',
 'specialized_diet_other_restrictions_not_described_here',
 'specialized_diet_paleodiet_or_primal_diet',
 'specialized_diet_raw_food_diet',
 'specialized_diet_unspecified',
 'specialized_diet_westenprice_or_other_lowgrain_low_processed_fo',
 'specialized_diet_westenprice_or_other_lowgrain_low_processed_food_diet']

In [15]:
data_f = data_f.drop("specialized_diet_unspecified",axis=1)

In [16]:
diets_without_sugar = ['specialized_diet_exclude_refined_sugars',
                      'specialized_diet_paleodiet_or_primal_diet', 
                      'specialized_diet_raw_food_diet',
                      'specialized_diet_westenprice_or_other_lowgrain_low_processed_fo',
                      'specialized_diet_westenprice_or_other_lowgrain_low_processed_food_diet']

Function that assigns every subject to a group

In [17]:
def get_groups(data):
    exclude_sugar = False
    refined_sugar = False
    specialized_diet = False
    other_diets = False

    list_col = list(data.index)
    list_col = [x for x in list_col if x.lower().startswith("specialized_diet_")]
    for col in list_col:
        if col =="specialized_diet_i_do_not_eat_a_specialized_diet":
            if data[col] == "No":
                specialized_diet = True
        if col in diets_without_sugar:
            if data[col] == "Yes":
                exclude_sugar = True
        if col == "specialized_diet_exclude_refined_sugars":
            if data[col] == "Yes":
                refined_sugar = True
        if col not in diets_without_sugar and col != "specialized_diet_i_do_not_eat_a_specialized_diet":
            if data[col] == "Yes":
                other_diets = True
    
    if specialized_diet:
        
        if exclude_sugar and refined_sugar and not other_diets:
            return "Treatment"
        else:
            return "None"
    
    else:
        if not exclude_sugar and not refined_sugar and not other_diets:
            return "Control"
        
        return "None"
        

In [18]:
data_f['group'] = data_f.apply(get_groups, axis=1)

Only 191 subjects have the characteristics to be in the Treatment group

In [19]:
data_f.group.value_counts()

Control      8089
None         1441
Treatment     191
Name: group, dtype: int64

Excluding all subjects that are neither in the control or treatment group

In [20]:
data_groups = data_f.loc[((data_f.group == "Control") | (data_f.group == "Treatment"))]

Sex distribution in the two groups

In [21]:
print(data_groups.groupby(["group","sex"]).sample_name.nunique())

group      sex   
Control    female    4159
           male      3930
Treatment  female     105
           male        86
Name: sample_name, dtype: int64


Sex and age distribution in the two groups

In [22]:
print(data_groups.groupby(["group","sex","age_cat"]).sample_name.nunique())

group      sex     age_cat
Control    female  20s        561
                   30s        925
                   40s        981
                   50s        957
                   60s        735
           male    20s        519
                   30s        943
                   40s        897
                   50s        885
                   60s        686
Treatment  female  20s          5
                   30s         34
                   40s         20
                   50s         23
                   60s         23
           male    20s         18
                   30s         16
                   40s         13
                   50s         22
                   60s         17
Name: sample_name, dtype: int64


Average age in the groups

In [23]:
data_groups.age_years = data_groups.age_years.astype('float16')
print(data_groups.groupby(["group","sex"]).age_years.mean())

group      sex   
Control    female    45.365711
           male      45.274811
Treatment  female    47.019047
           male      45.151161
Name: age_years, dtype: float32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_groups.age_years = data_groups.age_years.astype('float16')


Treatment group data

In [24]:
data_treatment = data_groups.loc[(data_groups.group == "Treatment")]

Control group data (selecting only 190 subjects)

In [25]:
data_control = data_groups.loc[(data_groups.group == "Control") & (data_groups.specialized_diet_exclude_refined_sugars == "No")]


In [26]:
data.specialized_diet_exclude_refined_sugars.value_counts()

Not provided      11341
No                 8917
Yes                1805
Unspecified         187
Not applicable       10
Name: specialized_diet_exclude_refined_sugars, dtype: int64

In [27]:
data_control.specialized_diet_exclude_refined_sugars.value_counts()

No    2440
Name: specialized_diet_exclude_refined_sugars, dtype: int64

In [28]:
data_control = data_control.groupby(["sex","age_cat"]).head(19)

In [29]:
data_treatment = data_treatment.head(190).reset_index(drop=True)

Concatenation of the two groups

In [30]:
data_final_groups = pd.concat([data_treatment,data_control]).reset_index(drop=True)

In [31]:
data_final_groups

Unnamed: 0,sample_name,acid_reflux,acne_medication,acne_medication_otc,add_adhd,age_cat,age_corrected,age_years,alcohol_consumption,alcohol_frequency,...,vitamin_b_supplement_frequency,vitamin_d_supplement_frequency,vivid_dreams,weight_cat,weight_change,weight_kg,weight_units,whole_eggs,whole_grain_frequency,group
0,10317.000023139,No,No,No,No,60s,69.0,69.0,Yes,Rarely (a few times/month),...,Regularly (3-5 times/week),Daily,Never,,Remained stable,63,kilograms,Regularly (3-5 times/week),Regularly (3-5 times/week),Treatment
1,10317.000027811,No,No,No,No,40s,46.0,46.0,Yes,Rarely (a few times/month),...,Rarely (a few times/month),Regularly (3-5 times/week),Rarely (a few times/month),,Remained stable,81,kilograms,Occasionally (1-2 times/week),Rarely (less than once/week),Treatment
2,10317.000031513,Self-diagnosed,No,No,Not provided,30s,37.0,37.0,Yes,Regularly (3-5 times/week),...,Never,Rarely (a few times/month),Rarely (a few times/month),,Remained stable,49,kilograms,Occasionally (1-2 times/week),Rarely (less than once/week),Treatment
3,10317.000033280,No,No,No,No,60s,60.0,60.0,Yes,Regularly (3-5 times/week),...,Never,Never,Never,,Remained stable,68,kilograms,Occasionally (1-2 times/week),Regularly (3-5 times/week),Treatment
4,10317.000038261,No,No,No,No,40s,48.0,48.0,Yes,Regularly (3-5 times/week),...,Never,Never,Occasionally (1-2 times/week),,Remained stable,70,kilograms,Rarely (less than once/week),Occasionally (1-2 times/week),Treatment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,10317.000065667,No,No,No,No,20s,22.0,22.0,Yes,Occasionally (1-2 times/week),...,Never,Never,Rarely (a few times/month),,Remained stable,61,kilograms,Regularly (3-5 times/week),Never,Control
376,10317.000065673,No,No,Yes,No,20s,20.0,20.0,No,Never,...,Never,Never,Occasionally (1-2 times/week),,Remained stable,79,kilograms,Never,Rarely (less than once/week),Control
377,10317.000065675,No,No,No,No,20s,21.0,21.0,No,Never,...,Never,Never,Daily,,Remained stable,51,kilograms,Never,Regularly (3-5 times/week),Control
378,10317.000065681,No,No,Yes,No,20s,21.0,21.0,No,Never,...,Never,Never,Rarely (a few times/month),,Remained stable,137,kilograms,Rarely (less than once/week),Rarely (less than once/week),Control


Saving the dataframe with all needed info

In [32]:
data_final_groups.to_csv("data_final_groups.csv")