In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
pd.set_option("display.max_rows", 30)

In [2]:
symptoms_df = pd.read_csv("Indicators_of_Anxiety_or_Depression_Based_on_Reported_Frequency_of_Symptoms_During_Last_7_Days.csv")

In [3]:
symptoms_df.shape

(16794, 14)

In [4]:
symptoms_df.columns

Index(['Indicator', 'Group', 'State', 'Subgroup', 'Phase', 'Time Period',
       'Time Period Label', 'Time Period Start Date', 'Time Period End Date',
       'Value', 'Low CI', 'High CI', 'Confidence Interval', 'Quartile Range'],
      dtype='object')

In [5]:
symptoms_df.head(3)

Unnamed: 0,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Label,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range
0,Symptoms of Depressive Disorder,National Estimate,United States,United States,1,1,"Apr 23 - May 5, 2020",04/23/2020,05/05/2020,23.5,22.7,24.3,22.7 - 24.3,
1,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,"Apr 23 - May 5, 2020",04/23/2020,05/05/2020,32.7,30.2,35.2,30.2 - 35.2,
2,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,"Apr 23 - May 5, 2020",04/23/2020,05/05/2020,25.7,24.1,27.3,24.1 - 27.3,


In [6]:
symptoms_df_clean = symptoms_df.copy()

In [7]:
rename_map = {
    "Time Period": "Time_period",
    "Time Period Start Date": "Start_date",
    "Time Period End Date":   "End_date",
    "Time Period Label":      "Time_period_label",
    "Low CI":                 "Low_ci",
    "High CI":                "High_ci",
    "Confidence Interval":    "Con_int",
    "Quartile Range":         "Quartile_range"
}

In [8]:
symptoms_df_clean.rename(columns= rename_map, inplace=True)
symptoms_df_clean.columns

Index(['Indicator', 'Group', 'State', 'Subgroup', 'Phase', 'Time_period',
       'Time_period_label', 'Start_date', 'End_date', 'Value', 'Low_ci',
       'High_ci', 'Con_int', 'Quartile_range'],
      dtype='object')

In [9]:
symptoms_df_clean["Start_date"] = pd.to_datetime(symptoms_df_clean["Start_date"])
symptoms_df_clean["End_date"] = pd.to_datetime(symptoms_df_clean["End_date"])


In [10]:
symptoms_df_clean.head(3)

Unnamed: 0,Indicator,Group,State,Subgroup,Phase,Time_period,Time_period_label,Start_date,End_date,Value,Low_ci,High_ci,Con_int,Quartile_range
0,Symptoms of Depressive Disorder,National Estimate,United States,United States,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,23.5,22.7,24.3,22.7 - 24.3,
1,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,32.7,30.2,35.2,30.2 - 35.2,
2,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,25.7,24.1,27.3,24.1 - 27.3,


In [11]:
symptoms_df_clean["Indicator"].unique()

array(['Symptoms of Depressive Disorder', 'Symptoms of Anxiety Disorder',
       'Symptoms of Anxiety Disorder or Depressive Disorder'],
      dtype=object)

In [12]:
depression_df = (
    symptoms_df_clean[
        (symptoms_df_clean["Indicator"].str.contains("Depressive", case=False)) &
        (~symptoms_df_clean["Indicator"].str.contains("Anxiety", case=False))
    ]
    .sort_values("Start_date")
    .reset_index(drop=True)
)
depression_df = depression_df.drop(
    columns=["Phase", "Time_period", "Time_period_label", "Con_int", "Quartile_range"]
)
depression_df.head(3)

Unnamed: 0,Indicator,Group,State,Subgroup,Start_date,End_date,Value,Low_ci,High_ci
0,Symptoms of Depressive Disorder,National Estimate,United States,United States,2020-04-23,2020-05-05,23.5,22.7,24.3
1,Symptoms of Depressive Disorder,By State,New Mexico,New Mexico,2020-04-23,2020-05-05,22.4,18.3,26.9
2,Symptoms of Depressive Disorder,By State,New Jersey,New Jersey,2020-04-23,2020-05-05,23.6,20.4,27.0


In [13]:
anxiety_df = (
    symptoms_df_clean[
        (symptoms_df_clean["Indicator"].str.contains("Anxiety", case=False)) &
        (~symptoms_df_clean["Indicator"].str.contains("Depressive", case=False))
    ]
    .sort_values("Start_date")
    .reset_index(drop=True)
)
anxiety_df = anxiety_df.drop(
    columns=["Phase", "Time_period", "Time_period_label", "Con_int", "Quartile_range"]
)
anxiety_df.head(3)

Unnamed: 0,Indicator,Group,State,Subgroup,Start_date,End_date,Value,Low_ci,High_ci
0,Symptoms of Anxiety Disorder,National Estimate,United States,United States,2020-04-23,2020-05-05,30.8,30.0,31.7
1,Symptoms of Anxiety Disorder,By State,New Mexico,New Mexico,2020-04-23,2020-05-05,27.9,23.9,32.2
2,Symptoms of Anxiety Disorder,By State,New Jersey,New Jersey,2020-04-23,2020-05-05,31.4,28.2,34.7


In [14]:
depression_anxiety_df = (
    symptoms_df_clean[
        (symptoms_df_clean["Indicator"].str.contains("Depressive", case=False)) &
        (symptoms_df_clean["Indicator"].str.contains("Anxiety", case=False))
    ]
    .sort_values(["Indicator", "Start_date"])
    .reset_index(drop=True)
)

depression_anxiety_df = depression_anxiety_df.drop(
    columns=["Phase", "Time_period", "Time_period_label", "Con_int", "Quartile_range"]
)

depression_anxiety_df.head(3)

Unnamed: 0,Indicator,Group,State,Subgroup,Start_date,End_date,Value,Low_ci,High_ci
0,Symptoms of Anxiety Disorder or Depressive Dis...,National Estimate,United States,United States,2020-04-23,2020-05-05,35.9,35.0,36.8
1,Symptoms of Anxiety Disorder or Depressive Dis...,By Age,United States,18 - 29 years,2020-04-23,2020-05-05,46.8,44.3,49.3
2,Symptoms of Anxiety Disorder or Depressive Dis...,By Age,United States,30 - 39 years,2020-04-23,2020-05-05,39.6,37.7,41.5


In [15]:
depression_anxiety_df.shape

(5598, 9)

In [16]:
symptoms_df_clean= symptoms_df_clean.drop(
    columns=["Phase", "Time_period", "Time_period_label", "Con_int", "Quartile_range"]
)

symptoms_df_clean.head(3)

Unnamed: 0,Indicator,Group,State,Subgroup,Start_date,End_date,Value,Low_ci,High_ci
0,Symptoms of Depressive Disorder,National Estimate,United States,United States,2020-04-23,2020-05-05,23.5,22.7,24.3
1,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,2020-04-23,2020-05-05,32.7,30.2,35.2
2,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,2020-04-23,2020-05-05,25.7,24.1,27.3


In [17]:
print("Depression rows before:", depression_df.shape[0])
print("Anxiety rows before:", anxiety_df.shape[0])
print("Depression and Anxiety rows before:", depression_anxiety_df.shape[0])

Depression rows before: 5598
Anxiety rows before: 5598
Depression and Anxiety rows before: 5598


In [18]:
depression_df = depression_df.dropna(how="any")
anxiety_df = anxiety_df.dropna(how="any")
depression_anxiety_df = depression_anxiety_df.dropna(how="any")

In [19]:
print("Depression rows after:", depression_df.shape[0])
print("Anxiety rows after:", anxiety_df.shape[0])
print("Depression+Anxiety rows after:", depression_anxiety_df.shape[0])

Depression rows after: 5363
Anxiety rows after: 5361
Depression+Anxiety rows after: 5363


In [20]:
depression_df = depression_anxiety_df.melt(
    id_vars=[
        "Indicator", "Group", "Subgroup", "State",
        "Start_date", "End_date", "Value"
    ],
    value_vars=["Low_ci", "High_ci"],   
    var_name="CI_Bound",                
    value_name="CI_Value"
)            
depression_df.head(4)

Unnamed: 0,Indicator,Group,Subgroup,State,Start_date,End_date,Value,CI_Bound,CI_Value
0,Symptoms of Anxiety Disorder or Depressive Dis...,National Estimate,United States,United States,2020-04-23,2020-05-05,35.9,Low_ci,35.0
1,Symptoms of Anxiety Disorder or Depressive Dis...,By Age,18 - 29 years,United States,2020-04-23,2020-05-05,46.8,Low_ci,44.3
2,Symptoms of Anxiety Disorder or Depressive Dis...,By Age,30 - 39 years,United States,2020-04-23,2020-05-05,39.6,Low_ci,37.7
3,Symptoms of Anxiety Disorder or Depressive Dis...,By Age,40 - 49 years,United States,2020-04-23,2020-05-05,38.9,Low_ci,37.2


In [21]:
depression_df.to_csv("depression.csv", index=False)
anxiety_df.to_csv("anxiety.csv", index=False)
depression_anxiety_df.to_csv("depression_anxiety.csv", index=False)