In [3]:
import datetime as dt
import re
import string
import warnings

import markdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from prophet import Prophet

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

def remove_html_tags(text):
    """Return the text content of ``text`` with any HTML markup removed.

    The value is parsed with BeautifulSoup's ``html.parser`` backend and only
    the text nodes are returned.

    Args:
        text: The string to strip. It is always treated as literal markup and
            never as a path or URL to open.

    Returns:
        The plain-text content as a string.
    """
    # Short plain-text values (e.g. answers that look like a file name or URL)
    # make BeautifulSoup warn on every call -- presumably its
    # MarkupResemblesLocatorWarning, a UserWarning subclass. The input here is
    # always literal text, so the warning is noise; silence it for this call
    # only and leave the global warning filters untouched.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        return BeautifulSoup(text, "html.parser").get_text()

def remove_markdown(text):
    """Strip Markdown formatting from ``text``.

    The text is first rendered to HTML with ``markdown.markdown`` and the
    resulting HTML is then reduced to its text content with BeautifulSoup.
    """
    rendered_html = markdown.markdown(text)
    soup = BeautifulSoup(rendered_html, "html.parser")
    return soup.get_text()

def clean_row(row):
    """Strip HTML and then Markdown from every string value in ``row``.

    Non-string values are passed through unchanged. ``row`` is expected to
    offer ``.apply`` (it is used with ``DataFrame.apply(..., axis=1)``).
    """
    def _strip_markup(value):
        # Only text can carry markup; leave numbers, NaN, etc. as they are.
        if not isinstance(value, str):
            return value
        without_html = remove_html_tags(value)
        return remove_markdown(without_html)

    return row.apply(_strip_markup)

def clean_text(text):
    """Turn a raw column heading into a lowercase, dash-separated slug.

    Processing order:
      1. Apply the literal substitutions listed below (dashes become spaces,
         ``#`` and ``None`` become a placeholder, ``<strong>``/``<em>`` tags
         are dropped or replaced by a space).
      2. Delete every character in ``string.punctuation``. This also removes
         the underscores of the placeholder, so ``#`` ends up as
         ``uniquecolumnheading``.
      3. Lowercase the text.
      4. Replace each run of whitespace with a single ``-``. Leading and
         trailing whitespace is not trimmed, so it also becomes a dash.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'-', ' '),
        (r'#', 'unique_column_heading'),
        (r'None', 'unique_column_heading'),
        (r'<strong>', ''),
        (r'</strong>', ' '),
        (r'<em>', ' '),
        (r'</em>', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Remove punctuation
    without_punctuation = ''.join(
        char for char in text if char not in string.punctuation
    )

    # Lowercase, then collapse whitespace runs into single dashes
    return re.sub(r'\s+', '-', without_punctuation.lower())

def preprocess_df(df):
    """Return a copy of ``df`` with every value cast to ``str`` and a fresh 0..n-1 index.

    The old index is discarded rather than kept as a column.
    """
    return df.astype(str).reset_index(drop=True)

def merge_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse columns that share a label into a single space-joined column.

    Columns are visited left to right. The first column seen for a label is
    kept (with missing values replaced by ``''``); every later column with the
    same label is appended to it as ``existing + ' ' + new``. The separator is
    always inserted, even when one side is empty.

    Args:
        df: Frame whose columns may contain repeated labels. Concatenation of
            repeated columns assumes string values.

    Returns:
        A new DataFrame with one column per distinct label, in order of first
        appearance.
    """
    merged_columns = {}

    # Walk the columns by position. With repeated labels, ``df[label]`` returns
    # a DataFrame holding every matching column instead of the single column
    # being visited, which made the merge branch unusable for the very case
    # this function exists for.
    for position, col in enumerate(df.columns):
        values = df.iloc[:, position].fillna('')
        if col in merged_columns:
            # The stored series was already filled when it was first added.
            merged_columns[col] = merged_columns[col] + ' ' + values
        else:
            merged_columns[col] = values

    merged_df = pd.DataFrame(merged_columns)

    return merged_df

def reset_index_with_string(df, string):
    """Return ``df`` re-indexed with labels ``<string>0``, ``<string>1``, ...

    The existing index is dropped and each row is labelled with ``string``
    followed by its zero-based position. The input frame is not modified.
    Note that the ``string`` parameter shadows the ``string`` module inside
    this function.
    """
    relabelled = df.reset_index(drop=True)
    relabelled.index = [f"{string}{position}" for position in range(len(relabelled))]
    return relabelled


In [4]:
# Maps cleaned survey column headings (the lowercase, dash-separated form that
# clean_text() produces) to short analysis-friendly names. It is passed to
# DataFrame.rename(columns=...), so keys that are not present in a frame are
# simply ignored.
# Keys ending in a digit (e.g. '...names1', 'other2') presumably come from
# pandas de-duplicating repeated CSV headers ('.1' -> '1' once punctuation is
# stripped) -- TODO confirm against the raw files.
column_mapping = {
    'are-you-self-employed' : 'self-employed', 
    'how-many-employees-does-your-company-or-organization-have' : 'num-employees', 
    'is-your-employer-primarily-a-tech-companyorganization' : 'tech-employer', 
    'is-your-primary-role-within-your-company-related-to-techit' : 'tech-role', 
    'does-your-employer-provide-mental-health-benefits-as-part-of-healthcare-coverage' : 'mental-health-benefits', 
    'do-you-know-the-options-for-mental-health-care-available-under-your-employer-provided-health-coverage' : 'know-mental-health-options', 
    'has-your-employer-ever-formally-discussed-mental-health-for-example-as-part-of-a-wellness-campaign-or-other-official-communication' : 'discussed-mental-health', 
    'does-your-employer-offer-resources-to-learn-more-about-mental-health-disorders-and-options-for-seeking-help' : 'mental-health-resources', 
    'is-your-anonymity-protected-if-you-choose-to-take-advantage-of-mental-health-or-substance-abuse-treatment-resources-provided-by-your-employer' : 'anonymity-protected', 
    'if-a-mental-health-issue-prompted-you-to-request-a-medical-leave-from-work-how-easy-or-difficult-would-it-be-to-ask-for-that-leave' : 'mental-health-leave-ease', 
    'would-you-feel-more-comfortable-talking-to-your-coworkers-about-your-physical-health-or-your-mental-health' : 'comfort-discussing-health', 
    'would-you-feel-comfortable-discussing-a-mental-health-issue-with-your-direct-supervisors' : 'discuss-mental-health-supervisor', 
    'have-you-ever-discussed-your-mental-health-with-your-employer' : 'discussed-mental-health-employer', 
    'describe-the-conversation-you-had-with-your-employer-about-your-mental-health-including-their-reactions-and-what-actions-were-taken-to-address-your-mental-health-issuequestions' : 'mental-health-talk-employer', 
    'would-you-feel-comfortable-discussing-a-mental-health-issue-with-your-coworkers' : 'discuss-mental-health-coworkers', 
    'have-you-ever-discussed-your-mental-health-with-coworkers' : 'discussed-mental-health-coworkers', 
    'describe-the-conversation-with-coworkers-you-had-about-your-mental-health-including-their-reactions' : 'mental-health-talk-coworkers', 
    'have-you-ever-had-a-coworker-discuss-their-or-another-coworkers-mental-health-with-you' : 'coworker-mental-health-discussion', 
    'describe-the-conversation-your-coworker-had-with-you-about-their-mental-health-please-do-not-use-names' : 'coworker-mental-health-talk', 
    'overall-how-much-importance-does-your-employer-place-on-physical-health' : 'importance-physical-health', 
    'overall-how-much-importance-does-your-employer-place-on-mental-health' : 'importance-mental-health', 
    'do-you-have-medical-coverage-private-insurance-or-state-provided-that-includes-treatment-of-mental-health-disorders' : 'mental-health-coverage', 
    'do-you-know-local-or-online-resources-to-seek-help-for-a-mental-health-issue' : 'know-mental-health-resources', 
    'if-you-have-been-diagnosed-or-treated-for-a-mental-health-disorder-do-you-ever-reveal-this-to-clients-or-business-contacts' : 'reveal-mental-health-clients', 
    'if-you-have-revealed-a-mental-health-disorder-to-a-client-or-business-contact-how-has-this-affected-you-or-the-relationship' : 'effect-reveal-mental-health', 
    'if-you-have-been-diagnosed-or-treated-for-a-mental-health-disorder-do-you-ever-reveal-this-to-coworkers-or-employees' : 'reveal-mental-health-coworkers', 
    'if-you-have-revealed-a-mental-health-disorder-to-a-coworker-or-employee-how-has-this-impacted-you-or-the-relationship' : 'impact-reveal-mental-health', 
    'do-you-believe-your-productivity-is-ever-affected-by-a-mental-health-issue' : 'productivity-mental-health', 
    'if-yes-what-percentage-of-your-work-time-time-performing-primary-or-secondary-job-functions-is-affected-by-a-mental-health-issue' : 'percentage-affected-mental-health', 
    'do-you-have-previous-employers' : 'previous-employers', 
    'was-your-employer-primarily-a-tech-companyorganization' : 'previous-tech-employer', 
    'have-your-previous-employers-provided-mental-health-benefits' : 'previous-mental-health-benefits', 
    'were-you-aware-of-the-options-for-mental-health-care-provided-by-your-previous-employers' : 'aware-previous-mental-health-options', 
    'did-your-previous-employers-ever-formally-discuss-mental-health-as-part-of-a-wellness-campaign-or-other-official-communication' : 'discussed-mental-health-previous', 
    'did-your-previous-employers-provide-resources-to-learn-more-about-mental-health-disorders-and-how-to-seek-help' : 'previous-mental-health-resources', 
    'was-your-anonymity-protected-if-you-chose-to-take-advantage-of-mental-health-or-substance-abuse-treatment-resources-with-previous-employers' : 'previous-anonymity-protected', 
    'would-you-have-felt-more-comfortable-talking-to-your-previous-employer-about-your-physical-health-or-your-mental-health' : 'previous-comfort-discussing-health', 
    'would-you-have-been-willing-to-discuss-your-mental-health-with-your-direct-supervisors' : 'previous-discuss-mental-health-supervisor', 
    'did-you-ever-discuss-your-mental-health-with-your-previous-employer' : 'previous-discussed-mental-health', 
    'describe-the-conversation-you-had-with-your-previous-employer-about-your-mental-health-including-their-reactions-and-actions-taken-to-address-your-mental-health-issuequestions' : 'previous-mental-health-talk-employer', 
    'would-you-have-been-willing-to-discuss-your-mental-health-with-your-coworkers-at-previous-employers' : 'previous-discuss-mental-health-coworkers', 
    'did-you-ever-discuss-your-mental-health-with-a-previous-coworkers' : 'previous-discussed-mental-health-coworkers', 
    'describe-the-conversation-you-had-with-your-previous-coworkers-about-your-mental-health-including-their-reactions' : 'previous-mental-health-talk-coworkers', 
    'did-you-ever-have-a-previous-coworker-discuss-their-or-another-coworkers-mental-health-with-you' : 'previous-coworker-mental-health-discussion', 
    'describe-the-conversation-your-coworker-had-with-you-about-their-mental-health-please-do-not-use-names1' : 'previous-coworker-mental-health-talk', 
    'overall-how-much-importance-did-your-previous-employer-place-on-physical-health' : 'previous-importance-physical-health', 
    'overall-how-much-importance-did-your-previous-employer-place-on-mental-health' : 'previous-importance-mental-health', 
    'do-you-currently-have-a-mental-health-disorder' : 'current-mental-health-disorder', 
    'have-you-ever-been-diagnosed-with-a-mental-health-disorder' : 'diagnosed-mental-health', 
    'anxiety-disorder-generalized-social-phobia-etc' : 'anxiety-disorder', 
    'mood-disorder-depression-bipolar-disorder-etc' : 'mood-disorder', 
    'psychotic-disorder-schizophrenia-schizoaffective-etc' : 'psychotic-disorder', 
    'eating-disorder-anorexia-bulimia-etc' : 'eating-disorder', 
    'attention-deficit-hyperactivity-disorder' : 'adhd', 
    'personality-disorder-borderline-antisocial-paranoid-etc' : 'personality-disorder', 
    'obsessive-compulsive-disorder' : 'ocd', 
    # NOTE(review): this exact key appears again further down, mapped to
    # 'ptsd1'. In a dict literal the last duplicate wins, so this 'ptsd' value
    # is silently overridden and never used.
    'post-traumatic-stress-disorder' : 'ptsd', 
    'stress-response-syndromes' : 'stress-response', 
    'dissociative-disorder' : 'dissociative-disorder', 
    'substance-use-disorder' : 'substance-use', 
    'addictive-disorder' : 'addictive-disorder', 
    'other' : 'other', 
    'anxiety-disorder-generalized-social-phobia-etc1' : 'anxiety-disorder1', 
    'mood-disorder-depression-bipolar-disorder-etc1' : 'mood-disorder1', 
    'psychotic-disorder-schizophrenia-schizoaffective-etc1' : 'psychotic-disorder1', 
    'eating-disorder-anorexia-bulimia-etc1' : 'eating-disorder1', 
    'attention-deficit-hyperactivity-disorder1' : 'adhd1', 
    'personality-disorder-borderline-antisocial-paranoid-etc1' : 'personality-disorder1', 
    'obsessive-compulsive-disorder1' : 'ocd1', 
    # NOTE(review): duplicate of the un-suffixed PTSD key earlier in this dict;
    # this value ('ptsd1') is the one that takes effect. The neighbouring keys
    # in this group all carry a '1' suffix -- TODO confirm which spelling the
    # cleaned CSV headers actually have before changing it.
    'post-traumatic-stress-disorder' : 'ptsd1', 
    'stress-response-syndromes1' : 'stress-response1', 
    'dissociative-disorder1' : 'dissociative-disorder1', 
    'substance-use-disorder1' : 'substance-use1', 
    'addictive-disorder1' : 'addictive-disorder1', 
    'other1' : 'other1', 
    'anxiety-disorder-generalized-social-phobia-etc2' : 'anxiety-disorder2', 
    'mood-disorder-depression-bipolar-disorder-etc2' : 'mood-disorder2', 
    'psychotic-disorder-schizophrenia-schizoaffective-etc2' : 'psychotic-disorder2', 
    'eating-disorder-anorexia-bulimia-etc2' : 'eating-disorder2', 
    'attention-deficit-hyperactivity-disorder2' : 'adhd2', 
    'personality-disorder-borderline-antisocial-paranoid-etc2' : 'personality-disorder2', 
    'obsessive-compulsive-disorder2' : 'ocd2', 
    'post-traumatic-stress-disorder1' : 'ptsd2', 
    'stress-response-syndromes2' : 'stress-response2', 
    'dissociative-disorder2' : 'dissociative-disorder2', 
    'substance-use-disorder2' : 'substance-use2', 
    'addictive-disorder2' : 'addictive-disorder2', 
    'other2' : 'other2', 
    'have-you-had-a-mental-health-disorder-in-the-past' : 'past-mental-health-disorder', 
    'have-you-ever-sought-treatment-for-a-mental-health-disorder-from-a-mental-health-professional' : 'sought-treatment', 
    'do-you-have-a-family-history-of-mental-illness' : 'family-history-mental-illness', 
    'if-you-have-a-mental-health-disorder-how-often-do-you-feel-that-it-interferes-with-your-work-when-being-treated-effectively' : 'mental-health-interference-treated', 
    'if-you-have-a-mental-health-disorder-how-often-do-you-feel-that-it-interferes-with-your-work-when-not-being-treated-effectively-ie-when-you-are-experiencing-symptoms' : 'mental-health-interference-untreated', 
    'have-your-observations-of-how-another-individual-who-discussed-a-mental-health-issue-made-you-less-likely-to-reveal-a-mental-health-issue-yourself-in-your-current-workplace' : 'observations-mental-health-discussion', 
    'how-willing-would-you-be-to-share-with-friends-and-family-that-you-have-a-mental-illness' : 'share-mental-illness', 
    'would-you-be-willing-to-bring-up-a-physical-health-issue-with-a-potential-employer-in-an-interview' : 'physical-health-interview', 
    'why-or-why-not' : 'physical-health-reason', 
    'would-you-bring-up-your-mental-health-with-a-potential-employer-in-an-interview' : 'mental-health-interview', 
    'why-or-why-not1' : 'mental-health-reason', 
    'are-you-openly-identified-at-work-as-a-person-with-a-mental-health-issue' : 'open-mental-health', 
    'has-being-identified-as-a-person-with-a-mental-health-issue-affected-your-career' : 'mental-health-career', 
    'how-has-it-affected-your-career' : 'mental-health-career-effect', 
    'if-they-knew-you-suffered-from-a-mental-health-disorder-how-do-you-think-that-your-team-membersco-workers-would-react' : 'team-reaction', 
    'have-you-observed-or-experienced-an-unsupportive-or-badly-handled-response-to-a-mental-health-issue-in-your-current-or-previous-workplace' : 'unsupportive-response', 
    'describe-the-circumstances-of-the-badly-handled-or-unsupportive-response' : 'unsupportive-response-details', 
    'have-you-observed-or-experienced-a-supportive-or-well-handled-response-to-a-mental-health-issue-in-your-current-or-previous-workplace' : 'supportive-response', 
    'describe-the-circumstances-of-the-supportive-or-well-handled-response' : 'supportive-response-details', 
    'overall-how-well-do-you-think-the-tech-industry-supports-employees-with-mental-health-issues' : 'industry-support', 
    'briefly-describe-what-you-think-the-industry-as-a-whole-andor-employers-could-do-to-improve-mental-health-support-for-employees' : 'improve-industry-support', 
    'if-there-is-anything-else-you-would-like-to-tell-us-that-has-not-been-covered-by-the-survey-questions-please-use-this-space-to-do-so' : 'additional-comments', 
    'would-you-be-willing-to-talk-to-one-of-us-more-extensively-about-your-experiences-with-mental-health-issues-in-the-tech-industry-note-that-all-interview-responses-would-be-used-anonymously-and-only-with-your-permission' : 'interview-mental-health', 
    'what-is-your-age' : 'age', 
    'what-is-your-gender' : 'gender', 
    'what-country-do-you-live-in' : 'country-live', 
    'what-us-state-or-territory-do-you-live-in' : 'us-state-live', 
    'what-is-your-race' : 'race', 
    # NOTE(review): same target name as the 'other' -> 'other' entry earlier in
    # this dict. If both source columns are present, rename() leaves two
    # columns called 'other' -- TODO confirm whether a distinct name was meant.
    'other3' : 'other', 
    'what-country-do-you-work-in' : 'country-work', 
    'what-us-state-or-territory-do-you-work-in' : 'us-state-work', 
    'have-you-been-diagnosed-with-covid-19' : 'covid-19-diagnosis', 
    'response-type' : 'response-type', 
    'start-date-utc' : 'start-date', 
    'submit-date-utc' : 'submit-date', 
    'network-id' : 'network-id', 
    'tags' : 'tags'
}

def load_survey(year, show_info=True):
    """Load and normalise one year's survey export from ``resources/<year>.csv``.

    Steps, in the same order as the per-year code this replaces:
      1. Read the CSV (``index_col=False``).
      2. Add a ``year`` column holding ``year``.
      3. Slugify every column heading with ``clean_text``.
      4. Strip HTML/Markdown from every string cell via ``clean_row``.
      5. Cast everything to ``str`` and reset the index (``preprocess_df``).
      6. Shorten known column names using ``column_mapping``.

    Args:
        year: Survey year; selects the file and fills the ``year`` column.
        show_info: When true, print the frame summary via ``DataFrame.info()``.

    Returns:
        The cleaned DataFrame for that year.
    """
    survey = pd.read_csv(f"resources/{year}.csv", index_col=False)
    survey["year"] = year
    survey.columns = [clean_text(col) for col in survey.columns]
    survey = preprocess_df(survey.apply(clean_row, axis=1))
    # rename() returns a new frame; assigning it avoids inplace mutation.
    survey = survey.rename(columns=column_mapping)
    if show_info:
        # info() prints its report and returns None, so it is called directly:
        # wrapping it in display() only rendered an extra "None".
        survey.info()
    return survey


# For 2021
mhs_2021 = load_survey(2021)
column_names_2021 = mhs_2021.columns.tolist()
print(len(column_names_2021))
# For 2022
mhs_2022 = load_survey(2022)
column_names_2022 = mhs_2022.columns.tolist()
print(len(column_names_2022))
# For 2023 (no info() summary was shown for this year originally)
mhs_2023 = load_survey(2023, show_info=False)

column_names_2023 = mhs_2023.columns.tolist()
print(len(column_names_2023))

# Print the 2023 column names in groups of chunk_size to keep lines readable.
chunk_size = 20
for i in range(0, len(column_names_2023), chunk_size):
    print(column_names_2023[i:i+chunk_size])

  return BeautifulSoup(text, "html.parser").get_text()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Columns: 125 entries, uniquecolumnheading to year
dtypes: object(125)
memory usage: 128.1+ KB


None

125


  return BeautifulSoup(text, "html.parser").get_text()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Columns: 125 entries, uniquecolumnheading to year
dtypes: object(125)
memory usage: 160.3+ KB


None

125
125
['uniquecolumnheading', 'self-employed', 'num-employees', 'tech-employer', 'tech-role', 'mental-health-benefits', 'know-mental-health-options', 'discussed-mental-health', 'mental-health-resources', 'anonymity-protected', 'mental-health-leave-ease', 'comfort-discussing-health', 'discuss-mental-health-supervisor', 'discussed-mental-health-employer', 'mental-health-talk-employer', 'discuss-mental-health-coworkers', 'discussed-mental-health-coworkers', 'mental-health-talk-coworkers', 'coworker-mental-health-discussion', 'coworker-mental-health-talk']
['importance-physical-health', 'importance-mental-health', 'mental-health-coverage', 'know-mental-health-resources', 'reveal-mental-health-clients', 'effect-reveal-mental-health', 'reveal-mental-health-coworkers', 'impact-reveal-mental-health', 'productivity-mental-health', 'percentage-affected-mental-health', 'previous-employers', 'previous-tech-employer', 'previous-mental-health-benefits', 'aware-previous-mental-health-options', 'dis

  return BeautifulSoup(text, "html.parser").get_text()


In [5]:
# Give each year's rows a year-prefixed label ("2021_0", "2021_1", ...).
mhs_2021, mhs_2022, mhs_2023 = (
    reset_index_with_string(frame, prefix)
    for frame, prefix in (
        (mhs_2021, "2021_"),
        (mhs_2022, "2022_"),
        (mhs_2023, "2023_"),
    )
)


In [6]:
# display(mhs_2021.head())
# display(mhs_2022.head())
# display(mhs_2023.head())

# Stack the three yearly frames row-wise into one frame.
merged_mhs = pd.concat([mhs_2021, mhs_2022, mhs_2023], axis=0)
# convert_dtypes() returns a new frame rather than modifying in place; the
# result has to be assigned or the call has no effect.
merged_mhs = merged_mhs.convert_dtypes()
# NOTE(review): index=False leaves the row labels out of the CSV; confirm
# that dropping them is intended.
merged_mhs.to_csv("resources/merged_mhs.csv", index=False)
# text_cols = []

# def check_if_big_string(col):
#     for x in col.values:
#         if isinstance(x, str) and len(x) > 12:
#             return True
#     return False

# for col in merged_mhs.columns:
#     if check_if_big_string(merged_mhs[col]):
#         text_cols.append(col)
# unique_values_by_col = {}

# for col in merged_mhs.columns:
#     col_series = merged_mhs[col].squeeze()
#     unique_values = col_series.unique()
#     unique_values_by_col[col] = unique_values.tolist()

# for col, values in unique_values_by_col.items():
#     print(f"{col}: {values}")

# print(f"{len(text_cols)}: ", text_cols)
# merged_mhs.head(25)