In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
# Suppress every warning for the rest of the session. Note that this blanket
# filter also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Record the installed scikit-learn release and echo it so the run is traceable
sklearn_version = sklearn.__version__
version_banner = f"scikit-learn version: {sklearn_version}"
print(version_banner)

scikit-learn version: 1.4.2


### 1. Data Loading and Initial Exploration

In [3]:
# Read the raw survey responses; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='mental-heath-in-tech-2016_20161114.csv')

In [4]:
# Display basic information about the dataset (index range, column names,
# non-null counts and dtypes).
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own: wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1433 entries, 0 to 1432
Data columns (total 63 columns):
 #   Column                                                                                                                                                                            Non-Null Count  Dtype  
---  ------                                                                                                                                                                            --------------  -----  
 0   Are you self-employed?                                                                                                                                                            1433 non-null   int64  
 1   How many employees does your company or organization have?                                                                                                                        1146 non-null   object 
 2   Is your employer primarily a tech company/organization?     

In [5]:
# Preview the first five rows of the raw frame
print(df.head(n=5))

   Are you self-employed?  \
0                       0   
1                       0   
2                       0   
3                       1   
4                       0   

  How many employees does your company or organization have?  \
0                                             26-100           
1                                               6-25           
2                                               6-25           
3                                                NaN           
4                                               6-25           

   Is your employer primarily a tech company/organization?  \
0                                                1.0         
1                                                1.0         
2                                                1.0         
3                                                NaN         
4                                                0.0         

   Is your primary role within your company related to tech/IT?  \


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) as produced by
# DataFrame.describe() with its default settings
numeric_summary = df.describe()
print(numeric_summary)

       Are you self-employed?  \
count             1433.000000   
mean                 0.200279   
std                  0.400349   
min                  0.000000   
25%                  0.000000   
50%                  0.000000   
75%                  0.000000   
max                  1.000000   

       Is your employer primarily a tech company/organization?  \
count                                        1146.000000         
mean                                            0.770506         
std                                             0.420691         
min                                             0.000000         
25%                                             1.000000         
50%                                             1.000000         
75%                                             1.000000         
max                                             1.000000         

       Is your primary role within your company related to tech/IT?  \
count                               

In [7]:
# Count the missing entries in each column
print(df.isna().sum())

Are you self-employed?                                                                  0
How many employees does your company or organization have?                            287
Is your employer primarily a tech company/organization?                               287
Is your primary role within your company related to tech/IT?                         1170
Does your employer provide mental health benefits as part of healthcare coverage?     287
                                                                                     ... 
What US state or territory do you live in?                                            593
What country do you work in?                                                            0
What US state or territory do you work in?                                            582
Which of the following best describes your work position?                               0
Do you work remotely?                                                                   0
Length: 63

**Explanation:**  
We start by loading the dataset and performing initial exploratory data analysis.  
This helps us understand the structure of the data, including the number of features, data types, and presence of missing values.  

### 2. Data Preprocessing

In [8]:
# Start from every column name in the raw frame as a candidate feature
relevant_features = list(df.columns)

In [9]:
# Drop non-informative columns from the candidate list (adjust the names to
# match the dataset in use).
# NOTE(review): confirm that 'Timestamp' and 'comments' actually occur in this
# file's header; if they do not, this filter leaves the list unchanged.
relevant_features = list(
    filter(lambda name: name not in ('Timestamp', 'comments'), relevant_features)
)

In [10]:
# Keep all rows, restricted to the retained feature columns
df_selected = df.loc[:, relevant_features]

In [11]:
# Split the selected columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
numeric_features = list(df_selected.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(df_selected.select_dtypes(include=['object']).columns)

In [12]:
# Show which columns were routed to the numeric branch
print("Numeric features:", numeric_features)

Numeric features: ['Are you self-employed?', 'Is your employer primarily a tech company/organization?', 'Is your primary role within your company related to tech/IT?', 'Do you have medical coverage (private insurance or state-provided) which includes treatment of \xa0mental health issues?', 'Do you have previous employers?', 'Have you ever sought treatment for a mental health issue from a mental health professional?', 'What is your age?']


In [13]:
# Show which columns were routed to the categorical branch
print("Categorical features:", categorical_features)

Categorical features: ['How many employees does your company or organization have?', 'Does your employer provide mental health benefits as part of healthcare coverage?', 'Do you know the options for mental health care available under your employer-provided coverage?', 'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?', 'Does your employer offer resources to learn more about mental health concerns and options for seeking help?', 'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?', 'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:', 'Do you think that discussing a mental health disorder with your employer would have negative consequences?', 'Do you think that discussing a physical health issue with your employer would have negative consequences?', 'Would

In [14]:
# Numeric branch: fill missing values with the column median, then apply
# StandardScaler with its default settings.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [15]:
# Create the OneHotEncoder with whichever dense-output keyword this
# scikit-learn release accepts ('sparse_output' in newer releases, 'sparse' in
# older ones).
# The keyword is detected from the encoder's own constructor parameters rather
# than by comparing version strings: string comparison is lexicographic, so a
# release such as "1.10.0" would compare as older than "1.2" and select the
# wrong keyword.
dense_kwarg = 'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'
onehot_encoder = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    **{dense_kwarg: False},  # request a dense array instead of a sparse matrix
)

In [16]:
# Categorical branch: fill missing values with each column's most frequent
# value, then one-hot encode with the encoder configured above.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [17]:
# Route each column group through its own pipeline and concatenate the results
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [18]:
# Learn the imputation/scaling/encoding parameters from the selected columns
# and apply them in a single pass.
# NOTE(review): despite the df_ prefix, the result is presumably an array
# rather than a DataFrame — confirm before using DataFrame methods on it.
df_preprocessed = preprocessor.fit_transform(X=df_selected)

In [19]:
# Report the dimensions of the preprocessed feature matrix
print("Shape after preprocessing:", df_preprocessed.shape)

Shape after preprocessing: (1433, 3160)


**Explanation:**  
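We identified the columns actually present in the dataset, split them into numeric and categorical features, and applied preprocessing suited to each type: median imputation followed by standardization for numeric columns, and most-frequent imputation followed by one-hot encoding for categorical columns. The resulting feature matrix has 1,433 rows and 3,160 columns.  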