<a href="https://colab.research.google.com/github/coding-geek21/50projects50days/blob/master/Mental_Health_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mental Health Prediction**

### **Steps Involved :**


1.   Importing the Necessary Libraries
2.   Loading the Dataset
3.   Data Cleaning
4.   Data Encoding



## **1) Importing the Necessary Libraries**

In [79]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import randint
from scipy import stats

#For pre-processing 
from sklearn.datasets import make_classification
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#For training the models and building models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

#Naive bayes
from sklearn.naive_bayes import GaussianNB 

#Validation libraries
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve
from sklearn.model_selection import cross_val_score

#For Neural Network
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV

#Bagging & Stacking 
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

## **2) Loading the Dataset**

In [80]:
# Load the survey dataset (downloaded from Kaggle) from the working directory.
df = pd.read_csv('survey.csv')

# Dataset size as (rows, columns)
print(df.shape)

# Summary statistics of the numeric columns
print(df.describe())

# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
df.info()

(1259, 27)
                Age
count  1.259000e+03
mean   7.942815e+07
std    2.818299e+09
min   -1.726000e+03
25%    2.700000e+01
50%    3.100000e+01
75%    3.600000e+01
max    1.000000e+11
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-n

## **3) Data Cleaning**

In [81]:
# Boolean mask of the missing cells, computed once and reused below.
null_flags = df.isnull()

# Number of missing values per column, largest first.
print("\nMissing Data\n")
missing_data = null_flags.sum().sort_values(ascending=False)
print(missing_data)

# Share of missing values per column (missing cells / rows), largest first.
# Note: despite the heading this is a fraction between 0 and 1, not a value
# scaled to 100.
print("\nMissing Percentage\n")
missing_per_col = null_flags.sum()
rows_per_col = null_flags.count()
percent = (missing_per_col / rows_per_col).sort_values(ascending=False)
print(percent)



Missing Data

comments                     1095
state                         515
work_interfere                264
self_employed                  18
seek_help                       0
obs_consequence                 0
mental_vs_physical              0
phys_health_interview           0
mental_health_interview         0
supervisor                      0
coworkers                       0
phys_health_consequence         0
mental_health_consequence       0
leave                           0
anonymity                       0
Timestamp                       0
wellness_program                0
Age                             0
benefits                        0
tech_company                    0
remote_work                     0
no_employees                    0
treatment                       0
family_history                  0
Country                         0
Gender                          0
care_options                    0
dtype: int64

Missing Percentage

comments                     0.86

In [82]:
# Remove the columns that are not considered for prediction:
# comments, state, Timestamp and Country.
# errors='ignore' keeps the cell re-runnable: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
unused_cols = ['comments', 'state', 'Timestamp', 'Country']
df = df.drop(columns=unused_cols, errors='ignore')

df.head(10)

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,,No,Yes,Often,6-25,No,Yes,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,M,,No,No,Rarely,More than 1000,No,No,Don't know,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,,No,No,Rarely,6-25,No,Yes,No,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,,Yes,Yes,Often,26-100,No,Yes,No,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,,No,No,Never,100-500,Yes,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No
5,33,Male,,Yes,No,Sometimes,6-25,No,Yes,Yes,...,Don't know,Don't know,No,No,Yes,Yes,No,Maybe,Don't know,No
6,35,Female,,Yes,Yes,Sometimes,1-5,Yes,Yes,No,...,No,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No
7,39,M,,No,No,Never,1-5,Yes,Yes,No,...,Yes,Don't know,No,No,No,No,No,No,No,No
8,42,Female,,Yes,Yes,Sometimes,100-500,No,Yes,Yes,...,No,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No
9,23,Male,,No,No,Never,26-100,No,Yes,Don't know,...,Don't know,Don't know,No,No,Yes,Yes,Maybe,Maybe,Yes,No


In [83]:
# Replace NaN in the listed columns with a default value chosen by column type.
int_cols = ['Age']

string_cols = ['Gender', 'self_employed', 'family_history', 'treatment', 'work_interfere',
                 'no_employees', 'remote_work', 'tech_company', 'anonymity', 'leave', 'mental_health_consequence',
                 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview',
                 'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options', 'wellness_program',
                 'seek_help']
float_cols = []

# Default fill value per type. `String` is the placeholder that later cells
# look for when they give missing answers a meaningful value.
Int, String, Float = 0, 'NaN', 0.0

# Column name -> fill value. Merged so that, should a column ever be listed
# twice, int takes precedence over string, and string over float.
fill_value_by_col = {
    **dict.fromkeys(float_cols, Float),
    **dict.fromkeys(string_cols, String),
    **dict.fromkeys(int_cols, Int),
}

# Columns that appear in none of the lists are left untouched.
for col in df.columns:
    if col in fill_value_by_col:
        df[col] = df[col].fillna(fill_value_by_col[col])
df.head(5)

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,,No,Yes,Often,6-25,No,Yes,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,M,,No,No,Rarely,More than 1000,No,No,Don't know,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,,No,No,Rarely,6-25,No,Yes,No,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,,Yes,Yes,Often,26-100,No,Yes,No,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,,No,No,Never,100-500,Yes,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


In [84]:
# Gender answers are spelled in many different ways (e.g. "Male", "M", "f",
# "Female"). Collapse them into three canonical labels: 'male', 'female' and
# 'trans'.

# Distinct raw answers, captured before normalisation.
gender = df['Gender'].unique()

# Known spellings per label; they are matched against the lower-cased answer.
male = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "Cis Male", "cis male"]
trans = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", "female (trans)", "queer", "ostensibly male, unsure what that really means"]
female = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]

# Lower-cased spelling -> canonical label. Keys are lower-cased here because
# the lookup is done on lower-cased answers (an entry such as "Cis Male" could
# otherwise never match).
gender_lookup = {
    spelling.lower(): label
    for label, spellings in (('male', male), ('female', female), ('trans', trans))
    for spelling in spellings
}

# Single vectorised pass. This replaces a per-row loop that called
# df['Gender'].replace(..., inplace=True) for every row: that was quadratic in
# the number of rows, and in-place mutation of a selected column does not
# update df when pandas Copy-on-Write is active.
raw_gender = df['Gender']
canonical_gender = raw_gender.str.lower().map(gender_lookup)
# Answers that match no known spelling keep their original text.
df['Gender'] = canonical_gender.fillna(raw_gender)

# Drop the rows whose answer is one of these unusable entries.
# .copy() makes df an independent frame, so later column assignments do not
# operate on a slice of the previous one.
stk_list = ['A little about you', 'p']
df = df[~df['Gender'].isin(stk_list)].copy()

print(df['Gender'].unique())

['female' 'male' 'trans']


In [85]:
# Clean the Age column: every value outside 18-120 (inclusive), as well as any
# missing value, is replaced by the median age.
# The median is computed once, from the in-range ages only, so the invalid
# entries cannot influence the imputed value and both ends of the range are
# filled with the same number.
# The write goes through df.loc instead of fillna(inplace=True) on a selected
# column or an aliased Series, which do not reliably update df.
MIN_AGE, MAX_AGE = 18, 120
plausible_age = df['Age'].between(MIN_AGE, MAX_AGE)  # NaN compares as False
median_age = df.loc[plausible_age, 'Age'].median()
df.loc[~plausible_age, 'Age'] = median_age

# Bucket the cleaned ages into ranges.
# NOTE(review): ages 101-120 pass the filter above but fall outside the last
# bin and would get a missing age_range - confirm the intended upper bound.
df['age_range'] = pd.cut(df['Age'], [0,20,30,65,100], labels=["0-20", "21-30", "31-65", "66-100"], include_lowest=True)


In [86]:
# Treat a missing self_employed answer as "not self-employed".
# Missing answers were filled with the `String` placeholder earlier, so mapping
# that placeholder to 'No' leaves the column with the values printed below.
df['self_employed'] = df['self_employed'].replace({String: 'No'})
print(df['self_employed'].unique())

['No' 'Yes']


In [87]:
# Treat a missing work_interfere answer as "Don't know".
# Missing answers were filled with the `String` placeholder earlier; map that
# placeholder to the explicit "Don't know" category.
df['work_interfere'] = df['work_interfere'].replace({String: "Don't know"})
print(df['work_interfere'].unique())

['Often' 'Rarely' 'Never' 'Sometimes' "Don't know"]


## **4) Data Encoding**

In [88]:
# Encode every column as integer codes with scikit-learn's LabelEncoder.
# With label encoding the k distinct values of a column are mapped to the
# codes 0 .. k-1, following the sorted order of the values.
# NOTE: the loop covers all columns, so the numeric Age column is encoded as
# well and afterwards holds each age's position among the distinct ages
# rather than the age itself.
# df is overwritten in place, so this cell is meant to run once per freshly
# cleaned df; a second run would record the integer codes in labelDict instead
# of the original labels.

# 'label-<column>' -> original values of that column, ordered by assigned code
labelDict = {}
for col in df.columns:
    le = preprocessing.LabelEncoder()
    df[col] = le.fit_transform(df[col])
    labelDict['label-' + col] = list(le.classes_)

for label_name, original_values in labelDict.items():
    print(label_name, original_values)

df.head()



label-Age [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 60, 61, 62, 65, 72]
label-Gender ['female', 'male', 'trans']
label-self_employed ['No', 'Yes']
label-family_history ['No', 'Yes']
label-treatment ['No', 'Yes']
label-work_interfere ["Don't know", 'Never', 'Often', 'Rarely', 'Sometimes']
label-no_employees ['1-5', '100-500', '26-100', '500-1000', '6-25', 'More than 1000']
label-remote_work ['No', 'Yes']
label-tech_company ['No', 'Yes']
label-benefits ["Don't know", 'No', 'Yes']
label-care_options ['No', 'Not sure', 'Yes']
label-wellness_program ["Don't know", 'No', 'Yes']
label-seek_help ["Don't know", 'No', 'Yes']
label-anonymity ["Don't know", 'No', 'Yes']
label-leave ["Don't know", 'Somewhat difficult', 'Somewhat easy', 'Very difficult', 'Very easy']
label-mental_health_consequence ['Maybe', 'No', 'Yes']
label-phys_health_consequence ['Maybe', 'No', 'Yes']
label-cow

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,age_range
0,19,0,0,0,1,2,4,0,1,2,...,2,1,1,1,2,1,0,2,0,2
1,26,1,0,0,0,3,5,0,0,0,...,0,0,1,0,0,1,1,0,0,2
2,14,1,0,0,0,3,4,0,1,1,...,1,1,1,2,2,2,2,1,0,2
3,13,1,0,1,1,2,2,0,1,1,...,1,2,2,1,0,0,0,1,1,2
4,13,1,0,0,0,1,1,1,1,2,...,0,1,1,1,2,2,2,0,0,2


In [89]:
# Testing the Change in GitHub