In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the survey responses; the path is relative to the notebook's working directory.
df = pd.read_csv('survey.csv')

In [None]:
# Preview the first 20 rows of the loaded data.
df.head(20)

In [None]:
# Summary statistics of df's columns.
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

In [None]:
# Number of missing values per column, most affected columns first.
df.isnull().sum().sort_values(ascending=False)

### DATA PREPROCESSING

In [None]:
# Dropping unnecessary features: 'comments', 'state', 'Country' and 'Timestamp'.
# One non-inplace drop removes all four columns atomically: if any name is
# missing, KeyError is raised and df is left untouched instead of ending up
# with only some of the columns removed.

df = df.drop(columns=['comments', 'state', 'Country', 'Timestamp'])



In [None]:
# Distinct raw values found in the 'Gender' column.
df['Gender'].unique()

In [None]:
# How often each raw 'Gender' value occurs.
df['Gender'].value_counts()

In [None]:
# Bar chart of the raw 'Gender' value counts.
df['Gender'].value_counts().plot(kind = 'bar',figsize=(8,8))

In [None]:
# Getting rid of unnecessary values in feature 'Gender':
# every spelling variant is collapsed into 'Male', 'Female' or 'Other'.

male_aliases = ['male','M','m','Make','Man','Mair','Guy (-ish) ^_^',
                'male leaning androgynous','Cis Man','msle','cis male','Mail',
                'Androgyne','Male (CIS)','Male-ish','maile','something kinda male?',
                'Mal','ostensibly male, unsure what that really means','Malr',
                'Cis Male','Male ']

female_aliases = ['female','F','f','Female ','Female (cis)','Femail',
                  'Femake', 'woman','Cis Female','femail','Woman',
                  'cis-female/femme']

other_aliases = ['Trans-female','queer/she/they', 'non-binary', 'Nah',
                 'All', 'Enby', 'fluid', 'Genderqueer', 'Agender', 'Trans woman',
                 'Neuter', 'Female (trans)', 'queer']

# The result is assigned back to the column. Calling
# df['Gender'].replace(..., inplace=True) modifies a temporary selection
# (chained assignment): pandas deprecates it and, with Copy-on-Write enabled,
# df itself would not be updated.
df['Gender'] = (df['Gender']
                .replace(male_aliases, 'Male')
                .replace(female_aliases, 'Female')
                .replace(other_aliases, 'Other'))

# Rows whose 'Gender' entry is one of these junk answers are removed entirely.
# .copy() makes the filtered frame independent, so later column assignments
# on df do not raise SettingWithCopyWarning.
stk_list = ['A little about you', 'p']
df = df[~df['Gender'].isin(stk_list)].copy()

In [None]:
# Check which 'Gender' values remain after the clean-up.
df['Gender'].unique()

In [None]:
# Bar chart of the 'Gender' value counts after the clean-up.
df['Gender'].value_counts().plot(kind = 'bar',figsize=(8,8))

In [None]:
# Bar chart of how often each distinct 'Age' value occurs.
df['Age'].value_counts().plot(kind = 'bar',figsize=(8,8))

In [None]:
# Frequency of each distinct 'Age' value, most frequent first.
df['Age'].value_counts()

In [None]:
# Range of the data: sorting the counts by age value puts the smallest and
# largest ages at the two ends of the listing.

df['Age'].value_counts().sort_index()

In [None]:
# Mean age over respondents aged 18 to 75 (both bounds included),
# truncated to a whole number.

in_range = df['Age'].between(18, 75)
mean_age = df.loc[in_range, 'Age'].mean().astype(int)
mean_age

In [None]:
# Fixing wrong data in feature 'Age'
#
# Every age outside the 18-75 range (the range mean_age was computed over) is
# replaced by mean_age. A range test is used instead of a hard-coded list of
# known bad values (99999999999, 329, -29, -1726, 5, 11), so no other
# out-of-range entry can slip through. Missing ages compare False on both
# sides and are left as they are.
# The masked column is assigned back rather than modified through
# df['Age'].replace(..., inplace=True), which is chained assignment.

age_out_of_range = (df['Age'] < 18) | (df['Age'] > 75)
df['Age'] = df['Age'].mask(age_out_of_range, mean_age)


df['Age'].value_counts()

In [None]:
# Bucket 'Age' into four ranges and plot how many respondents fall in each.
# include_lowest=True makes the first bucket contain the lower edge (18) too.

bins = [18, 25, 35, 45, 75]
age_cats = ["18-25", "25-35", "35-45", ">45"]
Age_Range = pd.cut(
    df['Age'],
    bins=bins,
    labels=age_cats,
    include_lowest=True,
)
Age_Range.value_counts().plot(kind='bar', figsize=(8, 8))

# Filling Missing Data


In [None]:
# Missing values per column after the clean-up steps so far.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Distribution of the non-missing 'self_employed' answers.
df['self_employed'].value_counts()

In [None]:
# Missing 'self_employed' answers are filled with 'No'.
# The filled column is assigned back: fillna(..., inplace=True) on the
# df['self_employed'] selection is chained assignment, which pandas deprecates
# and which does not update df when Copy-on-Write is enabled.
df['self_employed'] = df['self_employed'].fillna('No')

In [None]:
# Distribution of the non-missing 'work_interfere' answers.
df['work_interfere'].value_counts()

In [None]:
# Missing 'work_interfere' answers become their own category, 'Not Sure'.
# The filled column is assigned back: fillna(..., inplace=True) on the
# df['work_interfere'] selection is chained assignment, which pandas deprecates
# and which does not update df when Copy-on-Write is enabled.
df['work_interfere'] = df['work_interfere'].fillna('Not Sure')

In [None]:
# Distribution of 'work_interfere' after filling the missing answers.
df['work_interfere'].value_counts()

In [None]:
# Re-count missing values per column after the fills above.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Column dtypes and non-null counts after preprocessing.
df.info()

# Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale 'Age' with MinMaxScaler (default feature_range, i.e. [0, 1]).
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split performed later in the notebook, so test rows influence
# the scaling (data leakage). Consider fitting on the training rows only.
scaler = MinMaxScaler()
df['Age'] = scaler.fit_transform(df[['Age']])

In [None]:
# Preview the first 10 rows after scaling 'Age'.
df.head(10)

# Preprocessing categorical features


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Separating input features (X) from the target (y)

y = df['treatment']
X = df.drop(columns='treatment')

In [None]:
# Columns that get ordinal-encoded; every column not listed here is passed
# through unchanged (remainder='passthrough').
ordinal_columns = [
    'Gender', 'self_employed', 'family_history', 'work_interfere',
    'no_employees', 'remote_work', 'tech_company', 'benefits',
    'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
    'mental_health_consequence', 'phys_health_consequence', 'coworkers',
    'supervisor', 'mental_health_interview', 'phys_health_interview',
    'mental_vs_physical', 'obs_consequence',
]

transformer = ColumnTransformer(
    [('ordinal_encoder', OrdinalEncoder(), ordinal_columns)],
    remainder='passthrough',
)

In [None]:
# Fit the column transformer and replace X with its encoded output.
# NOTE(review): X is rebound from the feature DataFrame to the transformer's
# output, so this cell is not safe to re-run on its own; re-run the cell that
# builds X from df first.
X = transformer.fit_transform(X)

In [None]:
# Display the encoded feature matrix.
X

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the target labels as integers.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
# Display the integer-encoded target.
y

# Splitting DataSet Into Train/Test Set


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as a test set; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Sanity check: shapes of the train/test features and targets.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# **Evaluating Models**



## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default hyper-parameters, fitted on the training set.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score,roc_curve, roc_auc_score,confusion_matrix,precision_score,recall_score,precision_recall_curve,f1_score
from sklearn.model_selection import cross_val_score,cross_val_predict

### Accuracy

In [None]:
# Accuracy of logistic regression on each of 3 cross-validation folds of the training set.
cross_val_score(log_reg, X_train, y_train, cv=3, scoring='accuracy')

### Confusion Matrix

In [None]:
# Out-of-fold predictions for every training sample (3-fold cross-validation).
y_pred_log = cross_val_predict(log_reg, X_train, y_train, cv=3)

In [None]:
# Confusion matrix of the true training labels vs. the out-of-fold predictions
# (rows: true class, columns: predicted class).
confusion_matrix_log = confusion_matrix(y_train, y_pred_log)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the confusion matrix.
# confusion_matrix(y_true, y_pred) puts the TRUE classes on the rows and the
# PREDICTED classes on the columns; heatmap draws rows along the y-axis and
# columns along the x-axis, so "actual" goes on y and "predicted" on x.
plt.figure(figsize=(6, 4))
sns.set(font_scale=1.0)
sns.heatmap(confusion_matrix_log, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

### Precision/Recall/F1 Score

In [None]:
# Precision of the out-of-fold logistic-regression predictions.
precision_score(y_train, y_pred_log)

In [None]:
# Recall of the out-of-fold logistic-regression predictions.
recall_score(y_train, y_pred_log)

In [None]:
# F1 score of the out-of-fold logistic-regression predictions.
f1_score(y_train, y_pred_log)

### Precision-Recall Curve

In [None]:
# Out-of-fold decision_function scores (continuous scores, not hard labels)
# for the training set; they feed the precision-recall and ROC curves.
y_scores = cross_val_predict(log_reg, X_train, y_train, cv=3, method='decision_function')

In [None]:
# Precision and recall for each candidate decision threshold over y_scores.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

In [None]:
# Compare the array lengths: the plot of these arrays slices precisions and
# recalls with [:-1] so they line up with thresholds.
print(len(precisions))
print(len(recalls))
print(len(thresholds))

In [None]:
# Precision and recall of logistic regression as functions of the decision
# threshold. The last precision/recall value has no matching threshold, hence
# the [:-1] slices. Axis labels and a title let the figure stand on its own.
plt.plot(thresholds, precisions[:-1], 'b-', label='Precision')
plt.plot(thresholds, recalls[:-1], 'r--', label='Recall')
plt.xlabel('Decision threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs. Threshold (Logistic Regression)')
plt.legend(loc='center left')
plt.show()

### ROC Curve

In [None]:
# False-positive rate, true-positive rate and thresholds of the ROC curve for
# the logistic-regression scores.
# NOTE(review): this rebinds `thresholds`, which until here held the
# precision-recall thresholds; re-running the precision/recall plot cell
# after this one would pair it with the wrong array.
fpr, tpr, thresholds = roc_curve(y_train, y_scores)

In [None]:
# ROC curve of logistic regression against the diagonal of a random classifier.
# The curve is built from the logistic-regression scores, so it is labelled
# accordingly (it was previously labelled 'SGD').
plt.plot(fpr, tpr, 'b-', label='Logistic Regression')
plt.plot([0, 1], [0, 1], 'g-', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='center right')
plt.show()

In [None]:
# Area under the ROC curve for the logistic-regression scores.
roc_auc_score(y_train,  y_scores)

### Prediction Accuracy

In [None]:
# Accuracy of the fitted logistic regression on the held-out test set.
y_pred_actual_log_reg = log_reg.predict(X_test)
accuracy = accuracy_score(y_test,y_pred_actual_log_reg)
accuracy

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree, and every score derived from it,
# reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train,y_train)


### Accuracy

In [None]:
# Accuracy of the decision tree on each of 3 cross-validation folds of the training set.
cross_val_score(decision_tree, X_train, y_train, cv=3, scoring='accuracy')

### Confusion Matrix

In [None]:
# Out-of-fold decision-tree predictions for every training sample (3-fold CV).
y_pred_dec_tree = cross_val_predict(decision_tree, X_train, y_train, cv=3)

In [None]:
# Confusion matrix of the true training labels vs. the out-of-fold
# decision-tree predictions (rows: true class, columns: predicted class).
confusion_matrix_dec_tree = confusion_matrix(y_train, y_pred_dec_tree)

In [None]:
# Plot the confusion matrix of the decision tree.
# confusion_matrix(y_true, y_pred) puts the TRUE classes on the rows and the
# PREDICTED classes on the columns; heatmap draws rows along the y-axis and
# columns along the x-axis, so "actual" goes on y and "predicted" on x.
plt.figure(figsize=(6, 4))
sns.set(font_scale=1.0)
sns.heatmap(confusion_matrix_dec_tree, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

### Precision/Recall/F1 score

In [None]:
# Precision of the out-of-fold decision-tree predictions.
precision_score(y_train, y_pred_dec_tree)

In [None]:
# Recall of the out-of-fold decision-tree predictions.
recall_score(y_train, y_pred_dec_tree)

In [None]:
# F1 score of the out-of-fold decision-tree predictions.
f1_score(y_train, y_pred_dec_tree)

### Precision-Recall Curve

In [None]:
# Out-of-fold class-probability estimates of the decision tree (one column per class).
y_probas_dec = cross_val_predict(decision_tree, X_train, y_train, cv=3, method='predict_proba')

In [None]:
# Use the second probability column (class 1) as the score.
y_scores_dec = y_probas_dec[:, 1]

In [None]:
# Precision and recall for each candidate threshold over the decision-tree scores.
precisions_dec, recall_dec, thresholds_dec = precision_recall_curve(y_train, y_scores_dec)

In [None]:
# Precision and recall of the decision tree as functions of the decision
# threshold. The last precision/recall value has no matching threshold, hence
# the [:-1] slices. Axis labels and a title let the figure stand on its own.
plt.plot(thresholds_dec, precisions_dec[:-1], 'b-', label='Precision')
plt.plot(thresholds_dec, recall_dec[:-1], 'r--', label='Recall')
plt.xlabel('Decision threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs. Threshold (Decision Tree)')
plt.legend(loc='center left')
plt.show()

### ROC Curve

In [None]:
# ROC curve points for the decision-tree scores.
# NOTE(review): this rebinds `thresholds_dec`, which until here held the
# precision-recall thresholds; re-running the precision/recall plot cell
# after this one would pair it with the wrong array.
fpr_dec, tpr_dec, thresholds_dec = roc_curve(y_train, y_scores_dec)

In [None]:
# ROC curves of logistic regression and the decision tree against the
# diagonal of a random classifier; labelled axes and a title let the figure
# stand on its own.
plt.plot(fpr, tpr, 'b-', label='Logistic Regression')
plt.plot(fpr_dec, tpr_dec, 'r-', label='Decision Tree')
plt.plot([0, 1], [0, 1], 'g-', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='center right')
plt.show()

In [None]:
# Area under the ROC curve for the decision-tree scores.
roc_auc_score(y_train,  y_scores_dec)

### Prediction Accuracy


In [None]:
# Accuracy of the fitted decision tree on the held-out test set.
y_pred_actual_decision_tree = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test,y_pred_actual_decision_tree)
accuracy

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# A fixed random_state makes the fitted forest, and every score derived from
# it, reproducible across runs.
rand_for = RandomForestClassifier(random_state=42)
rand_for.fit(X_train, y_train)

### Accuracy

In [None]:
# Accuracy of the random forest on each of 3 cross-validation folds of the training set.
cross_val_score(rand_for, X_train, y_train, cv=3, scoring='accuracy')

### Confusion Matrix

In [None]:
# Out-of-fold random-forest predictions for every training sample (3-fold CV).
y_pred_rand_for = cross_val_predict(rand_for, X_train, y_train, cv=3)

In [None]:
# Confusion matrix of the true training labels vs. the out-of-fold
# RANDOM-FOREST predictions (y_pred_rand_for, not the decision tree's
# y_pred_dec_tree). Rows: true class, columns: predicted class.
confusion_matrix_rand_for = confusion_matrix(y_train, y_pred_rand_for)

In [None]:
# Plot the confusion matrix of the random forest.
# confusion_matrix(y_true, y_pred) puts the TRUE classes on the rows and the
# PREDICTED classes on the columns; heatmap draws rows along the y-axis and
# columns along the x-axis, so "actual" goes on y and "predicted" on x.
plt.figure(figsize=(6, 4))
sns.set(font_scale=1.0)
sns.heatmap(confusion_matrix_rand_for, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

### Precision/Recall/F1 Score


In [None]:
# Precision of the out-of-fold random-forest predictions.
precision_score(y_train, y_pred_rand_for)

In [None]:
# Recall of the out-of-fold random-forest predictions.
recall_score(y_train, y_pred_rand_for)

In [None]:
# F1 score of the out-of-fold random-forest predictions.
f1_score(y_train, y_pred_rand_for)

### Precision-Recall Curve

In [None]:
# Out-of-fold class-probability estimates of the random forest (one column per class).
y_probas = cross_val_predict(rand_for, X_train, y_train, cv=3, method='predict_proba')

In [None]:
# Use the second probability column (class 1) as the score.
y_scores_rand = y_probas[:, 1]

In [None]:
# Precision and recall for each candidate threshold over the random-forest scores.
precisions_rand, recall_rand, thresholds_rand = precision_recall_curve(y_train, y_scores_rand)

In [None]:
# Precision and recall of the random forest as functions of the decision
# threshold. The last precision/recall value has no matching threshold, hence
# the [:-1] slices. Axis labels and a title let the figure stand on its own.
plt.plot(thresholds_rand, precisions_rand[:-1], 'b-', label='Precision')
plt.plot(thresholds_rand, recall_rand[:-1], 'r--', label='Recall')
plt.xlabel('Decision threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs. Threshold (Random Forest)')
plt.legend(loc='center left')
plt.show()

In [None]:
# Index of the first entry of recall_rand that is below 0.9
# (np.argmax on a boolean array returns the position of the first True,
# or 0 when there is none).
i = np.argmax(recall_rand < 0.9)

In [None]:
# Recall at index i: the first value below 0.9.
recall_rand[i]

In [None]:
# Recall one position earlier, i.e. just before it drops below 0.9.
recall_rand[i-1]

In [None]:
# Show the index found above.
i

In [None]:
# Threshold at the position just before recall falls below 0.9
# (assumes recall_rand is non-increasing along the array — TODO confirm).
threshold_90_recall = thresholds_rand[i - 1]

In [None]:
# Boolean predictions: positive when the random-forest score reaches the chosen threshold.
y_preds_90_recall = y_scores_rand >= threshold_90_recall

In [None]:
# Precision obtained at the chosen threshold.
precision_score(y_train, y_preds_90_recall)

In [None]:
# Recall obtained at the chosen threshold.
recall_score(y_train, y_preds_90_recall)

In [None]:
# Accuracy of the fitted random forest on the held-out test set.
# NOTE(review): the same three statements appear again in a later cell of
# this notebook; one of the two copies looks redundant.
y_pred_actual_random_forest = rand_for.predict(X_test)
accuracy = accuracy_score(y_test,y_pred_actual_random_forest)
accuracy

### ROC Curve

In [None]:
# ROC curve points for the random-forest scores.
# NOTE(review): this rebinds `thresholds_rand`, which the precision/recall
# plot and the threshold_90_recall lookup read; re-running those cells after
# this one would use the ROC thresholds instead.
fpr_rand, tpr_rand, thresholds_rand = roc_curve(y_train, y_scores_rand)

In [None]:
# ROC curves of all three models against the diagonal of a random classifier;
# labelled axes and a title let the figure stand on its own.
plt.plot(fpr, tpr, 'b-', label='Logistic Regression')
plt.plot(fpr_dec, tpr_dec, 'r-', label='Decision Tree')
plt.plot(fpr_rand, tpr_rand, 'y-', label='RandomForest')
plt.plot([0, 1], [0, 1], 'g-', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='center right')
plt.show()

In [None]:
# Area under the ROC curve for the random-forest scores.
roc_auc_score(y_train,  y_scores_rand)

### Prediction Accuracy

In [None]:
# Accuracy of the fitted random forest on the held-out test set.
y_pred_actual_random_forest = rand_for.predict(X_test)
accuracy = accuracy_score(y_test,y_pred_actual_random_forest)
accuracy

In [None]:
# Pick the test sample at position 5 for a single-sample spot check.
a = X_test[5,:]

In [None]:
# True label of that same test sample.
y_test[5]

In [None]:
# predict expects a 2-D input, so the single sample is reshaped to one row.
rand_for.predict(a.reshape(1,-1))

array([1])

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the random-forest grid search.
# Size: 3 * 4 * 3 * 3 * 3 * 2 * 2 * 2 = 2592 parameter combinations; each one
# is fitted once per cross-validation fold, so the search is expensive.
param_grid = {
    'n_estimators': [50,100,150],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [ 6,8,10],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced']
}


In [None]:
# Larger alternative search space (3 * 4 * 3 * 3 * 5 * 2 * 2 * 2 = 4320
# combinations), wrapped in a one-element list.
# NOTE(review): no cell visible in this notebook uses param_grid2.
param_grid2 = [{
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [4,6,8,10,12],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced']
}]

In [None]:
# Disabled, much smaller grid kept for reference: 3 * 4 + 1 * 2 * 3 = 18 combinations.
# NOTE(review): the mean_test_score output pasted near the end of the notebook
# has 18 entries, which matches this grid rather than the active param_grid.
# param_grid = [
#     {'n_estimators': [3, 10, 30], 'max_features': [ 6, 8,10,12]},
#     {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
# ]

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation.
# n_jobs=-1 spreads the fits over all available CPU cores; it affects only
# how the work is scheduled, not which candidates are evaluated.
cv = GridSearchCV(rand_for, param_grid, cv=3, n_jobs=-1)

In [None]:
# Run the grid search on the training set (this is the expensive step).
cv.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
cv.best_params_

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 6,
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 100}

In [None]:
# Estimator built with the best parameters; GridSearchCV (refit left at its
# default) has already refitted it on the whole training set.
model = cv.best_estimator_

In [None]:
# Fit the selected estimator on the training set.
# NOTE(review): best_estimator_ should already be fitted on X_train by the
# grid search (refit defaults to True), so this second fit looks redundant —
# confirm before removing.
model.fit(X_train, y_train)

In [None]:
# Accuracy of the tuned random forest on the held-out test set.
y_pred_actual_random_forest_tuned = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred_actual_random_forest_tuned)
accuracy

0.8227513227513228

In [None]:
# Mean cross-validated score of every parameter combination the search tried.
mean_test_scores = cv.cv_results_['mean_test_score']
mean_test_scores

array([0.78274604, 0.78955379, 0.8168995 , 0.78844871, 0.79639283,
       0.8270955 , 0.76795246, 0.80094871, 0.8191201 , 0.77473937,
       0.79639283, 0.81457465, 0.75090701, 0.76797331, 0.75889283,
       0.79523561, 0.77133028, 0.78049416])

In [None]:
# Best mean cross-validated score found by the search.
best_mean_test_score = cv.best_score_
best_mean_test_score

0.8293682235195996