In [None]:
# Return a dataframe holding only the categorical columns
def check_categorical(dataset):
    '''
    Keep only the ``object`` and ``category`` typed columns of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The columns of ``dataset`` whose dtype is ``object`` or ``category``,
        in their original order.
    '''
    wanted_dtypes = ['object', 'category']
    return dataset[dataset.select_dtypes(include=wanted_dtypes).columns]
# Display the categorical columns of `dataset` (a notebook-level frame defined
# outside the cells shown here).
check_categorical(dataset)

In [None]:
# Return a dataframe holding only the numerical columns
def check_numerical(dataset):
    '''
    Keep only the numeric columns of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The columns of ``dataset`` selected by
        ``select_dtypes(include=np.number)``, in their original order.
    '''
    return dataset[dataset.select_dtypes(include=np.number).columns]
# Display the numerical columns of the notebook-level `dataset` frame.
check_numerical(dataset)

In [None]:
# Return the percentage of missing values in each column
def missing_values(dataframe):
    '''
    Compute, per column, the share of null entries as a percentage.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by the column labels of ``dataframe``; each value is
        ``null count / row count * 100``.
    '''
    null_counts = dataframe.isnull().sum()
    row_count = dataframe.shape[0]
    return null_counts / row_count * 100
# Display the per-column missing-value percentages for `dataset`.
missing_values(dataset)

In [None]:
# Print the skew() value of each continuous feature
def check_skewness(dataset):
    '''
    Print one line per numeric column: the column label followed by its skew.

    The numeric columns are obtained through ``check_numerical``. Nothing is
    returned; the values are only written to standard output.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose numeric columns are inspected.
    '''
    numeric_part = check_numerical(dataset)
    for column_name in numeric_part.columns:
        skew_value = numeric_part[column_name].skew()
        print(column_name, skew_value)
# Print the skewness of every numeric column in `dataset`.
check_skewness(dataset)

In [None]:
# Reduce right skewness with log1p; add a condition for left skewness if needed
def remove_skewness(dataset):
    '''
    Return a copy of ``dataset`` with strongly right-skewed numeric columns
    replaced by their ``log1p`` transform.

    A numeric column is transformed when its skew is greater than 1 and all
    of its values are greater than -1 (``log1p`` is undefined at or below -1
    and would fill the column with NaN / -inf). Every other column is copied
    unchanged.

    The input frame is never modified, so re-running the calling cell does
    not apply the transform a second time.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to transform.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``dataset``.
    '''
    transformed = dataset.copy()
    for column in transformed.select_dtypes(include=np.number).columns:
        values = transformed[column]
        # min() skips NaN, so missing values do not block the transform.
        if values.skew() > 1 and values.min() > -1:
            transformed[column] = np.log1p(values)
    return transformed
# Apply the skewness reduction, then print the skewness of the result to check
# the effect of the transform.
transformed_dataset = remove_skewness(dataset)
check_skewness(transformed_dataset) 

In [None]:
# Plot distribution plots for the continuous features
def plot_distplots(dataset):
    '''
    Draw a distribution plot for every numeric column, two per figure.

    Columns are taken from ``check_numerical(dataset)`` and laid out side by
    side in pairs. When the number of columns is odd, the last column gets a
    figure of its own that goes through the same layout and ``plt.show()``
    steps as the paired figures.

    NOTE(review): ``sns.distplot`` is deprecated since seaborn 0.11 in favour
    of ``histplot`` / ``displot``; switch once the seaborn version used by
    this notebook is confirmed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose numeric columns are plotted.
    '''
    numerical_columns = check_numerical(dataset).columns.tolist()
    for start in range(0, len(numerical_columns), 2):
        pair = numerical_columns[start:start + 2]
        # squeeze=False keeps `axes` two-dimensional even for a single panel.
        fig, axes = plt.subplots(1, len(pair), figsize=(5 * len(pair), 4), squeeze=False)
        for ax, column in zip(axes[0], pair):
            sns.distplot(dataset[column], color='#ffa600', ax=ax)
        fig.tight_layout()
        plt.show()
# Plot the distribution of every numeric column in `dataset`.
plot_distplots(dataset)

In [None]:
# Bar plots for categorical features
def plot_categorical_columns(dataframe):
    '''
    Draw a normalised value-count bar chart for every categorical column,
    two per figure.

    Both ``object`` and ``category`` typed columns are plotted. When the
    number of columns is odd, the last column gets a figure of its own that
    goes through the same layout and ``plt.show()`` steps as the paired
    figures.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose categorical columns are plotted.
    '''
    categorical_columns = dataframe.select_dtypes(include=['object', 'category']).columns.tolist()
    for start in range(0, len(categorical_columns), 2):
        pair = categorical_columns[start:start + 2]
        # squeeze=False keeps `axes` two-dimensional even for a single panel.
        fig, axes = plt.subplots(1, len(pair), figsize=(5 * len(pair), 4), squeeze=False)
        for ax, column in zip(axes[0], pair):
            dataframe[column].value_counts(normalize=True).plot(kind='bar', ax=ax)
            ax.set_title(column)
        fig.tight_layout()
        plt.show()
        
        
        
        
# Draw the categorical bar charts. plot_categorical_columns has no return
# statement, so `plot` is bound to None.
plot = plot_categorical_columns(dataset)

In [None]:
# Bivariate countplots for categorical features vs categorical target
def bivariate_analysis_categorical(dataframe,target):
    '''
    Draw a count plot of every non-numeric feature, split by ``target``.

    The target column is removed from the features before plotting. Its name
    is taken from ``target.name`` and falls back to ``'Attrition'`` when the
    target carries no name; a missing column is ignored. Plots are laid out
    two per figure, and an odd trailing column gets a figure of its own.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the features (and possibly the target column).
    target : pandas.Series
        Class labels used as the ``hue`` of every count plot.
    '''
    target_name = getattr(target, 'name', None)
    if target_name is None:
        target_name = 'Attrition'
    # Keyword form: the positional axis argument of drop() is not accepted by pandas 2.
    dataframe = dataframe.drop(columns=[target_name], errors='ignore')
    categorical_columns = dataframe.select_dtypes(exclude=np.number).columns.tolist()
    for start in range(0, len(categorical_columns), 2):
        pair = categorical_columns[start:start + 2]
        # squeeze=False keeps `axes` two-dimensional even for a single panel.
        fig, axes = plt.subplots(1, len(pair), figsize=(7.5 * len(pair), 5), squeeze=False)
        for ax, column in zip(axes[0], pair):
            # Pass the column as `x=`; recent seaborn reserves the first
            # positional argument for `data`.
            sns.countplot(x=dataframe[column], hue=target, data=dataframe, ax=ax)
            ax.tick_params(axis='x', labelrotation=90)
        fig.tight_layout()
        plt.show()


# Count plots of the non-numeric features, split by the 'Attrition' column.
bivariate_analysis_categorical(dataset,dataset['Attrition'])

In [None]:
# Bivariate plots for continuous features vs categorical target
def bivariate_analysis_numerical(dataframe, target='Attrition'):
    '''
    Draw one bar plot per numeric feature, grouped by the target column.

    Each figure shows ``target`` on the x axis and the feature on the y axis
    (seaborn's ``barplot``). If the target column is itself numeric it is
    skipped, rather than being plotted against itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both the features and the target column.
    target : str, default 'Attrition'
        Label of the target column in ``dataframe``.
    '''
    numerical_columns = check_numerical(dataframe).columns
    for column in numerical_columns:
        if column == target:
            continue
        plt.figure(figsize=(10,5))
        sns.barplot(x=dataframe[target], y=dataframe[column])
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()


# Bar plots of every numeric feature in `dataset`, grouped by 'Attrition'.
bivariate_analysis_numerical(dataset)

In [None]:
# Return the percentage of each class in the categorical target
def class_imbalance(target):
    '''
    Compute the share of each distinct value of ``target`` as a percentage.

    Parameters
    ----------
    target : pandas.Series
        Class labels.

    Returns
    -------
    pandas.Series
        Indexed by class label, in ``value_counts`` order (most frequent
        first); each value is ``class count / total count * 100``.
    '''
    counts = target.value_counts()
    return counts / counts.sum() * 100

# Display the class percentages of the 'Attrition' column.
class_imbalance(dataset['Attrition'])

In [None]:
# Classification models, vanilla/baseline
def run_model(predictors,target, model):
    '''
    Train ``model`` on an 80/20 split and evaluate it on the held-out part.

    The split uses ``random_state=42``. The module-level ``scaler`` is fitted
    on the training part only and then applied to the test part. F1, recall
    and ROC AUC are printed and the ROC curve is plotted.

    ROC AUC and the ROC curve are computed from continuous scores: the second
    column of ``predict_proba`` when the model has it, otherwise
    ``decision_function``. Hard ``predict`` labels are used only as a last
    resort, because they reduce the curve to a single threshold. F1 and
    recall are computed from the hard labels.

    NOTE: a later cell of this notebook defines a regression function that is
    also called ``run_model``; whichever cell ran last wins.

    Parameters
    ----------
    predictors : array-like or pandas.DataFrame
        Feature matrix.
    target : array-like or pandas.Series
        Binary class labels (the metrics are called with their binary
        defaults).
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``.

    Returns
    -------
    float
        ROC AUC on the test split.
    '''
    X_train,X_test,y_train,y_test = train_test_split(predictors,target,test_size=0.2,random_state=42)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Column 1 of predict_proba belongs to model.classes_[1], the class that
    # roc_auc_score / roc_curve treat as positive for binary targets.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    auc = roc_auc_score(y_test, y_score)
    print('Classification Metrics:')
    print('F1_Score',f1_score(y_test,y_pred))
    print('Recall Score',recall_score(y_test,y_pred))
    false_positive_rate, true_positive_rate, _ = roc_curve(y_test, y_score)
    print('ROC_AUC_SCORE is',auc)

    plt.plot(false_positive_rate, true_positive_rate)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curve')
    plt.show()
    return auc

# # Predictors in case when target is the last column
# NOTE(review): `dataframe` is not defined in the cells visible here (the other
# cells use `dataset`) — confirm it exists before running on a fresh kernel.
X = dataframe.iloc[:,:-1]

# # Target in case where Target is the last column
y = dataframe.iloc[:,-1]

# Choosing the models. If you want to specify additional models, kindly specify them as a key-value pair as shown below.
# Keys are display names; values are estimator classes, instantiated below with
# their default arguments.
models = {'Logistic Regression':LogisticRegression,'Decision Tree':DecisionTreeClassifier,'Random Forest': RandomForestClassifier,'XGBoost':XGBClassifier,'Gradient Boosting':GradientBoostingClassifier}

# `i` is a (display name, estimator class) pair and remains defined at module
# level after the loop finishes.
for i in models.items():
    # run model
    model = i[1]()
    auc = run_model(X, y, model) # train and returns AUC test score
    print('AUC Score = %.2f' %(auc*100) +' %\nOn Model - \n'+str(i[0]))
    print('===='*20)

In [None]:
# Regression models - vanilla/baseline
def run_model(X,y,model):
    '''
    Scale the data and score ``model`` with 3-fold cross-validation.

    ``X`` is scaled with the module-level ``x_scaler`` and ``y`` with the
    module-level ``y_scaler``. A one-dimensional target (for example a
    pandas Series) is reshaped to a single column for the scaler, because
    scikit-learn scalers only accept 2-D input, and flattened again before
    it reaches the model. Targets that are already 2-D are passed through
    unchanged.

    NOTE(review): both scalers are fitted on the full data before the
    cross-validation split, so fold statistics leak into the held-out folds.
    NOTE(review): 'neg_mean_squared_log_error' needs non-negative targets —
    confirm that ``y_scaler`` cannot produce negative values.
    NOTE: an earlier cell of this notebook defines a classification function
    that is also called ``run_model``; whichever cell ran last wins.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Regression target.
    model : estimator
        Unfitted regressor.

    Returns
    -------
    tuple
        ``(rmsle, y_pred)``: the square root of the absolute mean of the
        per-fold negative MSLE scores, and the cross-validated predictions
        (in the scaled target space).
    '''
    X_transform = x_scaler.fit_transform(X)
    if np.ndim(y) == 1:
        # Scalers expect (n_samples, n_features); estimators expect a flat target.
        y_transform = np.ravel(y_scaler.fit_transform(np.asarray(y).reshape(-1, 1)))
    else:
        y_transform = y_scaler.fit_transform(y)
    y_pred = cross_val_predict(model,X_transform,y_transform,cv=3)
    neg_msle_scores = cross_val_score(model,X_transform,y_transform,cv=3,scoring='neg_mean_squared_log_error')
    return np.sqrt(abs(np.mean(neg_msle_scores))),y_pred

# # Predictors in case when target is the last column
# NOTE(review): `dataframe` is not defined in the cells visible here (the other
# cells use `dataset`) — confirm it exists before running on a fresh kernel.
X = dataframe.iloc[:,:-1]

# # Target in case where Target is the last column
y = dataframe.iloc[:,-1]

# Choosing the models. If you want to specify additional models, kindly specify them as a key-value pair as shown below.
# Keys are display names; values are estimator classes, instantiated below with
# their default arguments. This rebinds the `models`, `X` and `y` names used by
# the classification cell.
models = {'Linear Regression':LinearRegression,'Ridge':Ridge,'Lasso': Lasso,'Decision Tree':DecisionTreeRegressor, 'Random Forest':RandomForestRegressor,'SVR':SVR,'XGBoost':XGBRegressor}

# `i` is a (display name, estimator class) pair and remains defined at module
# level after the loop finishes.
for i in models.items():
    # run model
    model = i[1]()
    metric,y_predicted = run_model(X, y, model) # cross-validates and returns (RMSLE, predictions)
    print('RMSLE Score= '+str(metric) +'\nOn Model '+str(i[0]))
    print('**'*20)

In [None]:
# Feature selection using RFE, given the number of features and a model class
def feature_selection(predictors,target,number_of_features,model):
    '''
    Select features with recursive feature elimination (RFE).

    ``model`` is instantiated with its default arguments and wrapped in
    ``RFE``. The labels of the features ranked 1 are printed under the
    estimator's class name and also returned.

    Parameters
    ----------
    predictors : pandas.DataFrame
        Feature matrix; its column labels name the features.
    target : array-like
        Target values passed to ``RFE.fit``.
    number_of_features : int
        Number of features RFE should keep.
    model : type
        Estimator class (not an instance), called with no arguments.

    Returns
    -------
    list
        Column labels of the selected (rank 1) features.
    '''
    estimator = model()
    # Keyword form: current scikit-learn only accepts n_features_to_select by keyword.
    rfe = RFE(estimator, n_features_to_select=number_of_features)
    rfe = rfe.fit(predictors,target)
    feature_ranking = pd.Series(rfe.ranking_, index=predictors.columns)
    selected_features = feature_ranking[feature_ranking.values==1].index.tolist()
    # Name the estimator from the object itself instead of relying on a loop
    # variable left over from another cell.
    print('Features  to be selected for {} are:'.format(type(estimator).__name__))
    print(selected_features)
    print('===='*30)
    return selected_features

In [None]:
# Feature importance graph using Random Forest; select an appropriate number of features after visualization
def rfc_feature_selection(dataset,target):
    '''
    Fit a random forest and plot its feature importances as a bar chart.

    The forest (1000 trees, entropy criterion, ``random_state=0``) is fitted
    on the training part of a stratified 70/30 split (``random_state=42``),
    after scaling it with the module-level ``scaler``. The held-out part is
    not used.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature matrix; its column labels name the features.
    target : array-like or pandas.Series
        Class labels, also used to stratify the split.

    Returns
    -------
    pandas.Series
        Feature importances indexed by column label, sorted ascending.
    '''
    X_train, _, y_train, _ = train_test_split(dataset, target, test_size=0.3, random_state=42, stratify=target)
    X_train = scaler.fit_transform(X_train)
    rfc = RandomForestClassifier(n_estimators= 1000 , criterion = 'entropy' , random_state = 0, bootstrap = True)
    rfc.fit(X_train, y_train)
    rfc_importances = pd.Series(rfc.feature_importances_, index=dataset.columns).sort_values()
    rfc_importances.plot(kind='bar')
    plt.show()
    return rfc_importances