In [1]:
%pylab inline
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import FeatureUnion, Pipeline 
from joblib import dump,load
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold

pd.set_option('display.max_rows', 3000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
df_h1b = pd.read_csv('data/h1b_kaggle.csv', sep=',')
df_h1b.head(5)
df_h1b.shape

(3002458, 11)

In [5]:
df = df_h1b.drop(df_h1b.columns[0], axis='columns')

In [6]:
#Dropping the latitude and longitude columns 
df.drop(['lon','lat'], axis=1, inplace=True)

In [7]:
#making multivariate classification into binary classification for case status column (target)
warnings.filterwarnings("ignore")
df.CASE_STATUS[df['CASE_STATUS']=='REJECTED'] = 'DENIED'
df.CASE_STATUS[df['CASE_STATUS']=='INVALIDATED'] = 'DENIED'
df.CASE_STATUS[df['CASE_STATUS']=='PENDING QUALITY AND COMPLIANCE REVIEW - UNASSIGNED'] = 'DENIED'
df.CASE_STATUS[df['CASE_STATUS']=='CERTIFIED-WITHDRAWN'] = 'CERTIFIED'

In [8]:
df = df.drop(df[df.CASE_STATUS == 'WITHDRAWN'].index)


In [9]:
## Storing non null in df w.r.t. case status
df = df[df['CASE_STATUS'].notnull()]
print(df['CASE_STATUS'].value_counts())

CERTIFIED    2818282
DENIED         94364
Name: CASE_STATUS, dtype: int64


In [10]:
#replacing employer name,  job title, full time and soc name with mode 
df['EMPLOYER_NAME'] = df['EMPLOYER_NAME'].fillna(df['EMPLOYER_NAME'].mode()[0])
df['JOB_TITLE'] = df['JOB_TITLE'].fillna(df['JOB_TITLE'].mode()[0])
df['FULL_TIME_POSITION'] = df['FULL_TIME_POSITION'].fillna(df['FULL_TIME_POSITION'].mode()[0])
df['SOC_NAME'] = df['SOC_NAME'].fillna(df['SOC_NAME'].mode()[0])

In [11]:
#converting worksite and extracting only the state from it
df['STATE'] = df.WORKSITE.str.split(',').str[-1]

In [12]:
df.drop(['WORKSITE'], axis=1, inplace=True)


In [13]:
#Dropping all rows with missing Prevailing Wage values 
df.dropna(inplace = True)

In [14]:
df.isnull().sum()

CASE_STATUS           0
EMPLOYER_NAME         0
SOC_NAME              0
JOB_TITLE             0
FULL_TIME_POSITION    0
PREVAILING_WAGE       0
YEAR                  0
STATE                 0
dtype: int64

In [15]:
df['EMPLOYER_NAME'] = df['EMPLOYER_NAME'].str.lower()


In [16]:
df.drop(['JOB_TITLE'], axis=1, inplace=True)

In [17]:
#Adding a new employer column : replacing top 10 employers and all others as 'others'
df['NEW_EMPLOYER'] = np.nan
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('infosys limited')] = 'infosys limited'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('tata consultancy services limited')] = 'tata consultancy services limited'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('wipro limited')] = 'wipro limited'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('deloitte consulting llp')] = 'deloitte consulting llp'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('ibm india private limited')] = 'ibm india private limited'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('accenture llp')] = 'accenture llp'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('microsoft corporation')] = 'microsoft corporation'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('hcl america, inc')] = 'hcl america, inc'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('ernst & young u.s. llp')] = 'ernst & young u.s. llp'
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('cognizant technology solutions u.s. corporation')] = 'cognizant technology solutions u.s. corporation'
df['NEW_EMPLOYER']= df.NEW_EMPLOYER.replace(np.nan, 'others', regex=True)

In [18]:
#Dropping EMPLOYER_NAME 
df.drop(['EMPLOYER_NAME'], axis=1, inplace=True)

In [19]:
df['OCCUPATION'] = np.nan
df['SOC_NAME'] = df['SOC_NAME'].str.lower()
df.OCCUPATION[df['SOC_NAME'].str.contains('computer','programmer')] = 'computer and mathematical occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('software','web developer')] = 'computer and mathematical occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('database')] = 'computer and mathematical occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('math','statistic')] = 'computer and mathematical occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('predictive model','stats')] = 'computer and mathematical occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('teacher','linguist')] = ' educational instruction and library occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('professor','Teach')] = 'educational instruction and library occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('school principal')] = 'educational instruction and library occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('medical','doctor')] = 'medical or healthcare support occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('physician','dentist')] = 'medical or healthcare support occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('Health','Physical Therapists')] = 'medical or healthcare support occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('surgeon','nurse')] = 'medical or healthcare support occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('psychiatr')] = 'medical or healthcare support occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('chemist','physicist')] = 'advanced sciences'
df.OCCUPATION[df['SOC_NAME'].str.contains('biology','scientist')] = 'advanced sciences'
df.OCCUPATION[df['SOC_NAME'].str.contains('biologi','clinical research')] = 'advanced sciences'
df.OCCUPATION[df['SOC_NAME'].str.contains('public relation','manage')] = 'management occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('management','operation')] = 'management occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('chief','plan')] = 'management occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('executive')] = 'management occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('advertis','marketing')] = 'marketing occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('promotion','market research')] = 'marketing occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('business','business analyst')] = 'business and financial occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('business systems analyst')] = 'business and financial occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('accountant','finance')] = 'business and financial occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('financial')] = 'business and financial occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('engineer','architect')] = 'architecture and engineering occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('surveyor','carto')] = 'architecture and engineering occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('technician','drafter')] = 'architecture and engineering occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('information security','information tech')] = 'architecture and engineering occupations'
df['OCCUPATION']= df.OCCUPATION.replace(np.nan, 'others', regex=True)

In [20]:
## capping prevailing wage to get rid of outliers and spiking data points
df.loc[df.PREVAILING_WAGE < 34029, 'PREVAILING_WAGE']= 34029
df.loc[df['PREVAILING_WAGE'] > 138703, 'PREVAILING_WAGE']= 138703

In [21]:
#dropping SOC_NAME and YEAR 
df.drop(['SOC_NAME'], axis=1, inplace= True)
df.drop(['YEAR'], axis=1, inplace= True)

In [22]:
#labelling categorical variables as 'category' datatype 
df[['FULL_TIME_POSITION','NEW_EMPLOYER','OCCUPATION','STATE']] = df[['FULL_TIME_POSITION','NEW_EMPLOYER','OCCUPATION','STATE']].apply(lambda x: x.astype('category'))

### Approach to try:
          Using Stratified K-cross fold validation with SMOTE for each iteration of the training set

In [23]:
sample_df = df.loc[:10000]

In [24]:
X = sample_df.drop('CASE_STATUS', axis=1)
y = sample_df['CASE_STATUS']

In [25]:
random_seed = 49
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.30, random_state=random_seed)

In [26]:
#One Hot encoding the dataset

In [27]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant'))])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [28]:
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['category']).columns

In [29]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [30]:
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [31]:
svm_pipeline = Pipeline(steps = [['classifier', svm.SVC(probability = True)]])

In [41]:
#creating new svm pipeline post GridSearch
svm_gridsearch_pipeline = Pipeline(steps = [['SVM classifier after gridsearch', svm.SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)]])

In [33]:
#encoding x_train. 
X_train_encoded = preprocessing_pipeline.fit_transform(X_train)
X_train_encoded

<6706x68 sparse matrix of type '<class 'numpy.float64'>'
	with 33530 stored elements in Compressed Sparse Row format>

In [34]:
dump(preprocessing_pipeline, 'final_preprocessing_pipeline.joblib')

['final_preprocessing_pipeline.joblib']

In [35]:
#encoding y 
class_mapping = {'CERTIFIED':0, 'DENIED':1}
y_train_encoded = y_train.map(class_mapping)
y_test_encoded = y_test.map(class_mapping)

In [36]:
#Using SMOTE (Oversampling) with Stratified K-cross fold validation. SMOTE is applied inside each iteration
#Reference : https://github.com/lumiata/tech_blog/blob/master/Cross_Validation_Imbalanced_Datasets/cross-validation.ipynb

In [37]:
random_seed = 63445
kf = StratifiedKFold(n_splits=5, random_state=random_seed)

In [38]:
cross_val_f1_score_lst = []
cross_val_accuracy_lst = []
cross_val_recall_lst = []
cross_val_precision_lst = []

In [39]:
for train_index_ls, validation_index_ls in kf.split(X_train_encoded, y_train_encoded):
    # keeping validation set apart and oversampling in each iteration using smote 
    train, validation = X_train_encoded[train_index_ls], X_train_encoded[validation_index_ls]
    target_train, target_val = y_train_encoded.iloc[train_index_ls], y_train_encoded.iloc[validation_index_ls]
    sm = SMOTE(random_state=random_seed)
    X_train_res, y_train_res = sm.fit_sample(train, target_train)
    print (X_train_res.shape, y_train_res.shape)

(10178, 68) (10178,)
(10178, 68) (10178,)
(10180, 68) (10180,)
(10180, 68) (10180,)
(10180, 68) (10180,)


In [40]:
#iloc error fixed by : https://stackoverflow.com/questions/53183433/attributeerror-numpy-ndarray-object-has-no-attribute-iloc

In [56]:
#svm_pipeline.fit(X_train_res, y_train_res)

Pipeline(memory=None,
         steps=[['classifier',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1, probability=True,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False)]],
         verbose=False)

In [42]:
svm_gridsearch_pipeline.fit(X_train_res, y_train_res)

Pipeline(memory=None,
         steps=[['SVM classifier after gridsearch',
                 SVC(C=1000, break_ties=False, cache_size=200,
                     class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=1,
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False)]],
         verbose=False)

In [43]:
#dump(svm_pipeline, 'svm_pipeline.joblib')

In [44]:
#dumping the SVM post gridsearch pipeline
dump(svm_gridsearch_pipeline, 'svm_final_pipeline.joblib')

['svm_final_pipeline.joblib']

In [45]:
X_test.head()

Unnamed: 0,FULL_TIME_POSITION,PREVAILING_WAGE,STATE,NEW_EMPLOYER,OCCUPATION
6998,Y,99154.0,NEW YORK,others,management occupations
6999,N,47500.0,MARYLAND,others,management occupations
7000,N,61818.0,ILLINOIS,others,management occupations
7001,Y,131830.0,NEW YORK,others,management occupations
7002,Y,99153.6,NEW YORK,others,management occupations


In [46]:
enc = load('final_preprocessing_pipeline.joblib')

In [47]:
#encoding the new test data
X_test_encoded = enc.transform(X_test)

X_test_encoded

<2875x68 sparse matrix of type '<class 'numpy.float64'>'
	with 11816 stored elements in Compressed Sparse Row format>

In [48]:
svm_clf = load('svm_final_pipeline.joblib')

In [49]:
y_pred = svm_clf.predict(X_test_encoded)

In [51]:
#Converting y_test_encoded to matrix
test_y = y_test_encoded.as_matrix()

In [52]:
#predicting results
print('Accuracy Score : ' + str(accuracy_score(test_y,y_pred)))
print('Precision Score : ' + str(precision_score(test_y,y_pred)))
print('Recall Score : ' + str(recall_score(test_y,y_pred)))
print('F1 Score : ' + str(f1_score(test_y,y_pred)))

cf_matrix = confusion_matrix(test_y,y_pred)

Accuracy Score : 0.03408695652173913
Precision Score : 0.019088016967126194
Recall Score : 0.9642857142857143
F1 Score : 0.03743500866551127


In [53]:
# testing on 1 fold of validation set
validation_preds = svm_clf.predict(validation)
cross_val_recall_lst.append(recall_score(target_val, validation_preds))
cross_val_accuracy_lst.append(accuracy_score(target_val, validation_preds))
cross_val_precision_lst.append(precision_score(target_val, validation_preds))
cross_val_f1_score_lst.append(f1_score(target_val, validation_preds))
print ('Cross validated accuracy: {}'.format(np.mean(cross_val_accuracy_lst)))
print ('Cross validated recall score: {}'.format(np.mean(cross_val_recall_lst)))
print ('Cross validated precision score: {}'.format(np.mean(cross_val_precision_lst)))
print ('Cross validated f1_score: {}'.format(np.mean(cross_val_f1_score_lst)))

Cross validated accuracy: 0.35719612229679343
Cross validated recall score: 0.5217391304347826
Cross validated precision score: 0.04161849710982659
Cross validated f1_score: 0.07708779443254818


# TRYING GRIDSEARCH

In [46]:
from sklearn.model_selection import GridSearchCV

In [52]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  

In [53]:
grid = GridSearchCV(svm.SVC(),param_grid,refit=True,verbose=2)

In [54]:
grid.fit(X_train_res, y_train_res)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=   5.9s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.8s remaining:    0.0s


[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=   6.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=   6.3s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=   6.5s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=   6.5s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=0.1, gamma=0.1, kernel=rbf, total=   5.9s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=0.1, gamma=0.1, kernel=rbf, total=   6.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=0.1, gamma=0.1, kernel=rbf, total=   6.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ...................... C=10, gamma=0.1, kernel=rbf, total=  10.5s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   8.4s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   9.3s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   9.2s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=  10.3s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   8.6s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .................... C=10, gamma=0.001, kernel=rbf, total=   7.4s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] .................. C=1000, gamma=0.001, kernel=rbf, total=   7.0s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] .................. C=1000, gamma=0.001, kernel=rbf, total=   7.6s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] .................. C=1000, gamma=0.001, kernel=rbf, total=   7.2s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  16.2s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  16.2s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  15.6s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  18.5s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 18.0min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [55]:
# print best parameter after tuning 
print(grid.best_params_) 
  
# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 

{'C': 1000, 'gamma': 1, 'kernel': 'rbf'}
SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


In [56]:
grid_predictions = grid.predict(X_test_encoded)

In [57]:
#Converting y_test_encoded to matrix
test_y_2 = y_test_encoded.as_matrix()

In [74]:
from sklearn.metrics import classification_report

In [60]:
print(classification_report(test_y_2, grid_predictions)) 

              precision    recall  f1-score   support

           0       0.96      0.02      0.03      2819
           1       0.02      0.96      0.04        56

    accuracy                           0.03      2875
   macro avg       0.49      0.49      0.03      2875
weighted avg       0.94      0.03      0.03      2875



In [62]:
#predicting results
print('Accuracy Score : ' + str(accuracy_score(test_y_2,grid_predictions)))
print('Precision Score : ' + str(precision_score(test_y_2,grid_predictions)))
print('Recall Score : ' + str(recall_score(test_y_2,grid_predictions)))
print('F1 Score : ' + str(f1_score(test_y_2,grid_predictions)))

cf_matrix = confusion_matrix(test_y_2,grid_predictions)

Accuracy Score : 0.03408695652173913
Precision Score : 0.019088016967126194
Recall Score : 0.9642857142857143
F1 Score : 0.03743500866551127


In [41]:
#### Trying XGBoost with grid search usinf recall factor

In [42]:
import xgboost

In [77]:
parameters = [{'n_estimators': [10, 100]},
              {'learning_rate': [0.1, 0.01, 0.5]}]

In [87]:
grad_boost_model =xgboost.XGBClassifier(max_features='sqrt', subsample=0.8, random_state=10)

In [88]:
grid_search_recall = GridSearchCV(estimator = grad_boost_model, param_grid = parameters)

In [89]:
grid_search_recall.fit(X_train_res, y_train_res)

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, max_features='sqrt',
                                     min_child_weight=None, missing=nan,
                                     monotone_c...
                                     objective='binary:logistic',
                                     random_state=10, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=0.8, tree_method=None,


In [90]:
grid_search_recall_res = grid_search_recall.predict(X_test_encoded)

In [91]:
#Converting y_test_encoded to matrix
test_y_2 = y_test_encoded.as_matrix()

In [92]:
print(classification_report(test_y_2, grid_search_recall_res)) 

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2819
           1       0.12      0.12      0.12        56

    accuracy                           0.96      2875
   macro avg       0.55      0.55      0.55      2875
weighted avg       0.97      0.96      0.97      2875



In [93]:
#predicting results
print('Accuracy Score : ' + str(accuracy_score(test_y_2,grid_search_recall_res)))
print('Precision Score : ' + str(precision_score(test_y_2,grid_search_recall_res)))
print('Recall Score : ' + str(recall_score(test_y_2,grid_search_recall_res)))
print('F1 Score : ' + str(f1_score(test_y_2,grid_search_recall_res)))

cf_matrix = confusion_matrix(test_y_2,grid_search_recall_res)

Accuracy Score : 0.9645217391304348
Precision Score : 0.11666666666666667
Recall Score : 0.125
F1 Score : 0.1206896551724138
