# In [None]:

import numpy as np
import pandas as pd
import seaborn as sb

# Load the raw data. The untouched frames are kept because the submission
# step at the end of the script reads EmployeeNo from test_df.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Working frames: independent of the raw ones, minus the columns that are
# not used as features.
dropped_columns = ['EmployeeNo', 'State_Of_Origin']
train = train_df.drop(columns=dropped_columns)
test = test_df.drop(columns=dropped_columns)


# Encoding categorical variables
# LabelEncoder turns category values into integer codes; OneHotEncoder is
# imported here and instantiated further down in the script.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Single encoder instance that is re-fitted for each column it is applied to.
labelencoder = LabelEncoder()


def change(y):
    """Map the 'More than 5' bucket to 6; return any other value unchanged."""
    return 6 if y == 'More than 5' else y

# Convert the employer count to integers. `change` maps the open-ended
# 'More than 5' bucket to 6 so the whole column can be cast to int32.
train['No_of_previous_employers'] = train['No_of_previous_employers'].apply(change).astype("int32")
test['No_of_previous_employers'] = test['No_of_previous_employers'].apply(change).astype("int32")

# Ordinal ranking of the qualification levels (higher = more advanced).
qualification_rank = {
    'MSc, MBA and PhD': 3,
    'First Degree or HND': 2,
    'Non-University Education': 1,
}

# filling the missing values
# Values that are missing (or not in the mapping) become NaN after `map` and
# are then set to 0. The result is assigned back to the frame instead of
# calling `fillna(..., inplace=True)` on a column selection: that form is
# chained assignment and does not update the frame under pandas Copy-on-Write.
train['Qualification'] = train['Qualification'].map(qualification_rank).fillna(0)
test['Qualification'] = test['Qualification'].map(qualification_rank).fillna(0)

# Print column dtypes and non-null counts of the training frame.
train.info()

# categories to encode
columns = [
    'Gender',
    'Division',
    'Foreign_schooled',
    'Channel_of_Recruitment',
    'Marital_Status',
    'Past_Disciplinary_Action',
    'Previous_IntraDepartmental_Movement',
]

# Fit the encoder once per column on the values of BOTH frames, then apply
# that single fitted mapping to train and test. Fitting separately on each
# frame would give the same category different integer codes whenever the
# two frames do not contain exactly the same set of values.
for col in columns:
    labelencoder.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = labelencoder.transform(train[col])
    test[col] = labelencoder.transform(test[col])

# Separate the target column from the training features.
X_Data = train.drop('Promoted_or_Not', axis=1)
y_Data = train['Promoted_or_Not']

# Columns that are passed through unchanged; every other remaining column
# is one-hot encoded.
num_cols = ['Trainings_Attended', 'Last_performance_score', 'Year_of_birth',
            'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average']

X_encode_D = X_Data[num_cols]
X_encode_t = test[num_cols]

X_Data = X_Data.drop(num_cols, axis=1)
# `test` itself is trimmed in place, leaving only the columns to be encoded.
test.drop(num_cols, axis=1, inplace=True)

# Fit the one-hot encoder on the training frame only and reuse it for the
# test frame, so both matrices share the same column layout. Re-fitting on
# the test frame would derive the columns from the test categories and they
# would no longer be guaranteed to line up with the training columns.
# handle_unknown='ignore' encodes a category unseen during fit as all zeros
# instead of raising.
onehotencoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
X_Data = np.hstack([
    onehotencoder.fit_transform(X_Data).toarray(),
    X_encode_D.to_numpy(),
])

# Positional stacking (rather than an index-based join) keeps each row's
# encoded and pass-through parts together regardless of the frame's index.
X_test_Data = np.hstack([
    onehotencoder.transform(test).toarray(),
    X_encode_t.to_numpy(),
])

# Balancing the dataset using ADASYN
# NOTE(review): resampling happens before the train/validation split below,
# so synthetic samples derived from validation rows can end up in the
# training split; the hold-out accuracy is likely optimistic. Confirm whether
# this is intended.
from imblearn.over_sampling import ADASYN
# random_state is fixed so repeated runs produce the same resampled data,
# matching the fixed seeds used elsewhere in this script.
sm = ADASYN(random_state=0)
# `fit_resample` replaces the older `fit_sample`, which has been removed
# from imbalanced-learn.
X_Data, y_Data = sm.fit_resample(X_Data, y_Data)

#from sklearn.preprocessing import StandardScaler
#scale_Data = StandardScaler().fit(X_Data)
#X_Data = scale_Data.transform(X_Data)
#
#scale_te = StandardScaler().fit(X_test_Data)
#X_test_Data = scale_te.transform(X_test_Data)

from sklearn.model_selection import train_test_split

# Hold out 20% of the data for validation; the seed is fixed so the split is
# reproducible.
split_parts = train_test_split(
    X_Data,
    y_Data,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

#from sklearn.feature_selection import SelectFromModel
#selector = SelectFromModel(xgb.XGBClassifier(n_estimators = 100, random_state = 0, n_jobs = -1))
#selector.fit(X_train, y_train)
#selector.get_support()
#
#X_train_opt = selector.transform(X_train)
#X_test_opt = selector.transform(X_test)
#X_test_Data = selector.transform(X_test_Data)

import xgboost as xgb
from sklearn.metrics import accuracy_score

def run_xgboost(X_train, X_test, y_train, y_test):
    """Fit an XGBoost classifier and print its accuracy on the held-out data.

    The model is trained on (X_train, y_train) with 100 estimators and a
    fixed seed, then scored on (X_test, y_test). Nothing is returned; the
    accuracy is only printed.
    """
    model = xgb.XGBClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    model.fit(X_train, y_train)
    score = accuracy_score(y_test, model.predict(X_test))
    print('Accuracy: ', score)

from sklearn.feature_selection import RFE

# The estimator inside RFE and the final model use the same settings.
xgb_params = {'n_estimators': 100, 'random_state': 0, 'n_jobs': -1}

# Recursive feature elimination down to 41 features, fitted on the training
# split and then applied to every matrix that goes through the model.
sel = RFE(xgb.XGBClassifier(**xgb_params), n_features_to_select=41)
sel.fit(X_train, y_train)
X_train_OPT = sel.transform(X_train)
X_test_OPT = sel.transform(X_test)
X_test_Data_used = sel.transform(X_test_Data)

# Final model: trained on the reduced training split, then used to predict
# the competition test set.
classifier = xgb.XGBClassifier(**xgb_params)
classifier.fit(X_train_OPT, y_train)
y_pred = classifier.predict(X_test_Data_used)

#for index in range(1, 42):
#    sel = RFE(xgb.XGBClassifier(n_estimators = 100, random_state = 0, n_jobs = -1), n_features_to_select = index)
#    sel.fit(X_train, y_train)
#    X_train_OPT = sel.transform(X_train)
#    X_test_OPT = sel.transform(X_test)
#    print('Selected feature: ', index)
#    run_xgboost(X_train_OPT, X_test_OPT, y_train, y_test)
#    print()
#    


from pathlib import Path

# Fill the submission template with the employee ids from the raw test frame
# and the predictions of the final model. Bracket assignment is used so the
# columns are always set as columns, never as plain attributes.
sample = pd.read_csv('sample_submission2.csv')
sample['EmployeeNo'] = test_df['EmployeeNo']
sample['Promoted_or_Not'] = y_pred

# Build the output path from components instead of a backslash string:
# '\s' is an invalid escape sequence in a normal string literal, and a
# backslash-separated path only works on Windows.
submission_path = Path('submissions') / 'submission32.csv'
# Create the output directory if it does not exist yet so to_csv cannot fail
# on a missing folder.
submission_path.parent.mkdir(parents=True, exist_ok=True)
sample.to_csv(submission_path, index=False)
