In [1]:
'''导入库'''
import pandas as pd
import sys
sys.path.append('..')

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from pfm.code.discretization import discretization
from sklearn.svm import LinearSVC

In [2]:
df_train = pd.read_csv('./dataset/pfm_train.csv')
df_test = pd.read_csv('./dataset/pfm_test.csv')

In [3]:
df_train.drop(
    ['Over18', 'StandardHours', 'EmployeeNumber'], axis=1, inplace=True)
df_test.drop(
    ['Over18', 'StandardHours', 'EmployeeNumber'], axis=1, inplace=True)

# 预测变量
target_var = 'Attrition'

# 连续变量
continuous_var = [
    'Age', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'
]
# 分类变量
categorical_var = [
    x for x in df_train.columns if x not in continuous_var and x != target_var
]
df_train[continuous_var] = df_train[continuous_var].astype(float)
df_test[continuous_var] = df_test[continuous_var].astype(float)
# 数据类型
# 数值型
numeric_var = [
    x for x in df_train.dtypes.index
    if df_train.dtypes[x] != 'object' and x != target_var
]
# 字符型
character_var = [
    x for x in df_train.dtypes.index
    if x not in numeric_var and x != target_var
]


# 将字符变量数值化
le = LabelEncoder()
for col in character_var:
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.fit_transform(df_test[col])
    
# 将连续变量归一化
#scaler = MinMaxScaler()
#pattern = scaler.fit(df_train[continuous_var])
#df_train[continuous_var] = scaler.fit_transform(df_train[continuous_var])

#df_test[continuous_var] = scaler.transform(df_test[continuous_var])

# 将连续变量离散化
df_Ent = discretization(pd.concat([df_train[continuous_var], df_train[target_var]], axis=1),
                        target_var)

基本信息增益是: 0.638642
Age的最佳划分点是 33.500000, 最大信息增益是 0.028930。
MonthlyIncome的最佳划分点是 2487.500000, 最大信息增益是 0.029878。
TotalWorkingYears的最佳划分点是 2.500000, 最大信息增益是 0.031816。
YearsAtCompany的最佳划分点是 2.500000, 最大信息增益是 0.025815。
YearsInCurrentRole的最佳划分点是 2.500000, 最大信息增益是 0.019440。
YearsSinceLastPromotion的最佳划分点是 7.500000, 最大信息增益是 0.006556。
YearsWithCurrManager的最佳划分点是 0.500000, 最大信息增益是 0.025104。


In [4]:
from sklearn.preprocessing import Binarizer
for key in df_Ent.keys():
    bin_encoder = Binarizer(df_Ent[key]['best_point'])
    df_train[key] = bin_encoder.fit_transform(df_train[key].values.reshape(-1, 1))
    df_test[key] = bin_encoder.fit_transform(df_test[key].values.reshape(-1, 1))

In [5]:
# 创建哑变量
predictor_var = [x for x in df_train.columns if x != target_var]
for col in predictor_var:
    dummy = pd.get_dummies(df_train[col], prefix=col)
    df_train = pd.concat([df_train, dummy], axis=1)
    df_train.drop([col], axis=1, inplace=True)
    
    dummy = pd.get_dummies(df_test[col], prefix=col)
    df_test = pd.concat([df_test, dummy], axis=1)
    df_test.drop([col], axis=1, inplace=True)

In [6]:
predictor_var = [x for x in df_train.columns if x != target_var]
X_train, X_Test, Y_train, Y_Test = train_test_split(
    df_train[predictor_var],
    df_train[target_var],
    test_size=0.3,
    random_state=1)

In [7]:
predictor_var = [x for x in df_train.columns if x != target_var]
def validation_func(clf, X_train, X_Test, Y_train, Y_Test):  
    ensemble = clf.fit(X_train, Y_train)
    print("Model Accuracy on training: ", accuracy_score(Y_train, ensemble.predict(X_train)), '\n')
    finalpred = ensemble.predict(X_Test)
    print("Model Accuracy: ", accuracy_score(Y_Test, finalpred), '\n')
    print(confusion_matrix(Y_Test, finalpred), '\n')
    print(classification_report(Y_Test, finalpred))
    return

In [8]:
clf = LogisticRegression()
validation_func(clf, X_train, X_Test, Y_train, Y_Test)

Model Accuracy:  0.881818181818 

[[263   5]
 [ 34  28]] 

             precision    recall  f1-score   support

          0       0.89      0.98      0.93       268
          1       0.85      0.45      0.59        62

avg / total       0.88      0.88      0.87       330



In [9]:
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1],
    'tol': [1e-6, 1e-5, 1e-4]
}
grid_search = GridSearchCV(clf, parameters, verbose=0, cv=10).fit(X_train, Y_train)
print(grid_search.best_score_)
print(grid_search.best_estimator_)

0.883116883117
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)


In [10]:
validation_func(grid_search.best_estimator_, X_train, X_Test, Y_train, Y_Test)

Model Accuracy:  0.863636363636 

[[268   0]
 [ 45  17]] 

             precision    recall  f1-score   support

          0       0.86      1.00      0.92       268
          1       1.00      0.27      0.43        62

avg / total       0.88      0.86      0.83       330



In [11]:
selector = SelectFromModel(LogisticRegression(penalty='l1'))
selector.fit_transform(X_train, Y_train)
supported = pd.DataFrame({'support':selector.get_support(indices=False)}, index=predictor_var)

In [12]:
grid_search = GridSearchCV(clf, parameters, verbose=0, cv=10)\
             .fit(X_train[supported[supported['support']==True].index.tolist()], Y_train)
print(grid_search.best_score_)
print(grid_search.best_estimator_)

0.883116883117
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)


In [13]:
validation_func(grid_search.best_estimator_,
                X_train[supported[supported['support']==True].index.tolist()],
                X_Test[supported[supported['support']==True].index.tolist()],
                Y_train,
                Y_Test)

Model Accuracy:  0.860606060606 

[[267   1]
 [ 45  17]] 

             precision    recall  f1-score   support

          0       0.86      1.00      0.92       268
          1       0.94      0.27      0.42        62

avg / total       0.87      0.86      0.83       330



In [14]:
seed = 7
models = []
models.append(('LR', LogisticRegression()))
models.append(('RandomForest', RandomForestClassifier(
    n_estimators=500, max_features=3)))
models.append(('LinearSVC', LinearSVC(
    penalty='l1', C=0.1, dual=False)))
#models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('GradientBoosting', GradientBoostingClassifier(
    n_estimators=100, random_state=seed)))
models.append(('AdaBoost', AdaBoostClassifier(
    n_estimators=100, random_state=seed)))
results = []
names = []
print("Model performance")
for name, model in models:
    kfold = KFold(n_splits=5, random_state=1)
    cv_results = cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Predict values
ensemble = VotingClassifier(models).fit(X_train, Y_train)
finalpred = ensemble.predict(X_Test)
print("Combined Model Accuracy", accuracy_score(Y_Test, finalpred))
print(confusion_matrix(Y_Test, finalpred))
print(classification_report(Y_Test, finalpred))

Model performance
LR: 0.870130 (0.018820)
RandomForest: 0.858442 (0.009543)
LinearSVC: 0.883117 (0.009183)
GradientBoosting: 0.871429 (0.013866)
AdaBoost: 0.845455 (0.021101)
Combined Model Accuracy 0.854545454545
[[265   3]
 [ 45  17]]
             precision    recall  f1-score   support

          0       0.85      0.99      0.92       268
          1       0.85      0.27      0.41        62

avg / total       0.85      0.85      0.82       330



In [15]:
result = clf.predict(df_test[supported[supported['support']==True].index.tolist()])
pd.DataFrame({'result': result}).to_csv('result.csv', index=False)

ValueError: X has 76 features per sample; expecting 139