In [1]:
import numpy as np
import pandas as pd
# Load the pre-split arrays from disk. The `_s` suffix presumably marks
# scaled feature matrices — TODO confirm against the notebook that wrote them.
X_train_s, X_test_s = np.load('data/X_train_s.npy'), np.load('data/X_test_s.npy')
y_train, y_test = np.load('data/y_train.npy'), np.load('data/y_test.npy')

# Column labels for the feature matrices; used further down to label plots and tables.
feature_names = np.load('data/feature_names.npy')


# Logistic Regression

Logistic regression is a type of regression analysis used to predict classes rather than numerical values. In
this case, its goal is to predict whether a customer is satisfied or dissatisfied. To fine-tune the
model, a cross-validated grid search is run over various combinations of tuning parameters.

In [2]:
from sklearn.model_selection import GridSearchCV

def find_best_parameters(model, parameters, x, y):
    """Run a cross-validated grid search and return the fitted search object.

    Args:
        model: Estimator instance handed to ``GridSearchCV``.
        parameters: Parameter grid handed to ``GridSearchCV``.
        x: Training features passed to ``fit``.
        y: Training targets passed to ``fit``.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    # ``fit`` returns the search object itself, so construction and fitting chain.
    return GridSearchCV(model, parameters).fit(x, y)


In [3]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Search space for the logistic regression; every combination of the values
# below is evaluated by the grid search.
# NOTE(review): `n_jobs` here is a LogisticRegression parameter, not the grid
# search's own parallelism setting.
logRegGrid = dict(
    penalty=['elasticnet'],
    solver=['saga'],
    fit_intercept=[True, False],
    class_weight=['balanced', None],
    tol=[.0001, .01, 1, 10, 100],
    C=[.01, 1, 10, 100],
    l1_ratio=np.linspace(0, 1, num=4),
    n_jobs=[-1],
)

best = find_best_parameters(
    model=LogisticRegression(),
    parameters=logRegGrid,
    x=X_train_s,
    y=y_train,
)



Given the best parameters from the grid search, the following model achieved an accuracy score of 87.36%. The model was balanced
in predicting satisfied and dissatisfied customers, as shown by the F1-scores in the classification report.

In [None]:
# List every hyperparameter value picked by the grid search, one per line.
for name, value in best.best_params_.items():
    print('{} : {}'.format(name, value))

In [None]:
from sklearn.metrics import classification_report
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

# Hyperparameters are hard-coded (presumably copied from the grid-search
# output) so this cell does not depend on re-running the expensive search.
logReg = LogisticRegression(
    C=100,
    tol=.01,
    class_weight=None,
    fit_intercept=False,
    l1_ratio=1.0,
    max_iter=1000,
    penalty='elasticnet',
    solver='saga',
    # saga shuffles the data while fitting; pin the seed so every rerun of the
    # notebook produces the same model and the same reported numbers.
    random_state=0,
)

logReg.fit(X_train_s, y_train)
predictions = logReg.predict(X_test_s)
score = logReg.score(X_test_s, y_test)  # mean accuracy on the held-out test set

cm = metrics.confusion_matrix(y_test, predictions)

plt.figure(figsize=(9,9))
# Confusion-matrix cells are integer counts, so annotate them as integers
# instead of floats with three decimals.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Accuracy Score: {:.2f}%'.format(score*100), size = 15);

print(classification_report(y_test, predictions))

In [None]:
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# importing it unconditionally breaks on current releases. Prefer
# RocCurveDisplay.from_estimator and only fall back on older installations.
from sklearn.metrics import RocCurveDisplay

if hasattr(RocCurveDisplay, 'from_estimator'):
    rf_roc = RocCurveDisplay.from_estimator(logReg, X_test_s, y_test)
else:
    from sklearn.metrics import plot_roc_curve
    rf_roc = plot_roc_curve(logReg, X_test_s, y_test)
plt.title("ROC")
plt.show()

# Feature Selection

Feature selection may improve accuracy and reduce the complexity of the model. The code below
systematically chooses the percentage of features to keep by minimizing the cross-validated mean absolute error.

In [None]:
from sklearn import feature_selection, model_selection

def select_features(training_data, target, model, percentiles, scoring='neg_mean_absolute_error', cv=5):
    """Cross-validate `model` on chi2-selected feature subsets and report the best percentile.

    For every candidate percentile, a ``SelectPercentile(chi2)`` step is chained
    in front of `model` and the pair is scored with ``cross_val_score``. The
    selector is therefore refitted on each training fold, so the validation
    folds never influence which features are kept.

    Args:
        training_data: 2-D feature matrix; ``shape[1]`` is read as the feature count.
        target: Target values aligned with `training_data`.
        model: Estimator to evaluate (cloned internally by scikit-learn).
        percentiles: Non-empty collection of percentile values to try.
        scoring: Scoring name forwarded to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        A tuple ``(optimal_percentile, results)``. ``optimal_percentile`` is an
        array of *positions* into `percentiles` (not percentile values) where
        the mean absolute score is minimal; ``results`` holds the mean absolute
        score for each candidate, in input order.

    Raises:
        ValueError: If `percentiles` is empty.
    """
    percentiles = list(percentiles)
    if not percentiles:
        raise ValueError("percentiles must contain at least one value")

    # Local import keeps the function self-contained within the notebook.
    from sklearn.pipeline import make_pipeline

    results = []
    for percentile in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=percentile)
        # Selecting inside the pipeline avoids fitting the selector on rows that
        # later serve as validation data (data leakage).
        candidate = make_pipeline(fs, model)
        scores = model_selection.cross_val_score(
            candidate, training_data, target, cv=cv, scoring=scoring,
            # Surface fit failures immediately instead of recording NaN scores.
            error_score='raise',
        )
        results.append(abs(scores).mean())
    results = np.asarray(results, dtype=float)

    # Positions of every candidate tied for the lowest error.
    optimal_percentile = np.where(results == results.min())[0]
    best_percentile = percentiles[optimal_percentile[0]]
    print ("Optimal percentile of features:{0}".format(best_percentile), "\n")
    optimal_num_features = int(best_percentile*training_data.shape[1]/100)
    print ("Optimal number of features:{0}".format(optimal_num_features), "\n")
    return optimal_percentile, results

def plot_coefficients(model, n_features, feature_names):
    """Draw a horizontal bar chart with one labelled bar per feature.

    Args:
        model: Bar lengths, one per feature. Despite the name, the callers in
            this notebook pass a column of weights, not an estimator.
        n_features: Number of bars (and y-axis ticks) to draw.
        feature_names: Tick labels, in the same order as `model`.
    """
    positions = np.arange(n_features)
    _, ax = plt.subplots(figsize=(10, 20))
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_names)
    ax.set_xlabel("Coefficient Value")
    ax.set_ylabel("Feature")
    # Leave one unit of padding below the first bar and above the last.
    ax.set_ylim(-1, n_features)
    ax.barh(positions, model, align='center')
    plt.show()


In [None]:
# Candidate percentiles to evaluate: every integer from 1 through 100.
percentiles = range(1, 101)

optimal_percentile, results = select_features(
    training_data=X_train_s,
    target=y_train,
    model=logReg,
    percentiles=percentiles,
)


In [None]:
# NOTE(review): 93 is hard-coded; presumably it is the optimum printed by
# select_features in the previous cell — confirm it still matches.
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=93)
X_train_fs = fs.fit_transform(X_train_s, y_train)

# Pair each retained feature with its chi2 score and plot them in ascending order.
kept_mask = fs.get_support()
features = pd.DataFrame({
    'columns': feature_names[kept_mask],
    'weights': fs.scores_[kept_mask],
}).sort_values(by='weights')
plot_coefficients(features['weights'], len(features['columns']), features['columns'])


In [None]:

# Keep only the test columns that the fitted selector retained.
X_test_fs = X_test_s[:,fs.get_support()]

# Refit the same estimator object on the reduced feature set.
logReg.fit(X_train_fs, y_train)
predictions = logReg.predict(X_test_fs)
score = logReg.score(X_test_fs, y_test)


cm = metrics.confusion_matrix(y_test, predictions)

plt.figure(figsize=(9,9))
# Confusion-matrix cells are integer counts, so annotate them as integers
# instead of floats with three decimals.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Accuracy Score: {:.2f}%'.format(score*100), size = 15);

# Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination wrapped around the logistic regression; the
# selector exposes predict/score, so it is evaluated directly on the test set.
selector = RFE(logReg)
selector.fit(X_train_s,y_train)
predictions = selector.predict(X_test_s)
score = selector.score(X_test_s, y_test)


cm = metrics.confusion_matrix(y_test, predictions)

plt.figure(figsize=(9,9))
# Confusion-matrix cells are integer counts, so annotate them as integers
# instead of floats with three decimals.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Accuracy Score: {:.2f}%'.format(score*100), size = 15);

In [None]:
# Reduce the training matrix to the columns RFE kept.
X_train_rfe = selector.transform(X_train_s)

# Refit on the reduced matrix so `coef_` lines up with the retained features,
# then plot the coefficients in ascending order.
logReg.fit(X_train_rfe, y_train)
features = pd.DataFrame({
    'columns': feature_names[selector.support_],
    'weights': logReg.coef_[0],
}).sort_values(by='weights')
plot_coefficients(features['weights'], len(features['columns']), features['columns'])

# Principal Components Analysis

In [None]:
from sklearn.decomposition import PCA
# Fit PCA once with all components and accumulate the explained-variance
# ratios, instead of refitting a new PCA for every candidate component count.
cumulative_variance = np.cumsum(PCA().fit(X_train_s).explained_variance_ratio_)

# Smallest component count whose cumulative ratio reaches 95%. The cap guards
# against rounding leaving the full total marginally below the threshold.
i = min(int(np.searchsorted(cumulative_variance, .95)) + 1, len(cumulative_variance))
for n_components, captured in enumerate(cumulative_variance[:i], start=1):
    print('{} components capture {:.2f}%'.format(n_components, captured*100))
variance = cumulative_variance[i - 1]

# Leave `pca` fitted on the training data with exactly `i` components, which
# is the state the following cells rely on.
pca = PCA(n_components=i)
pca.fit(X_train_s)

print('To capture at least 95% of the variance, {} principal components are needed'.format(i))

In [None]:
# Learn the projection from the training data only, then apply that same
# projection to the test data. Re-fitting PCA on the test set would place the
# test samples on different axes than the ones the model was trained on.
pca_data = pca.fit_transform(X_train_s,y_train)
pca_data_test = pca.transform(X_test_s)
logReg.fit(pca_data, y_train)
predictions = logReg.predict(pca_data_test)
score = logReg.score(pca_data_test, y_test)

cm = metrics.confusion_matrix(y_test, predictions)

plt.figure(figsize=(9,9))
# Confusion-matrix cells are integer counts, so annotate them as integers
# instead of floats with three decimals.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Accuracy Score: {:.2f}%'.format(score*100), size = 15);

print(classification_report(y_test, predictions))

In [None]:
import statsmodels.api as sm
# Fit a statsmodels logit on the RFE-reduced training matrix and print its
# summary table. No constant column is added to the design matrix here.
logit_model = sm.Logit(endog=y_train, exog=X_train_rfe)
result = logit_model.fit()
print(result.summary2())

In [None]:
# Scale each component's weights by the square root of its explained variance.
scaled_components = pca.components_.T * np.sqrt(pca.explained_variance_)

# One row per principal component, one column per original feature.
loadings = pd.DataFrame(scaled_components.T, columns=feature_names)

# Show only entries above 0.01; everything else is rendered as NaN.
# NOTE(review): negative loadings are hidden too — confirm whether an
# absolute-value filter was intended.
loadings.where(loadings > .01)