# Machine Learning Models

In [None]:
from sklearn.model_selection import learning_curve
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import classification_report
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import calibration_curve
from scipy.sparse import coo_matrix, hstack

%matplotlib inline

In [None]:
# Read the CSV found at `url` into a DataFrame.
url = 'http://bit.ly/mc_clean_data_usc'
df = pd.read_csv(url)

# Show the first rows as a quick sanity check of what was loaded.
df.head()

In [None]:
# Document-frequency bounds handed to the vectorizer below.
# Per the scikit-learn docs, float values are read as proportions of documents.
min_df = 0.01
max_df = 0.95
# Token counts over unigrams and bigrams, with English stop words removed.
vect = CountVectorizer(stop_words='english', min_df=min_df, max_df=max_df, ngram_range=(1, 2))


# Input text column and target label column.
X = df['trim_text']
y = df['rude']

# Learn the vocabulary and build the document-term matrix in one step.
# NOTE(review): the vocabulary is fitted on all rows, before the train/test
# split performed later, so test documents influence the features - confirm
# this is acceptable for the reported scores.
X_dtm = vect.fit_transform(X)

In [None]:
# Disabled experiment kept as a bare string literal (nothing inside it runs):
# it would append three numeric columns to the document-term matrix.
# NOTE(review): the snippet refers to `X_dtm_1`, which is not defined in the
# visible cells - it would need adapting before being re-enabled.
"""
type(X_dtm_1)
review_len = np.array(df['review_len']).astype(float)
sentiment_polarity = np.array(df['sentiment_polarity']).astype(float)
sentiment_objectivity = np.array(df['sentiment_objectivity']).astype(float)

X_dtm_2 = hstack((X_dtm_1, review_len[:,None])).tocsr() 
X_dtm_3 = hstack((X_dtm_2, sentiment_polarity[:,None])).tocsr() 
X_dtm = hstack((X_dtm_3, sentiment_objectivity[:,None])).tocsr() 
"""

In [None]:
# Display the dimensions of the document-term matrix produced by the vectorizer.
X_dtm.shape

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_dtm, y, test_size=0.30, random_state=42)

## Naive Bayes (`MultinomialNB`)

In [None]:
# Base multinomial Naive Bayes estimator whose hyperparameters are tuned below.
nb = MultinomialNB()

# Create the GridSearch estimator along with a parameter object containing the values to adjust.
# - alpha: additive smoothing. A tiny positive value stands in for "no smoothing",
#   because alpha=0 triggers a scikit-learn warning (older releases clip it to
#   1.0e-10) and can produce log(0) when it is used as-is.
# - fit_prior: must be real booleans. The strings 'True' and 'False' are both
#   truthy, so the previous grid evaluated the same setting twice, and newer
#   scikit-learn releases reject non-boolean values.
param_grid = {'alpha': [1.0e-10, 1],
              'fit_prior': [True, False]}

# 10-fold cross-validated exhaustive search; verbose=3 prints per-fit progress.
grid = GridSearchCV(nb, param_grid, verbose=3, cv=10)

In [None]:
 # Fit the model using the grid search estimator. 
# This will take the Naive Bayes model and try each combination of parameters
# from the grid, scoring every candidate with the cross-validation set on `grid`.
grid.fit(X_train, y_train)

# List the best (mean cross-validated) score found by the search
print(grid.best_score_)

In [None]:
# List the best parameters for this dataset
# (the grid combination that produced `grid.best_score_`).
print(grid.best_params_)

In [None]:
 # Make predictions with the hypertuned model
# Predict on the held-out split through the fitted search object.
predictions = grid.predict(X_test)

# Per-class precision / recall / F1 summary.
# NOTE(review): target_names are matched to the labels in sorted order - this
# assumes the smaller label value means "not rude"; confirm against the data.
print(classification_report(y_test, predictions,
                            target_names=["not rude", "rude"]))

In [None]:
# Standalone model with fixed hyperparameters, fitted on the training split.
# NOTE(review): presumably these values mirror the grid-search winner - confirm
# against grid.best_params_.
nb = MultinomialNB(alpha=1.0e-10, fit_prior=True)
nb.fit(X_train, y_train)
# Hard class predictions on the test split, summarised as a confusion matrix.
y_pred_class = nb.predict(X_test)
confusion_matrix(y_test, y_pred_class)

In [None]:
# ROC analysis for the multinomial Naive Bayes model.
# A ROC curve needs a continuous score per sample. With hard 0/1 predictions
# there are only two distinct score values, so the "curve" collapses to a
# single operating point and the AUC is understated. Use the predicted
# probability of the positive class instead.
# NOTE(review): column 1 of predict_proba is nb.classes_[1]; this assumes the
# larger label value is the positive ("rude") class.
nb_scores = nb.predict_proba(X_test)[:, 1]
nb_roc_auc = roc_auc_score(y_test, nb_scores)
fpr, tpr, thresholds = roc_curve(y_test, nb_scores)

plt.figure()
plt.grid()

plt.plot(fpr, tpr, label='Naive Bayes (area = %0.2f)' % nb_roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')

plt.legend(loc="lower right")
plt.show()

In [None]:
# Learning curve for the multinomial Naive Bayes model: score as a function of
# training-set size, estimated with 10-fold cross-validation.
plt.figure()
plt.xlabel("Training examples")
plt.ylabel("Score")

train_sizes, train_scores, test_scores = learning_curve(nb, X_train, y_train, cv=10)

plt.title("Learning Curves (Naive Bayes)")

# Collapse the per-fold scores (axis 1) into a mean and a spread per training size.
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

plt.grid()

# (mean, std, colour, legend label) for each curve, in drawing order.
nb_curves = (
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
)

# Shaded bands first: one standard deviation either side of the mean.
for curve_mean, curve_std, curve_colour, curve_label in nb_curves:
    plt.fill_between(train_sizes,
                     curve_mean - curve_std,
                     curve_mean + curve_std,
                     alpha=0.1,
                     color=curve_colour)

# Then the mean lines on top of the bands.
for curve_mean, curve_std, curve_colour, curve_label in nb_curves:
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_colour, label=curve_label)

plt.ylim([0.0, 1.05])
plt.legend(loc="best")
plt.show()

## Naive Bayes (`GaussianNB`)

In [None]:
# Gaussian Naive Bayes on the same features. The matrices are densified with
# .toarray() because GaussianNB does not accept sparse input.
gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)
# Predict with the Gaussian model that was just fitted. The previous code
# called `nb.predict`, which silently re-evaluated the multinomial model.
y_pred_class = gnb.predict(X_test.toarray())
confusion_matrix(y_test, y_pred_class)

In [None]:
# ROC analysis for the Gaussian Naive Bayes model.
# Score the test samples with gnb's own positive-class probabilities: a ROC
# curve built from hard 0/1 labels degenerates to a single operating point.
# NOTE(review): column 1 of predict_proba is gnb.classes_[1]; this assumes the
# larger label value is the positive ("rude") class.
gnb_scores = gnb.predict_proba(X_test.toarray())[:, 1]
gnb_roc_auc = roc_auc_score(y_test, gnb_scores)
fpr, tpr, thresholds = roc_curve(y_test, gnb_scores)

plt.figure()
plt.grid()

plt.plot(fpr, tpr, label='Gaussian Naive Bayes (area = %0.2f)' % gnb_roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')

plt.legend(loc="lower right")
plt.show()

In [None]:
# Learning curve for Gaussian Naive Bayes (dense input, 10-fold cross-validation).
plt.figure()
plt.xlabel("Training examples")
plt.ylabel("Score")

train_sizes, train_scores, test_scores = learning_curve(gnb, X_train.toarray(), y_train, cv=10)

plt.title("Learning Curves (Gaussian Naive Bayes)")

# Mean and standard deviation across folds for every training-set size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

plt.grid()

gnb_means = [train_scores_mean, test_scores_mean]
gnb_stds = [train_scores_std, test_scores_std]
gnb_colours = ["r", "g"]
gnb_labels = ["Training score", "Cross-validation score"]

# Draw the +/- one standard deviation bands.
for band_mean, band_std, band_colour in zip(gnb_means, gnb_stds, gnb_colours):
    lower_edge = band_mean - band_std
    upper_edge = band_mean + band_std
    plt.fill_between(train_sizes, lower_edge, upper_edge, alpha=0.1, color=band_colour)

# Draw the mean score lines.
for line_mean, line_colour, line_label in zip(gnb_means, gnb_colours, gnb_labels):
    plt.plot(train_sizes, line_mean, 'o-', color=line_colour, label=line_label)

plt.ylim([0.0, 1.05])
plt.legend(loc="best")
plt.show()

## Logistic Regression

In [None]:
# Base logistic-regression estimator for the search.
# The grid below includes the L1 penalty, which scikit-learn's default solver
# (lbfgs) does not support; liblinear handles both 'l1' and 'l2', so every
# candidate in the grid can actually be fitted.
lg = LogisticRegression(solver='liblinear')

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space:
# 10 values of C spaced logarithmically between 10**0 and 10**4.
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# 10-fold cross-validated exhaustive search; verbose=3 prints per-fit progress.
grid = GridSearchCV(lg, hyperparameters, verbose=3, cv=10)

In [None]:
 # Fit the model using the grid search estimator. 
# This will take the logistic regression model and try each combination of
# parameters from the grid, cross-validating every candidate on the training split.

grid.fit(X_train, y_train)

In [None]:
# List the best parameters for this dataset
# (the grid combination with the highest mean cross-validated score).
print(grid.best_params_)

In [None]:
# List the best score (mean cross-validated score of the winning combination)
print(grid.best_score_)

In [None]:
 # Make predictions with the hypertuned model
# Predict on the held-out split through the fitted search object.
predictions = grid.predict(X_test)

# Calculate classification report (per-class precision / recall / F1).
# NOTE(review): target_names are matched to the labels in sorted order - this
# assumes the smaller label value means "not rude"; confirm against the data.
print(classification_report(y_test, predictions,
                            target_names=["not rude", "rude"]))

In [None]:
# Standalone L1-regularised logistic regression with fixed hyperparameters.
# solver='liblinear' is required: the default lbfgs solver raises for
# penalty='l1'.
# NOTE(review): presumably C and penalty mirror the grid-search winner -
# confirm against grid.best_params_.
lg = LogisticRegression(penalty='l1', C=2.7825594022071245, solver='liblinear')

lg.fit(X_train, y_train)
# Hard class predictions on the test split, summarised as a confusion matrix.
y_pred_class = lg.predict(X_test)
confusion_matrix(y_test, y_pred_class)

In [None]:
# ROC analysis for the logistic-regression model.
# Use the predicted probability of the positive class as the score: a ROC
# curve built from hard 0/1 labels degenerates to a single operating point.
# NOTE(review): column 1 of predict_proba is lg.classes_[1]; this assumes the
# larger label value is the positive ("rude") class.
lg_scores = lg.predict_proba(X_test)[:, 1]
lg_roc_auc = roc_auc_score(y_test, lg_scores)
fpr, tpr, thresholds = roc_curve(y_test, lg_scores)

plt.figure()
plt.grid()

plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % lg_roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')

plt.legend(loc="lower right")
plt.show()

In [None]:
# Learning curve for the logistic-regression model (10-fold cross-validation).
plt.figure()
plt.xlabel("Training examples")
plt.ylabel("Score")

train_sizes, train_scores, test_scores = learning_curve(lg, X_train, y_train, cv=10)

plt.title("Learning Curves (Logistic Regression)")

# Summarise the per-fold scores for each training-set size.
train_scores_mean, test_scores_mean = train_scores.mean(axis=1), test_scores.mean(axis=1)
train_scores_std, test_scores_std = train_scores.std(axis=1), test_scores.std(axis=1)

plt.grid()

# Legend label -> (colour, mean curve, spread); insertion order is drawing order.
lg_series = {
    "Training score": ("r", train_scores_mean, train_scores_std),
    "Cross-validation score": ("g", test_scores_mean, test_scores_std),
}

# One-standard-deviation bands around each mean curve.
for series_colour, series_mean, series_std in lg_series.values():
    plt.fill_between(train_sizes,
                     series_mean - series_std,
                     series_mean + series_std,
                     alpha=0.1,
                     color=series_colour)

# Mean curves with point markers.
for series_label, (series_colour, series_mean, series_std) in lg_series.items():
    plt.plot(train_sizes, series_mean, 'o-', color=series_colour, label=series_label)

plt.ylim([0.0, 1.05])
plt.legend(loc="best")
plt.show()

## KNN Model

In [None]:
# Sweep the neighbourhood size k over the odd values 1, 3, ..., 39 and record
# the accuracy on the training and test splits for each one.
# Odd values only, so that a two-class vote cannot end in a tie.
# NOTE(review): k is being compared on test-set scores; confirm that is
# intended rather than a validation split.
train_scores, test_scores = [], []

for k in range(1, 40, 2):
    # fit() returns the estimator itself, so construction and fitting chain.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_score, test_score = knn.score(X_train, y_train), knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)

In [None]:
# Complexity curve: accuracy on the training and test splits as a function of
# the number of neighbours k tried in the sweep.
plt.figure()

# The x-axis holds the k values, not training-set sizes, so label it as such.
plt.xlabel("Number of neighbors (k)")
plt.ylabel("Score")

# Same odd k values the sweep iterated over.
k_values = list(range(1, 40, 2))
# The original cell bound this (misleading) name; keep it defined for compatibility.
train_sizes = k_values

plt.title("k-Nearest Neighbors Complexity Curve")

plt.grid()

plt.plot(k_values,
         train_scores,
         'o-',
         color="r",
         label="Training score")

# These scores come from knn.score on the held-out test split, not from
# cross-validation, so the legend says so.
plt.plot(k_values,
         test_scores,
         'o-', color="g",
         label="Testing score")

plt.ylim([0.0, 1.05])
plt.legend(loc="best")
plt.show()

In [None]:
# Final k-NN model with a fixed neighbourhood size, fitted on the training split.
# NOTE(review): presumably k=11 was read off the complexity curve - confirm.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)

In [None]:
 # Make predictions with the hypertuned model
# Hard class predictions from the fitted k-NN model on the test split.
y_pred_class = knn.predict(X_test)

# Calculate classification report (per-class precision / recall / F1).
# NOTE(review): target_names are matched to the labels in sorted order - this
# assumes the smaller label value means "not rude"; confirm against the data.
print(classification_report(y_test, y_pred_class,
                            target_names=["not rude", "rude"]))

In [None]:
# Confusion matrix of the k-NN test predictions (rows: true labels, columns: predicted labels).
confusion_matrix(y_test, y_pred_class)

In [None]:
# ROC analysis for the k-NN model.
# Use the neighbour-vote fraction for the positive class as the score: a ROC
# curve built from hard 0/1 labels degenerates to a single operating point.
# NOTE(review): column 1 of predict_proba is knn.classes_[1]; this assumes the
# larger label value is the positive ("rude") class.
knn_scores = knn.predict_proba(X_test)[:, 1]
knn_roc_auc = roc_auc_score(y_test, knn_scores)
fpr, tpr, thresholds = roc_curve(y_test, knn_scores)

plt.figure()
plt.grid()

plt.plot(fpr, tpr, label='k-Nearest Neighbors (area = %0.2f)' % knn_roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')

plt.legend(loc="lower right")
plt.show()

In [None]:
# Learning curve for the k-NN model (10-fold cross-validation).
plt.figure()
plt.xlabel("Training examples")
plt.ylabel("Score")

train_sizes, train_scores, test_scores = learning_curve(knn, X_train, y_train, cv=10)

plt.title("k-Nearest Neighbors Learning Curve")

# Fold-wise mean and standard deviation for each training-set size.
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

plt.grid()

# Variability bands: mean +/- one standard deviation, training curve first.
knn_band_style = dict(alpha=0.1)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, color="r", **knn_band_style)
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, color="g", **knn_band_style)

# Mean curves drawn over the bands.
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

plt.ylim([0.0, 1.05])
plt.legend(loc="best")
plt.show()

## SVC Model

In [None]:
# Create the SVC Model
from sklearn.svm import SVC
model = SVC()

# Create the GridSearch estimator along with a parameter object containing the values to adjust.
# `gamma` has no effect on the linear kernel, so crossing it with
# kernel='linear' refits the same model four times per C value. A list of
# sub-grids keeps the search space identical for 'poly' and 'rbf' while
# fitting each linear candidate only once.
from sklearn.model_selection import GridSearchCV
svc_c_values = [1, 5, 10, 50]
svc_gamma_values = [0.0001, 0.0005, 0.001, 0.005]
param_grid = [
    {'kernel': ['linear'], 'C': svc_c_values},
    {'kernel': ['poly', 'rbf'], 'C': svc_c_values, 'gamma': svc_gamma_values},
]
# 10-fold cross-validated exhaustive search; verbose=3 prints per-fit progress.
grid = GridSearchCV(model, param_grid, verbose=3, cv=10)

In [None]:
 # Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
# from the grid, cross-validating every candidate on the training split.
grid.fit(X_train, y_train)

In [None]:
 # List the best score
# Mean cross-validated score of the winning parameter combination.
print(grid.best_score_)

In [None]:
# List the best parameters for this dataset
# (the grid combination with the highest mean cross-validated score).
print(grid.best_params_)

In [None]:
 # Make predictions with the hypertuned model
# Predict on the held-out split through the fitted search object.
predictions = grid.predict(X_test)

# Calculate classification report (per-class precision / recall / F1).
# NOTE(review): target_names are matched to the labels in sorted order - this
# assumes the smaller label value means "not rude"; confirm against the data.
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["not rude", "rude"]))

In [None]:
# Standalone RBF-kernel SVC with fixed hyperparameters.
# NOTE(review): presumably the winning combination from the grid search -
# confirm against grid.best_params_.
model = SVC(kernel='rbf', C=50, gamma=0.001)

In [None]:
# Fit the fixed-parameter SVC on the training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions from the SVC on the test split, summarised as a confusion matrix.
y_pred_class = model.predict(X_test)
confusion_matrix(y_test, y_pred_class)

In [None]:
# ROC analysis for the support vector machine.
# SVC (without probability=True) exposes no predict_proba, so rank the test
# samples by their signed distance to the decision boundary. A ROC curve built
# from hard 0/1 labels would degenerate to a single operating point.
svm_scores = model.decision_function(X_test)
svm_roc_auc = roc_auc_score(y_test, svm_scores)
fpr, tpr, thresholds = roc_curve(y_test, svm_scores)

plt.figure()
plt.grid()

# The legend previously said 'k-Nearest Neighbors' (copy-paste slip); this is the SVM curve.
plt.plot(fpr, tpr, label='Support Vector Machine (area = %0.2f)' % svm_roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')

plt.legend(loc="lower right")
plt.show()

In [None]:
# Learning curve for the SVC (10-fold cross-validation).
plt.figure()

plt.xlabel("Training examples")
plt.ylabel("Score")

train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train, cv=10)

# `model` is built with kernel='rbf', so the previous "SVM (Linear)" title
# described a different model from the one being plotted.
plt.title("SVM (RBF) Learning Curve")

# Mean and standard deviation across folds for each training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.grid()

# Shaded bands: mean +/- one standard deviation.
plt.fill_between(train_sizes,
                 train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std,
                 alpha=0.1,
                 color="r")

plt.fill_between(train_sizes,
                 test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std,
                 alpha=0.1, color="g")

# Mean curves drawn over the bands.
plt.plot(train_sizes,
         train_scores_mean,
         'o-',
         color="r",
         label="Training score")

plt.plot(train_sizes,
         test_scores_mean,
         'o-', color="g",
         label="Cross-validation score")

plt.ylim([0.0, 1.05])
plt.legend(loc="best")
plt.show()

In [None]:
# Calibration comparison: refit every model and plot its reliability curve
# (top panel) plus the histogram of its predicted scores (bottom panel).

# Fresh, unfitted estimators.
# NOTE(review): MultinomialNB uses default settings here while the other
# models use fixed tuned values - confirm that is intended.
nb = MultinomialNB()
gnb = GaussianNB()
# solver='liblinear' is required: the default lbfgs solver raises for penalty='l1'.
lg = LogisticRegression(penalty='l1', C=2.7825594022071245, solver='liblinear')
knn = KNeighborsClassifier(n_neighbors=11)
model = SVC(kernel='rbf', C=50, gamma=0.001)

# GaussianNB needs dense input; densify once instead of on every loop iteration.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
# The knn entry was previously labelled 'Random Forest' (copy-paste slip).
for clf, name in [(nb, 'Multinomial NB'),
                  (gnb, 'GaussianNB'),
                  (lg, 'Logistic Regression'),
                  (knn, 'k-Nearest Neighbors'),
                  (model, 'Support Vector Machine')]:
    clf.fit(X_train_dense, y_train)
    if hasattr(clf, "predict_proba"):
        prob_pos = clf.predict_proba(X_test_dense)[:, 1]
    else:  # use decision function
        prob_pos = clf.decision_function(X_test_dense)
        # Min-max rescale the margins to [0, 1] so they can be binned like probabilities.
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    fraction_of_positives, mean_predicted_value = calibration_curve(y_test, prob_pos, n_bins=10)

    ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
             label=name)

    ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
             histtype="step", lw=2)

ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots  (reliability curve)')
# Address each axes explicitly: a bare plt.grid() targets the current axes
# (ax2) and toggles its grid, so two bare calls turned it on and off again and
# never touched ax1.
ax1.grid(True)


ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)

ax2.grid(True)
plt.tight_layout()
plt.show()