In [None]:
# KNN
# Logistic Regression
# Decision Trees
# Random Forest
# SVM
# Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import learning_curve

from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import warnings
import os
%matplotlib inline

warnings.filterwarnings("ignore") #, category=DeprecationWarning)



In [2]:
# env variable at tensorflow1.4 per https://conda.io/docs/user-guide/tasks/manage-environments.html#saving-environment-variables
# and https://vsupalov.com/flask-sqlalchemy-postgres/

def get_env_variable(name):
    """Return the value of the environment variable ``name``.

    Raises:
        Exception: if the variable is not set; the message names the
            missing variable.
    """
    try:
        value = os.environ[name]
    except KeyError:
        raise Exception("Expected environment variable '{}' not set.".format(name))
    return value

# Connection settings come from the environment (values depend on your setup).
# Lookups run in the order listed, and the first missing variable raises.
POSTGRES_URL, POSTGRES_USER, POSTGRES_PW, POSTGRES_DB = (
    get_env_variable(key)
    for key in ("POSTGRES_URL", "POSTGRES_USER", "POSTGRES_PW", "POSTGRES_DB")
)

In [3]:
# Percent-encode the credentials so reserved URL characters in the user name or
# password (e.g. '@', ':', '/') cannot corrupt the connection URL.
# NOTE(review): assumes the env vars hold the raw (not already URL-encoded)
# credentials - confirm.
from urllib.parse import quote

DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(
    user=quote(POSTGRES_USER, safe=''),
    pw=quote(POSTGRES_PW, safe=''),
    url=POSTGRES_URL,
    db=POSTGRES_DB,
)

In [4]:
# Build the SQLAlchemy engine from the connection URL held in DB_URL.
engine_var = DB_URL
engine = create_engine(engine_var)

In [7]:
# df = pd.read_pickle('data/kickstarter_data.pkl')
# TODO determine why an ID column shows up although it is not selected - index issue?
# NOTE(review): presumably the unnamed leading column is the DataFrame's default
# integer index, which is displayed but is not a table column - confirm.
df = pd.read_sql_query('''SELECT * FROM kickstarter_data''', engine)

# Preview each label column next to its encoded *_code counterpart. The previous
# query listed main_category and currency twice instead of the *_code columns.
pd.read_sql_query('''SELECT state, state_code, main_category, main_category_code, currency, currency_code, deadline, launched, usd_goal_real, usd_pledged_real FROM kickstarter_data LIMIT 5''', engine)

Unnamed: 0,state,state_code,main_category,main_category_code,currency,currency_code,deadline,launched,usd_goal_real,usd_pledged_real
0,successful,1,Publishing,12,USD,13,2018-01-02,2017-12-06,2000.0,6083.0
1,successful,1,Music,10,USD,13,2018-01-02,2017-11-30,10000.0,11169.56
2,successful,1,Music,10,EUR,4,2018-01-02,2017-11-28,30112.5,30615.02
3,successful,1,Music,10,USD,13,2018-01-02,2017-12-09,1000.0,1743.0
4,failed,0,Food,7,USD,13,2018-01-02,2017-11-03,200000.0,1.0


In [None]:
# Feature matrix: everything except the listed columns. The target column is
# dropped as well - leaving it in X would let every model read the label directly.
# NOTE(review): other outcome-derived columns (e.g. usd_pledged_real or further
# state_* columns, if present) may leak the label too - confirm against the schema.
X = df.drop(columns=['name', 'category', 'main_category', 'currency',
                     'pledged', 'state', 'backers',
                     'country', 'usd pledged',
                     'state_successful'])

# Target column.
y = df['state_successful']

# `shape` is an attribute, not a method; calling it raises TypeError.
print(X.shape)
print(X.head())

In [None]:
# Hold-out split with a fixed seed so the split is reproducible.
# NOTE(review): test_size=0.7 leaves only 30% of the rows for training - confirm
# this is intended and not a swapped 0.3.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7,random_state=42)

# Candidate neighbor counts for the KNN sweep: 1 through 100.
k_range = list(range(1, 101))
print(X.shape)
print(y.shape)

## K-Nearest Neighbors (KNN)

In [None]:
# Fit one KNN classifier per candidate k and record its hold-out accuracy.
k_acc = []
for k in k_range:
    knn = KNeighborsClassifier(k).fit(X_train, y_train)
    knn_prediction = knn.predict(X_test)
    k_acc += [accuracy_score(y_test, knn_prediction)]

In [None]:
# Hold-out accuracy for every k tried in the sweep.
plt.plot(k_range, k_acc)
plt.xlabel('# of neighbors (k)')
plt.ylabel('Accuracy on test set')
plt.title('knn model - accuracy vs neighbors')

In [None]:
# Larger rendering of the accuracy-vs-k curve with a legend entry for KNN.
# NOTE(review): the title mentions a model comparison, but only the KNN curve
# is drawn here.
plt.figure(figsize=(16,8))
plt.plot(k_range, k_acc, '-', label = 'K Nearest Neighbor')
plt.title('Comparison of accuracy between models')
plt.xlabel('# of neighbors in knn')
plt.ylabel('Accuracy on Test Case')
plt.legend(loc = 0)

In [None]:
# TODO: get top 5 k values
# np.argmax returns the first maximum, so ties resolve to the smallest k.
best_k = k_range[np.argmax(k_acc)]
best_k

In [None]:
# learning_curve lives in sklearn.model_selection; the standalone
# sklearn.learning_curve module was deprecated and later removed.
from sklearn.model_selection import learning_curve

In [None]:
# Learning curve for logistic regression. A fresh estimator is passed because
# `log_reg` is only created further down the notebook (NameError on a clean
# top-to-bottom run); learning_curve refits a clone of the estimator anyway.
[m, train_score, test_score] = learning_curve(LogisticRegression(), X, y)

In [None]:
# Average the scores over axis 1 (one value per training size). Despite the
# "_err" suffix these hold the scores returned by learning_curve.
train_cv_err = train_score.mean(axis=1)
test_cv_err = test_score.mean(axis=1)

In [None]:
# The curves hold accuracy scores (see the y-axis), not errors, and the second
# one comes from cross-validation folds, not a separate test set, so the
# legend labels say so.
plt.plot(m, train_cv_err, label='Training Accuracy')
plt.plot(m, test_cv_err, label='Cross-Validation Accuracy')
plt.title('Learning Curve of Logistic Regression')
plt.xlabel('Size of Training Observations')
plt.ylabel('Accuracy')
plt.legend(loc=0);

In [None]:
# Learning curve for KNN using the best k selected above (default settings).
[m2, train_score2, test_score2] = learning_curve(KNeighborsClassifier(best_k), X,y)

In [None]:
# Average the KNN scores over axis 1 (one value per training size). Despite
# the "_err" suffix these hold the scores returned by learning_curve.
train_cv_err2 = train_score2.mean(axis=1)
test_cv_err2 = test_score2.mean(axis=1)

In [None]:
# The curves hold accuracy scores (see the y-axis), not errors, and the second
# one comes from cross-validation folds, not a separate test set, so the
# legend labels say so.
plt.plot(m2, train_cv_err2, label='Training Accuracy')
plt.plot(m2, test_cv_err2, label='Cross-Validation Accuracy')
plt.title('Learning Curve of Best K {}-Nearest Neighbor'.format(best_k))
plt.xlabel('Size of Training Observations')
plt.ylabel('Accuracy')
plt.legend(loc=0);

In [None]:
# Learning curve for a fixed k: 18 training-set fractions from 5% to 95%,
# 10-fold cross-validation, scored by accuracy.
k = 4
train_sizes, train_scores, test_scores = learning_curve(KNeighborsClassifier(k), 
                                                        X, y, 
                                                        train_sizes=np.linspace(0.05, 0.95, 18),
                                                        cv=10, 
                                                        scoring='accuracy')

# Mean and standard deviation over axis 1, i.e. one value per training size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

In [None]:
# Plot Training Examples vs. Accuracy
fig, ax = plt.subplots(figsize=(10, 8))

# Shaded bands: mean +/- one standard deviation (red = training, green = CV).
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
# Mean curves drawn on top of the bands.
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

ax.set_title('Learning Curves (KNeighborsClassifier(k=%d))' % k)
ax.set_xlabel('Training Examples')
ax.set_ylabel('Accuracy')
# ax.set_ylim([0, 1]);
ax.legend(loc='best');

In [None]:
# Sweep k = 1..100 again, this time storing hold-out accuracy in a NumPy array
# so it can be indexed with boolean masks later.
kval = np.arange(1, 101)
accuracy = np.zeros(kval.shape)
for idx, k in enumerate(kval):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy[idx] = accuracy_score(y_test, y_pred)

In [None]:
# Plot K vs. Accuracy (hold-out accuracy for each k in kval)
fig = plt.figure(figsize=(12, 8))
plt.plot(kval, accuracy, lw=2)
# plt.ylim([0, 1]);
plt.title('K vs. Accuracy')
plt.xlabel('K value')
plt.ylabel('Accuracy');

In [None]:
# argmax finds the first occurrence of the max, i.e. the smallest k among ties
best_k = kval[np.argmax(accuracy)]
best_k

In [None]:
# For KNN a larger K is generally preferable because it is a less complex
# model (less likely to overfit), so among the k values tied for the top
# accuracy keep the largest one.
top_accuracy = accuracy.max()
best_k = kval[accuracy == top_accuracy].max()
best_k

In [None]:
# Refit KNN with the selected k and report its hold-out accuracy.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# KNN predictions on the hold-out set, used by the two report cells below.
pred = knn.predict(X_test)

In [None]:
# Confusion matrix of the KNN predictions on the hold-out set.
print(confusion_matrix(y_test,pred))

In [None]:
# Per-class precision / recall / F1 report for the KNN predictions.
print(classification_report(y_test,pred))

## Logistic Regression

In [None]:
# Fit a default logistic regression on the training split and report its
# accuracy on the hold-out split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
logistic_prediction = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, logistic_prediction)
print(log_reg_accuracy)

In [None]:
# Bar chart of the number of rows per `state` value (class balance check).
df.state.value_counts().plot(kind = 'bar')

In [None]:
# Learning curve for logistic regression: 10-fold CV, accuracy scoring,
# default training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(LogisticRegression(), 
                                                        X, y,
                                                        cv=10, 
                                                        scoring='accuracy')

# Mean and standard deviation over axis 1, i.e. one value per training size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

In [None]:
# Plot Training Examples vs. Accuracy
fig, ax = plt.subplots(figsize=(10, 8))

# Shaded bands: mean +/- one standard deviation (red = training, green = CV).
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
# Mean curves drawn on top of the bands.
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

ax.set_title('Learning Curves (LogisticRegression)')
ax.set_xlabel('Training Examples')
ax.set_ylabel('Accuracy')
ax.legend(loc='best');

In [None]:
# Mean 10-fold cross-validated accuracy of a default logistic regression.
logreg = LogisticRegression()
log_accuracy = cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean()
log_accuracy

In [None]:
# NOTE(review): this cell repeats the logistic-regression learning-curve
# computation from earlier in this section verbatim - consider removing one copy.
train_sizes, train_scores, test_scores = learning_curve(LogisticRegression(), 
                                                        X, y,
                                                        cv=10, 
                                                        scoring='accuracy')

# Mean and standard deviation over axis 1, i.e. one value per training size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

In [None]:

# Plot Training Examples vs. Accuracy
# NOTE(review): same plot as the earlier logistic-regression learning-curve cell.
fig, ax = plt.subplots(figsize=(10, 8))

# Shaded bands: mean +/- one standard deviation (red = training, green = CV).
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

ax.set_title('Learning Curves (LogisticRegression)')
ax.set_xlabel('Training Examples')
ax.set_ylabel('Accuracy')
ax.legend(loc='best');

## SVM

## Decision Trees

## Random Forest

## Naive Bayes

In [None]:
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score, classification_report

# Gaussian naive Bayes fitted on the training split.
model = naive_bayes.GaussianNB()
model.fit(X_train, y_train)

# Predict once and reuse the result for both reports.
nb_pred = model.predict(X_test)
print("Accuracy: %.3f"% accuracy_score(y_test, nb_pred))
print(classification_report(y_test, nb_pred))

In [None]:
# Bernoulli naive Bayes fitted on the training split.
model = naive_bayes.BernoulliNB()
model.fit(X_train, y_train)

# Predict once and reuse the result for both reports.
nb_pred = model.predict(X_test)
print("Accuracy: %.3f"% accuracy_score(y_test, nb_pred))
print(classification_report(y_test, nb_pred))

In [None]:
# Multinomial naive Bayes fitted on the training split.
# NOTE(review): MultinomialNB rejects negative feature values - confirm X has none.
model = naive_bayes.MultinomialNB()
model.fit(X_train, y_train)

# Predict once and reuse the result for both reports.
nb_pred = model.predict(X_test)
print("Accuracy: %.3f"% accuracy_score(y_test, nb_pred))
print(classification_report(y_test, nb_pred))


In [None]:
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler, scale

# Standardize with the mean/std learned from the training split only and apply
# that same transform to the test split. Scaling the test split with its own
# statistics would put the two splits on different scales and leak test-set
# information into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = LinearSVC()
model.fit(X_train, y_train)

# Predict once and reuse the result for both reports.
svm_pred = model.predict(X_test)
print("Accuracy: %.3f"% accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

In [None]:
# Default SVC fitted on the current X_train / y_train.
# NOTE(review): expects X_train / X_test to have been standardized by the
# LinearSVC cell above - confirm the cells are run in order.
model = SVC()
model.fit(X_train, y_train)

# Predict once and reuse the result for both reports (SVC prediction is costly).
svm_pred = model.predict(X_test)
print("Accuracy: %.3f"% accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

## Gradient Boosting

## Ensemble

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Score each model with 10-fold CV at a single training size (70% of the
# available training rows).
model_list = [GaussianNB(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
for model in model_list:
    train_sizes, train_scores, test_scores = learning_curve(model,
                                                            X, y,
                                                            train_sizes=[0.7],
                                                            cv=10,
                                                            scoring='accuracy')

    # Only one training size is requested, so the score matrix has a single
    # row. Reduce that row to plain floats: formatting a 1-element array with
    # '%.3f' relies on NumPy's deprecated implicit array-to-scalar conversion.
    fold_scores = test_scores[0]
    test_scores_mean = np.mean(fold_scores)
    test_scores_std = np.std(fold_scores)
    print('%s:\n\tTest Score: Mean= %.3f, Std= %.3f\n' % (model, test_scores_mean, test_scores_std))

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy (mean and standard deviation over folds)
# for each model with default hyper-parameters.
model_list = [GaussianNB(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
for model in model_list:
    test_scores = cross_val_score(model, X, y=y, cv=10, scoring='accuracy')
    test_scores_mean = np.mean(test_scores)
    test_scores_std = np.std(test_scores)
    
    print('%s:\n\tTest Score: Mean= %.3f, Std= %.3f\n' % (model, test_scores_mean, test_scores_std))

## Classification Errors

In [None]:
# Define models to test as [display name, estimator] pairs
model_list = [['KNN', KNeighborsClassifier(9)], # K=9 was best performance from Challenge07
              ['Logistic', LogisticRegression()],
              ['GaussianNB', GaussianNB()],
              ['SVC', SVC(probability=True)],
              ['DecisionTree', DecisionTreeClassifier()],
              ['RandomForest', RandomForestClassifier()]]

# Calculate metrics for each model.
# `roc` maps model name -> roc_curve output (fpr, tpr, thresholds) of the last fold.
roc = {}
for model_name, model in model_list:

    accuracy = []
    precision = []
    recall = []
    f1 = []
    auc = []

    # Perform K-Fold CV and calculate metrics for each fold
    kf = KFold(5, random_state=4444, shuffle=True)
    for train_idx, test_idx in kf.split(X, y=y):
        # NOTE: these overwrite the notebook-level X_train / X_test / y_train /
        # y_test, so after this cell they hold the last fold of the last model.
        X_train = X.iloc[train_idx, :]
        X_test = X.iloc[test_idx, :]
        y_train = y.iloc[train_idx]
        y_test = y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Positive-class probability. AUC measures how well scores rank the
        # classes, so it must be computed from scores, not hard 0/1 labels.
        y_score = model.predict_proba(X_test)[:, 1]

        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_score))

    # Calculate mean metric across K-folds
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    mean_auc = np.mean(auc)

    # Capture TPR and FPR from last fold for plotting (y_test / y_score still
    # refer to the final fold here)
    roc[model_name] = roc_curve(y_test, y_score)

    # Print formatted results
    print(model)
    print('\t==============================')
    print('\tAccuracy:', mean_accuracy)
    print('\tPrecision:', mean_precision)
    print('\tRecall:', mean_recall)
    print('\tF1:', mean_f1)
    print('\tAUC:', mean_auc)
    print('\n')

In [None]:
# Plot the ROC curve from the last K-Fold split
fig, ax = plt.subplots(figsize=(12, 8))
colors = ['b', 'g', 'r', 'c', 'm', 'y']

# Dashed diagonal as the 50-50 reference line
ax.plot([0, 1], [0, 1], ls='--', color='k', label='50-50')
# One ROC curve per classifier; curve[0] is FPR and curve[1] is TPR
for (key, curve), c in zip(roc.items(), colors):
    ax.plot(curve[0], curve[1], color=c, label=key)

ax.set(xlabel='FPR', ylabel='TPR', title='Classifier Comparison')
ax.legend(loc='best');

In [None]:
# Refit a default logistic regression and predict on the current test split.
# NOTE(review): X_train / X_test / y_train / y_test are whatever earlier cells
# left behind (the final K-fold split if the metrics cell above ran last).
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [None]:
# Find the weighted precision and recall of the logistic-regression predictions
print('Weighted Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Weighted Recall:', recall_score(y_test, y_pred, average='weighted'))

In [None]:
# Per-class precision / recall / F1 report for the predictions in y_pred.
print(classification_report(y_test, y_pred)) #, target_names=target_names))

In [None]:
# One-vs-rest wrapper around logistic regression; y_score holds the
# decision_function output for the test split (not predicted labels).
logreg = OneVsRestClassifier(LogisticRegression())
logreg.fit(X_train, y_train)
y_score = logreg.decision_function(X_test)

In [None]:
# Average precision of the decision scores under each `average` setting.
print('Micro Average Precision:', average_precision_score(y_test, y_score, average='micro'))
print('Macro Average Precision:', average_precision_score(y_test, y_score, average='macro'))
print('Weighted Average Precision:', average_precision_score(y_test, y_score, average='weighted'))
print('Precision for each Class:', average_precision_score(y_test, y_score, average=None))

In [None]:
# Precision / recall at every score threshold. np.ravel flattens both a pandas
# Series and an ndarray; Series.ravel() is deprecated in recent pandas.
precision, recall, _ = precision_recall_curve(np.ravel(y_test), np.ravel(y_score))

In [None]:
# Display the precision values returned by precision_recall_curve.
precision

In [None]:
# Display the recall values returned by precision_recall_curve.
recall

In [None]:
# Plain logistic regression again (replacing the one-vs-rest wrapper bound to
# `logreg`), with its accuracy on the current test split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# NOTE: this import rebinds `auc`, which the metrics cell above used as a list.
from sklearn.metrics import roc_curve, auc

# ROC curve from the positive-class probabilities, and the area under it.
y_score = logreg.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

In [None]:
# ROC curve of the logistic regression with its AUC shown in the legend.
fig, ax = plt.subplots(figsize=(12, 8))

# Plot 50-50 Line
ax.plot([0,1],[0,1], ls='--', color='k', label='50-50')
# Plot Logistic ROC curve
ax.plot(fpr, tpr, color='b', label='Logistic: %.3f' % roc_auc)
    
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('Logistic ROC Curve')
ax.legend(loc='best');