In [2]:
import tensorflow as tf
import tensorflow.keras

In [3]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore', category = DeprecationWarning)
import nltk
from nltk import word_tokenize
from datetime import datetime
import scipy.stats.distributions as dist

from statsmodels.stats.proportion import proportions_ztest


from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard
import datetime

import string
import re


In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Location of the raw Steam review dump. Set the STEAM_REVIEWS_CSV environment
# variable to run the notebook on another machine; the path originally
# hard-coded here is kept as the fallback so existing setups keep working.
import os
file = pd.read_csv(os.environ.get('STEAM_REVIEWS_CSV',
                                  r'C:\Users\xuanx\Desktop\Steam_Reviews\steam_reviews.csv'))

In [None]:
file['language'].value_counts()[:5]

In [None]:
# Keep only the rows whose 'language' column equals 'english'.
is_english = file['language'] == 'english'
df = file[is_english]

In [None]:
# Report how many reviews have no text, then keep only the rows that do.
missing_review = df['review'].isnull()
print(missing_review.sum())
df1 = df[~missing_review]

In [None]:
df1['recommended'].value_counts()

In [None]:
#Hypothesis testing: early access population mean versus release reviews means
df1.groupby('written_during_early_access')['recommended'].value_counts()

In [None]:
prop = df1.groupby('written_during_early_access')['recommended'].value_counts(normalize = True)
prop

In [None]:
n = df1.groupby('written_during_early_access')['recommended'].count()
n

In [None]:
prop_early = prop[(True, False)]
prop_not_early = prop[(False, False)]
print(prop_early, prop_not_early)

In [None]:
n_early = n[True]
n_not_early = n[False]
print(n_early, n_not_early)

In [None]:
# Manual two-proportion z statistic using the pooled proportion p_hat.
total_reviews = n_early + n_not_early
p_hat = (n_early * prop_early + n_not_early * prop_not_early) / total_reviews
pooled_var = p_hat * (1-p_hat)
std_error = np.sqrt(pooled_var / n_early + pooled_var / n_not_early)
prop_gap = prop_early - prop_not_early
z_score = prop_gap / std_error
print(z_score)

In [None]:
# Two-proportion z-test (two-sided) on the share of "not recommended" reviews
# in two groups, using statsmodels' proportions_ztest.
# NOTE(review): the counts are hard-coded; presumably element 0 is the
# early-access group and element 1 the release group, transcribed from an
# earlier groupby output -- confirm against the current data.
n_not_recommended = np.array([2644, 2597])
# Group sizes: recommended + not recommended per group.
n_row = np.array([14424 + 2644, 6717 + 2597])
from statsmodels.stats.proportion import proportions_ztest
z_score, p_value = proportions_ztest(count = n_not_recommended, nobs = n_row, alternative = 'two-sided')
z_score, p_value

# Author's conclusion: reject the null hypothesis.
# The proportion of "not recommended" reviews written during early access differs
# from the proportion written outside early access at the 1% significance level.

In [None]:
#received for free population reviews versus paid population reviews

df1.groupby('received_for_free')['recommended'].value_counts()

In [None]:
# Is the share of "not recommended" reviews larger for free copies than for paid ones?
# Counts are transcribed from the received_for_free groupby in the previous cell.
# NOTE(review): assumes free = 248307 recommended / 30925 not recommended and
# paid = 8319820 recommended / 1019635 not recommended -- confirm against that output.
rec_free_key = np.array([248307, 30925])  # free-copy counts (recommended, not recommended); kept for compatibility
n_not_recommended_free = np.array([30925, 1019635])  # not recommended: [free, paid]
n_row = np.array([248307+30925, 8319820+1019635])    # group sizes:     [free, paid]

# The previous version passed n_not_recommended (the early-access counts from an
# earlier cell) here, which tested unrelated numbers against these group sizes.
z_score, p_value = proportions_ztest(count = n_not_recommended_free, nobs = n_row, alternative = 'larger')
z_score, p_value

In [None]:
# Build a class-balanced working sample: 300 reviews per 'recommended' value.
df_clean = df1
df_20_20_samp = df_clean.groupby(['recommended'])
# random_state pins the draw so the sample (and every result derived from it)
# is reproducible across kernel restarts. Each group's index is reset, so the
# result is indexed by (recommended, 0..299).
df_20_20 = df_20_20_samp.apply(lambda x: x.sample(300, replace = False, random_state = 42).reset_index(drop=True))
df_20_20['recommended'].value_counts()
#df_20_20_samp = df_clean.sample(15000, replace = False, random_state = 42)
#df_20_20_samp['recommended'].value_counts()

In [None]:
df_20_20['recommended'].value_counts()

In [None]:
df_20_20 = df_20_20.droplevel(['recommended'])

In [None]:
df_20_20.review = df_20_20.review.astype(str)

In [None]:
#Convert text to lowercase and removing punctuation 

def remove_punct(txt):
    """Return ``txt`` with every character found in ``string.punctuation`` dropped."""
    kept_chars = filter(lambda ch: ch not in string.punctuation, txt)
    return "".join(kept_chars)

df_20_20['token_review'] = df_20_20['review'].apply(lambda x: remove_punct(x.lower()))

In [None]:
#remove numbers

def remove_digits(txt):
    """Return ``str(txt)`` with every run of digits removed.

    Only the digits are removed; surrounding whitespace is left alone so that
    neighbouring words are never glued together ("game 10hours" -> "game hours").
    The previous pattern also consumed the preceding space and skipped digits
    that were not preceded by one.
    """
    return re.sub(r'\d+', '', str(txt))

df_20_20['token_review1'] = df_20_20['token_review'].apply(lambda x: remove_digits(x))

In [None]:
#remove non alphabet characters

def remove_chars(txt):
    """Collapse each run of non-ASCII-letter characters in ``txt`` into one space."""
    non_letter_run = re.compile("[^a-zA-Z]+")
    return non_letter_run.sub(' ', txt)

df_20_20['token_review2'] = df_20_20['token_review1'].apply(lambda x: remove_chars(x))

In [None]:
#Convert reviews into tokens

def tokenize(txt):
    """Split ``txt`` on runs of non-word characters and return the word tokens.

    Empty strings produced by leading/trailing separators (or an empty input)
    are dropped so they do not show up as tokens in frequency counts or n-grams.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = [token for token in re.split(r'\W+', txt) if token]
    return tokens

df_20_20['token_review3'] = df_20_20['token_review2'].apply(lambda x: tokenize(x))

In [None]:
#remove stopwords

stopword = nltk.corpus.stopwords.words('english')

def rem_stopwords(txt):
    """Return the tokens of ``txt`` that are not in the module-level ``stopword`` list."""
    kept = list(filter(lambda token: token not in stopword, txt))
    return kept

df_20_20['token_review4'] = df_20_20['token_review3'].apply(lambda x: rem_stopwords(x))
    

In [None]:
#convert words to base form 
wn = nltk.WordNetLemmatizer()

def lemmatizer(txt):
    """Return a list with ``wn.lemmatize`` applied to each token of ``txt``, in order."""
    lemmas = list(map(lambda token: wn.lemmatize(token), txt))
    return lemmas

df_20_20['token_review5']=df_20_20['token_review4'].apply(lambda x: lemmatizer(x))

In [None]:
df_20_20['reviews_fin'] = df_20_20['token_review5'].apply(lambda x:' '.join(x))
#df_20_20['rec_var'] = df_20_20['recommended'].apply(lambda x: 0 if x==1 else 1)

In [None]:
print(df_20_20.reviews_fin.isnull().sum())

In [None]:
pos_reviews = df_20_20[df_20_20['recommended']== 1] #majority: variable positive reviews
neg_reviews = df_20_20[df_20_20['recommended']== 0] #minority: variable negative reviews

In [None]:
print(pos_reviews['review'].count(),
    neg_reviews['review'].count())

In [None]:
from nltk.probability import FreqDist

# Flatten the per-review token lists of the positive reviews into one word list
# and count how often each word occurs.
pos_rev_words = pos_reviews['token_review5']
pos_words = [word for review_tokens in pos_rev_words for word in review_tokens]

freqdist_pos = FreqDist(pos_words)
freqdist_pos

In [None]:
# Same flattening and counting for the negative reviews.
neg_rev_words = neg_reviews['token_review5']
neg_words = [word for review_tokens in neg_rev_words for word in review_tokens]

freqdist_neg = FreqDist(neg_words)
freqdist_neg

In [None]:
from wordcloud import WordCloud

# Word cloud of the most frequent words in positive reviews, built from the
# precomputed frequency table freqdist_pos.
wordcloud = WordCloud(width = 800, height = 800,
                     background_color = 'white',
                     stopwords = stopword,
                     min_font_size = 10).generate_from_frequencies(freqdist_pos)

plt.figure(figsize = (10,10), facecolor = None)
plt.imshow(wordcloud)
plt.axis('off')
# Save before showing: the figure is written to disk while it is still current.
plt.savefig('postive.png')
# Was `plt.show` without parentheses, which only referenced the function.
plt.show()


In [None]:
# Word cloud of the most frequent words in negative reviews, built from the
# precomputed frequency table freqdist_neg.
wordcloud = WordCloud(width = 800, height = 800,
                     background_color = 'white',
                     stopwords = stopword,
                     min_font_size = 10).generate_from_frequencies(freqdist_neg)

plt.figure(figsize = (10,10), facecolor = None)
plt.imshow(wordcloud)
plt.axis('off')
# Save before showing: the figure is written to disk while it is still current.
plt.savefig('negative.png')
# Was `plt.show` without parentheses, which only referenced the function.
plt.show()


In [None]:
(pd.Series(nltk.ngrams(pos_words, 2)).value_counts())[:10]

In [None]:
(pd.Series(nltk.ngrams(pos_words, 3)).value_counts())[:10]

In [None]:
(pd.Series(nltk.ngrams(neg_words, 2)).value_counts())[:10]

In [None]:
(pd.Series(nltk.ngrams(neg_words, 3)).value_counts())[:10]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 80/20 split on 'recommended'.
# df_20_20's index repeats the labels 0..299 once per class (each class was
# reset to 0..299 before the 'recommended' index level was dropped), so
# dropping rows by label would remove the same-numbered row of BOTH classes.
# Give every row a unique label first, then split on those labels.
_reviews_unique = df_20_20.reset_index(drop = True)
train = _reviews_unique.groupby('recommended').sample(frac = 0.8, random_state = 42)

# Everything not drawn into train becomes the test set.
test = _reviews_unique.drop(train.index)

train['recommended'].value_counts()

In [None]:
X_train = train["reviews_fin"]
y_train = train['recommended'].replace({True: 1, False: 0})
X_test = test['reviews_fin']
y_test = test['recommended'].replace({True: 1, False: 0})

In [None]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_train_arr = X_train_vec.toarray()
print(X_train_arr.shape)

In [None]:
X_test_vec = vectorizer.transform(X_test)
X_test_arr = X_test_vec.toarray()
print(X_test_arr.shape)

In [None]:
n_words = X_train_arr.shape[1]

In [None]:
#nn model
nn_model = Sequential()
nn_model.add(Dense(12, input_shape = (n_words,), activation = 'relu'))
#model.add(Dropout(0.1))
nn_model.add(Dense(8, activation = 'relu'))
#model.add(Dropout(0.1))
nn_model.add(Dense(1, activation ='sigmoid'))

tensorBoardCallback = TensorBoard(log_dir = './logs', write_graph = True)
nn_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])


In [None]:
start_time = datetime.datetime.now()

history = nn_model.fit(X_train_arr, y_train, epochs = 10, verbose = 2)



In [None]:
loss, acc = nn_model.evaluate(X_test_arr, y_test, verbose = 0)
print('Test Accuracy: %f' % (acc*100))

In [None]:
plt.plot(history.history['accuracy'])
#plt.plot(history.history['val_accuracy'])
plt.legend(["Train", "test"], loc = 'upper right')
plt.show()

In [None]:
import xgboost

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from time import time
from pprint import pprint
import logging
from sklearn.metrics import recall_score, roc_auc_score, make_scorer, roc_curve
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier

In [None]:
#Create Pipeline

pipe = Pipeline(steps = [("vects", TfidfVectorizer()), ("classifier", RandomForestClassifier())])

pipe.get_params().keys()

In [None]:
#Create the hyperparameter grids (lists of dicts) for GridSearchCV over `pipe`

# Compare the two classifiers with their default settings.
search_space = [
                {"classifier": [RandomForestClassifier()]},
                {"classifier": [MultinomialNB()]}
               ]

# Random forest only, varying tree depth.
test_params = [
                {"classifier": [RandomForestClassifier()],
                'classifier__max_depth': [10, 20, 50, 100]}
               ]

# Wider grid over vectorizers and classifiers.
# The random-forest entry used to list 'classifier__max_depth' twice; a dict
# literal keeps only the last value, so the first list was silently discarded.
# Only the effective entry is kept.
back_up_space = [{"vects": [TfidfVectorizer()],
                "vects__ngram_range": [(1,1), (2,2), (3,3)],
                'vects__use_idf': [True, False]},
                {"vects": [CountVectorizer()],
                "vects__ngram_range": [(1,1), (2,2), (3,3)]},
                {"classifier": [RandomForestClassifier()],
                'classifier__n_estimators': [200, 400, 600, 800],
                'classifier__max_depth': [20,30,60,None] },
                {"classifier": [MultinomialNB()]}
               ]

In [None]:
from sklearn import metrics
# 5-fold cross-validation; the shuffle is seeded so fold assignment (and the
# scores) are reproducible between runs.
cv = KFold(n_splits=5, shuffle = True, random_state = 42)
# Built-in scorer names replace make_scorer(roc_auc_score, needs_proba=True):
# the needs_proba argument is deprecated in recent scikit-learn releases.
# Both candidate classifiers expose predict_proba, which 'roc_auc' uses.
scoring = {'AUC': 'roc_auc', 'Accuracy': 'accuracy'}
# refit='AUC': the configuration with the best mean CV AUC is refitted on all
# of X_train and used by best_model.predict/predict_proba/score.
grid = GridSearchCV(estimator = pipe, param_grid = search_space, cv = cv, scoring = scoring,
                    return_train_score = True, verbose = 1, n_jobs = -1, refit = 'AUC', error_score = 'raise')
best_model = grid.fit(X_train, y_train)

In [None]:
print(best_model.best_score_, best_model.best_estimator_, best_model.best_params_)

In [None]:
#building a df from cv data
cv_scores = pd.DataFrame(best_model.cv_results_)
print(cv_scores)

In [None]:
#print(cv_scores['params'].iloc[24])

In [None]:
y_pred = best_model.predict_proba(X_test)[:,1]
print(best_model.score(X_test, y_test))
print(roc_auc_score(y_test, y_pred))

In [None]:
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
print(train_features.shape, test_vectors.shape)

In [None]:


# Shared 5-fold splitter for all four grid searches below; seeded so every
# search sees the same, reproducible folds.
cv = KFold(n_splits=5, shuffle = True, random_state = 42)


# Four candidate pipelines: each pairs one vectorizer (raw counts or TF-IDF)
# with one classifier (multinomial naive Bayes or random forest).
pipe_multi_count = Pipeline([('vect', CountVectorizer()),
                      ('clf', MultinomialNB())])

pipe_rf_count = Pipeline([('vect', CountVectorizer()),
                      ('clf', RandomForestClassifier(random_state = 42))])

pipe_multi_tfidf = Pipeline([('vect', TfidfVectorizer()),
                      ('clf', MultinomialNB())])

pipe_rf_tfidf = Pipeline([('vect', TfidfVectorizer()),
                      ('clf', RandomForestClassifier(random_state = 42))])


# Naive Bayes grid: only the n-gram size of the vectorizer is varied
# (unigrams only, bigrams only, trigrams only).
grid_params_multi = {'vect__ngram_range': [(1,1), (2,2), (3,3)]
                     }

# Random forest grid: n-gram size x tree depth x number of trees
# (3 * 4 * 5 = 60 combinations per search).
grid_params_rf = {'vect__ngram_range': [(1,1), (2,2), (3,3)],
                  'clf__max_depth': [10, 20, 50, 100],
                  'clf__n_estimators': [200, 500, 800, 1000, 1500]
                     }

multiNB = GridSearchCV(estimator = pipe_multi_count,
                       param_grid = grid_params_multi,
                       scoring = 'roc_auc',
                       cv = cv,
                       
                       error_score = 'raise',
                       n_jobs = -1)

rf = GridSearchCV(estimator = pipe_rf_count,
                  param_grid = grid_params_rf,
                  scoring = 'roc_auc',
                  cv = cv,
                  error_score = 'raise',
                  n_jobs = -1)

multiNB_tfidf = GridSearchCV(estimator = pipe_multi_tfidf,
                       param_grid = grid_params_multi,
                       scoring = 'roc_auc',
                       cv = cv,
                       
                       error_score = 'raise',
                       n_jobs = -1)

rf_tfidf = GridSearchCV(estimator = pipe_rf_tfidf,
                  param_grid = grid_params_rf,
                  scoring = 'roc_auc',
                  cv = cv,
                  error_score = 'raise',
                  n_jobs = -1)


grids = [multiNB, rf, multiNB_tfidf, rf_tfidf] #rf

grid_dict = {0: 'NB countvects',
             1: 'rf countvects',
             2: "NB tfidf",
             3: 'rf tfidf'}

print('Performing Model Optimizations')
best_auc = 0.0
best_clf = 0
best_gs = ''

# Fit every grid search, report its scores, plot and save its ROC curve on the
# test set, and remember which search scored the highest test AUC.
for idx, gs in enumerate(grids):
    name = grid_dict[idx]
    print('\nEstimator: %s' % name)
    gs.fit(X_train, y_train)
    print('Best params are : %s' % gs.best_params_)
    # best_score_ is the mean cross-validated ROC AUC of the best parameter set
    # (scoring='roc_auc'), measured on the training data's folds.
    print('Best training AUC score: %.3f' % gs.best_score_)
    # Probability of the positive class from the refitted best estimator.
    y_pred_proba = gs.predict_proba(X_test)[:,1]
    # Test-set ROC AUC, computed once and reused below.
    auc = roc_auc_score(y_test, y_pred_proba)
    print('Test set AUC score for best params: %.3f ' % auc)
    # Plot the ROC curve for this estimator against the chance diagonal.
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.plot([0,1],[0,1], linestyle ='--')
    plt.plot(fpr, tpr, label = 'auc=%.3f' % auc)
    plt.title(label = 'ROC Curve %s' % name)
    # Without a legend the 'auc=...' label was never displayed.
    plt.legend()
    plt.savefig(name + ".png")
    plt.show()
    # Track the model with the highest test AUC.
    # NOTE(review): choosing the winner on the test set makes the reported
    # test AUC optimistic; consider selecting on gs.best_score_ instead.
    if auc > best_auc:
        best_auc = auc
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set AUC: %s' % grid_dict[best_clf])




In [None]:
cv1_scores = pd.DataFrame(rf.cv_results_).sort_values(by = 'rank_test_score')


In [None]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train).toarray()
X_train_vec_df = pd.DataFrame(X_train_vec)

X_test_vec = vectorizer.transform(X_test).toarray()
X_test_vec_df = pd.DataFrame(X_test_vec)


In [None]:
words = vectorizer.get_feature_names_out()

In [None]:
#X_train_vec_df.columns = words
#X_test_vec_df.columns = words

In [None]:
clf = MultinomialNB()
clf_fit = clf.fit(X_train_vec_df, y_train)


In [None]:
import shap
from scipy import interpolate
from sklearn import metrics

In [None]:
#clf_fit = pipe_multi_tfidf.fit(X_train, y_train)

In [None]:
feature_names = vectorizer.get_feature_names_out()

shap.initjs()
explainer = shap.explainers.Permutation(clf_fit.predict_proba, X_train_vec_df, max_evals = 3000,
                           feature_names = feature_names)


In [None]:

shap_values = explainer(X_test_vec_df[:100])
shap_values = shap_values[...,1]

np.shape(shap_values)


In [None]:
shap.plots.bar(shap_values)

In [None]:
# Summary plot over all explained rows. shap_values[0] is a single row, but the
# summary plot needs a (rows x features) matrix plus the matching feature rows,
# so the feature slice is cut to the number of explained rows.
_shap_matrix = shap_values.values
shap.summary_plot(_shap_matrix, X_test_vec_df.iloc[:_shap_matrix.shape[0], :], feature_names = feature_names)

In [None]:
def f(X):
    """Prediction wrapper for model-agnostic SHAP explainers.

    Passes the feature matrix ``X`` (rows = samples) to ``clf_fit.predict``
    and returns the predictions as a flat array.

    The previous body called the undefined name ``frange`` and split ``X``
    into a list of columns. That form is meant for models with one input per
    feature; the scikit-learn classifier fitted in this notebook takes the
    matrix as is.
    """
    return clf_fit.predict(X).flatten()

In [None]:
# Kernel SHAP explanation of a single row, using the first 50 rows as background.
# Fixes: the original line had an unbalanced parenthesis (SyntaxError) and used
# the undefined names X / X_display.
# NOTE(review): X_train_vec_df is assumed to be the intended matrix (row 299
# does not exist in the smaller test matrix) -- confirm.
# Separate variable names are used so the `explainer` / `shap_values` objects
# that the later plotting cells rely on are not overwritten.
kernel_explainer = shap.KernelExplainer(f, X_train_vec_df.iloc[:50,:])
kernel_shap_values = kernel_explainer.shap_values(X_train_vec_df.iloc[299,:], nsamples = 500)
shap.force_plot(kernel_explainer.expected_value, kernel_shap_values, X_train_vec_df.iloc[299,:],
                feature_names = words)

In [None]:
import copy

shap.initjs()
# Work on a copy so the shared shap_values object is left untouched.
shap_values1 = copy.deepcopy(shap_values)
# Keep only the output at index 1 when a per-output axis is still present.
# shap_values has already been reduced with [..., 1] in an earlier cell, in
# which case indexing a third axis would raise IndexError.
if shap_values1.values.ndim == 3:
    shap_values1.values = shap_values1.values[:,:,1]
    shap_values1.base_values = shap_values1.base_values[:,1]

# show=False keeps the figure open so it can be written to disk.
shap.plots.beeswarm(shap_values1, max_display = 20, show= False)
plt.savefig('beeswarm.png', bbox_inches="tight")

In [None]:
shap.initjs()

ind = 48
print(X_test.iloc[ind], y_test.iloc[ind])

In [None]:
# Waterfall plot for review `ind`, using the output at index 1.
# shap_values may already be reduced to that output (2-D); only slice the
# output axis when it is still there.
_shap_class1 = shap_values[..., 1] if len(shap_values.shape) == 3 else shap_values
# show=False: with show=True the figure is displayed and finished before
# savefig runs, so the saved image comes out blank.
shap.plots.waterfall(_shap_class1[ind], show = False)
plt.savefig('waterfall.png', bbox_inches="tight")
plt.show()


In [None]:
# SHAP dependence scatter for the feature 'story', using the output at index 1.
# shap_values may already be reduced to that output (2-D); only slice the
# output axis when it is still there.
_shap_class1 = shap_values[..., 1] if len(shap_values.shape) == 3 else shap_values
# show=False so the figure is still open when it is saved.
shap.plots.scatter(_shap_class1[:, words.tolist().index('story')], show = False)

plt.savefig('scatter.png', bbox_inches="tight")
plt.show()

In [None]:
y_test.value_counts()

In [None]:
# Per-word importance scores, highest first.
# clf_fit is fitted as a MultinomialNB in this notebook, which has no
# feature_importances_ attribute (that exists on tree ensembles). Fall back to
# the absolute difference of the per-class log-probabilities of each word.
# NOTE(review): assumes rows 0 and 1 of feature_log_prob_ are the two classes
# of a binary problem -- confirm via clf_fit.classes_.
if hasattr(clf_fit, 'feature_importances_'):
    _raw_scores = clf_fit.feature_importances_
else:
    _raw_scores = np.abs(clf_fit.feature_log_prob_[1] - clf_fit.feature_log_prob_[0])

feature_scores = pd.Series(_raw_scores,
                          index = words).sort_values(ascending=False)
feature_scores[:10]


In [None]:
from sklearn.metrics import precision_recall_curve

# Precision-recall curve of clf_fit on the test set, and the threshold that
# maximises the F1 score.
y_pred_proba = clf_fit.predict_proba(X_test_vec_df)[:,1]
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
# F1 per curve point; points with precision + recall == 0 get 0 instead of NaN
# (np.argmax would otherwise pick the NaN).
_pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, _pr_sum,
                   out = np.zeros_like(_pr_sum, dtype = float), where = _pr_sum > 0)
# precision/recall have one more entry than thresholds (the final point has no
# threshold), so search only the points that do have one.
ix = np.argmax(fscore[:-1])
print('Best Threshold = %f, F-Score = %.3f' % (thresholds[ix], fscore[ix]))
# Area under this PR curve; the label previously reused the ROC AUC left over
# from the last model of the grid-search loop.
pr_auc = metrics.auc(recall, precision)
plt.plot(recall, precision, label = 'auc=%.3f' % pr_auc, marker = '.')
plt.plot(recall[ix], precision[ix], marker = 'o', color = 'black', label = 'Best')
plt.title(label = 'Precision Recall Curve')
plt.legend()
plt.savefig('prec_recall.png', bbox_inches="tight")
plt.show()
optimal_thresh = thresholds[ix]
optimal_prec = precision[ix]

In [None]:
feat_imp = feature_scores.nlargest(10).sort_values(ascending=True)
feat_imp.plot(kind = 'barh', figsize = (10,10))
plt.savefig('feat_imp.png', bbox_inches="tight")

In [None]:
from sklearn.metrics import confusion_matrix

# Classify with the F1-optimal threshold. precision_recall_curve reports
# precision/recall for predictions with score >= threshold, so the comparison
# must be inclusive to reproduce the selected operating point.
y_pred = (y_pred_proba >= optimal_thresh)
# labels=[1,0]: row/column 0 is class 1, row/column 1 is class 0.
conf_mat = confusion_matrix(y_test, y_pred.astype(int), labels = [1,0])
conf_mat

In [None]:
print(X_test.shape)
y_pred_df = y_pred.astype(int)
X_variables = pd.DataFrame(X_test)
X_variables['predictions'] = y_pred_df

In [None]:
# Reviews the model predicts as NOT recommended. The target encodes
# recommended=True as 1, so negative predictions are the rows with 0
# (the previous filter `== 1` selected the predicted-positive reviews).
negative_reviews = X_variables[X_variables['predictions'] == 0 ]
negative_reviews.head(10)


In [None]:
negative_reviews.count()

In [None]:
X_variables.iloc[ind]

## Unused Code For Now

In [None]:
import tensorflow.keras

In [None]:
nn_train_data = pd.concat([X_train, y_train], axis = 1)
nn_train_data.value_counts('recommended')

In [None]:
nn_balance = nn_train_data[nn_train_data['recommended']==0].sample(4198, random_state = 48)
nn_balance.value_counts('recommended')
nn_train = pd.concat([nn_balance, nn_train_data[nn_train_data['recommended']==1]])
nn_train.value_counts('recommended')
nn_xtrain = nn_train['reviews_fin']
nn_ytrain = nn_train['recommended']

In [None]:
vectorizer = TfidfVectorizer()
X_train_arr = vectorizer.fit_transform(nn_xtrain).toarray()

In [None]:
X_test_arr = vectorizer.transform(X_test).toarray()

In [None]:
print(X_train_arr.shape, X_test_arr.shape)

In [None]:
n_words = X_train_arr.shape[1]
n_words

In [None]:
nn_ytrain.value_counts()
y_test.value_counts()

In [None]:
#nn model
model = Sequential()
model.add(Dense(200, input_shape = (n_words,), activation = 'relu'))
#model.add(Dropout(0.1))
model.add(Dense(100, activation = 'relu'))
#model.add(Dropout(0.1))
model.add(Dense(1, activation ='sigmoid'))

tensorBoardCallback = TensorBoard(log_dir = './logs', write_graph = True)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
start_time = datetime.datetime.now()

history = model.fit(X_train_arr, nn_ytrain, epochs = 10, callbacks=[tensorBoardCallback], verbose = 2,
                   validation_data = (X_test_arr, y_test))



In [None]:
loss, acc = model.evaluate(X_test_arr, y_test, verbose = 0)
print('Test Accuracy: %f' % (acc*100))

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.legend(["Train", "test"], loc = 'upper right')
plt.show()

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(["Train", "test"], loc = 'upper right')
plt.show()

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

In [None]:
precision.shape, recall.shape, thresholds.shape

In [None]:
precision = np.delete(precision, -1)
recall = np.delete(recall, -1)

precision.shape, recall.shape, thresholds.shape

In [None]:
#locate recall from selected precision

precision_score = optimal_prec
find_recall = interpolate.interp1d(precision, recall)
find_thresh = interpolate.interp1d(precision, thresholds)
thresh_value = find_thresh(precision_score)
recall_score = find_recall(precision_score)
recall_score, thresh_value

In [None]:
plt.plot(recall, precision, label = 'auc=%.3f' % auc, marker = '.')
plt.plot(recall_score, precision_score, marker = 'o', color = 'black', label = 'Best')
plt.title(label = 'Precision Recall Curve')
plt.show()


In [None]:
tpr_opt = find_tpr(thresh_value)
fpr_opt = find_fpr(thresh_value)
print(tpr_opt, fpr_opt, thresh_value)

In [None]:
#locating optimal point
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

find_tpr = interpolate.interp1d(thresholds, tpr) #plug in thresholds for tpr
find_fpr = interpolate.interp1d(thresholds, fpr) #plug in thresh for fpr
thresh_look = interpolate.interp1d(tpr, thresholds) #plug in tpr for thresholds


In [None]:
ttt = thresh_look(0.6) #threshold value for value of tpr
ttt

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test,y_pred_proba)
plt.plot([0,1],[0,1], linestyle ='--')
plt.plot(fpr, tpr, label = 'auc=%.3f' % auc, marker = '.')
plt.plot(fpr_opt, tpr_opt, marker = 'o', color = 'black')
plt.title(label = 'ROC Curve Random Forest Classifier')
plt.show()

In [None]:
y_pred_opt = (y_pred_proba > ttt)
conf_mat_opt = confusion_matrix(y_test, y_pred_opt, labels = [0,1])
conf_mat_opt

In [None]:
#find best true negative (tn) results
# Scan the candidate thresholds above 0.5 and keep the one whose confusion
# matrix has the most true negatives, provided it beats the starting floor.
# high_thresh is derived here because the cell defining it comes later in the
# notebook, which made this cell fail on a fresh top-to-bottom run.
high_thresh = thresholds[thresholds > 0.5]
tn = 7000            # minimum true-negative count a threshold must exceed
best_thresh = 0
conf_mat_best = None  # stays None when no threshold beats the floor
for t in high_thresh:
    y_pred_opt = (y_pred_proba > t)
    conf_mat_opt = confusion_matrix(y_test, y_pred_opt, labels = [0,1])
    # With labels=[0,1], rows are actual and columns predicted classes, so
    # true negatives are at [0][0]; [1][0] (used before) is false negatives.
    if conf_mat_opt[0][0] > tn:
        best_thresh = t
        tn = conf_mat_opt[0][0]
        conf_mat_best = conf_mat_opt

best_thresh, tn, conf_mat_best

In [None]:
high_thresh = thresholds[thresholds > 0.5]
high_thresh

In [None]:
from sklearn.metrics import accuracy_score

#accuracy prior to choosing threshold
# Use the default 0.5 cut-off. Casting probabilities straight to int truncates
# every value below 1.0 to 0, i.e. it predicts class 0 for nearly every row.
accuracy = accuracy_score(y_test, (y_pred_proba >= 0.5).astype('int'))
print(accuracy)

In [None]:
# Accuracy at the tuned threshold.
# `opt_thresh` is not defined anywhere in the notebook; the threshold stored
# by the precision-recall cell is `optimal_thresh`.
# NOTE(review): confirm this is the threshold that was intended.
y_pred = y_pred_proba
y_pred = (y_pred >= optimal_thresh).astype('int')

accuracy_thresh = accuracy_score(y_test, y_pred)
print(accuracy_thresh)

In [None]:
plt.plot(recall, precision, label = 'auc=%.3f' % auc, marker = '.')
plt.plot(0.9, prec_point, marker = 'o', color = 'black', label = 'Best')
plt.title(label = 'Precision Recall Curve')
plt.show()
