In [None]:
#Imports 
from IPython.display import display

from collections import Counter
import pandas as pd
import numpy as np

from scipy.stats import ks_2samp
from scipy.stats import spearmanr

#Preprocessing and Visual

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay

from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from IPython.display import display

from sklearn.decomposition import PCA

from sklearn.model_selection import cross_val_score



%matplotlib inline

# RANDOM STATE 42 USED THROUGHOUT SO VALUES ARE CONSISTENT WITH REPORT

In [None]:
#Load Dataset via pandas
FILE_NAME = "data.csv" #PLEASE CHANGE

# Read the CSV from disk once. `main_df` is kept untouched as the reference
# copy; `df` is an independent copy that the cleaning cells below modify.
main_df = pd.read_csv(FILE_NAME) #for reference
df = main_df.copy() #file to change



In [None]:
# Column categories - named groups that make later cells easier to read.

# Survey items Q1 .. Q32.
all_questions = [f"Q{number}" for number in range(1, 33)]

# Model inputs: every question except Q17 (which becomes the target), plus age.
# 'age' must stay last because later cells skip it with training_features[:-1].
training_features = [question for question in all_questions if question != "Q17"]
training_features.append("age")

# Humour-style score columns (spelling follows the dataset's column names).
humour_scores = ["affiliative", "selfenhancing", "agressive", "selfdefeating"]

# Respondent metadata columns.
user_characteristics = ["age", "gender", "accuracy"]


In [None]:
# Quick overview of the raw frame: (rows, columns) followed by the column labels.
display(df.shape, df.columns)

In [None]:
#Removing 0 accuracy responses, gender and humour scores

# Keep only respondents whose self-reported accuracy is non-zero.
df = df[df.accuracy != 0]
# main_df is the untouched original, so original minus remaining is the
# (non-negative) number of rows that were filtered out.
print(f"These many instances removed: {len(main_df) - len(df)}")

# Drop columns that are not used as model inputs; errors="ignore" tolerates
# columns that are already absent.
df = df.drop(columns=humour_scores + ["gender", "accuracy"], errors="ignore")
display(df.columns)


In [None]:
# NaN check (the -1 sentinel is not a NaN and is handled in later cells).
# The set collapses the per-column flags to the distinct values seen.
null_flags = set(df.isnull().any())
display(f"Null Values present?: {null_flags}")

# Distinct dtypes across all columns.
dtypes_present = set(df.dtypes)
display(f"Data Types present: {dtypes_present}")

In [None]:
# Apparent range of the answers, using Q1 as the representative question.
q1_levels = df.Q1.unique()
q1_answered_levels = df.loc[df.Q1 != -1, "Q1"].unique()
age_levels = df.age.unique()

display(f"Number of values for questions : {len(q1_levels)}, therefore can be considered Ordinal since also integer")
display(f"Number of values for questions excluding -1: {len(q1_answered_levels)}, therefore can be considered Ordinal")
display(f"Number of unique age values: {len(age_levels)}, large number therefore can be considered continous")

# Author's note: the range is consistent throughout all questions.
# According to the codebook, -1 indicates an unanswered response.




In [None]:

# Finding age outliers

display(df.age.describe())
# Ages above max_age (122) are treated as outliers.

# NOTE(review): the box plot uses main_df (unfiltered) while the statistics
# above use df - confirm this is intended.
sns.boxplot(main_df.age)
plt.show()

age_value_counts = df.age.value_counts()
max_age = 122

# Restrict the counts to ages above the cut-off; the value_counts order is kept.
ages_over_limit = age_value_counts[age_value_counts.index > max_age]
for value, occurrences in ages_over_limit.items():
    print(value, ":", occurrences)

sums = ages_over_limit.sum()
print(f"{sums} value(s) over 122")


In [None]:
# Find the question column with the most unanswered (-1) responses, to decide
# whether any column is worth removing.
missing_per_question = (df[all_questions] == -1).sum()

max_missing = 0
column_label = ""
for column, n_missing in missing_per_question.items():
    # Strict '>' keeps the first column on ties, and leaves the defaults
    # untouched when nothing is missing.
    if n_missing > max_missing:
        max_missing, column_label = n_missing, column

f"Column with most amount of blanks: {column_label} with {max_missing} missing in column, insignificant amount"


In [None]:
# Count unanswered (-1) questions per row so that rows with many blanks can be removed.

per_column_missing_value = {}  # column -> row labels with -1 in that column (reused later)
minus_indexs = []              # every (row label) occurrence of a -1, across all questions

for column in df[all_questions]:
    missing_rows = df.index[df[column] == float(-1)]
    per_column_missing_value[column] = missing_rows
    minus_indexs.extend(missing_rows)

# row label -> number of unanswered questions in that row
NUMBER_OF_MISSING_PER_INDEX = Counter(minus_indexs)

# number of unanswered questions -> how many rows have exactly that many
NUMBER_OF_MISSING_COUNT = Counter(NUMBER_OF_MISSING_PER_INDEX.values())

OVER_LIMIT = 4  # Threshold: rows with more unanswered questions than this are removed
more_than_4_missing = sum(
    n_rows
    for n_unanswered, n_rows in NUMBER_OF_MISSING_COUNT.items()
    if n_unanswered > OVER_LIMIT
)

display(f"{more_than_4_missing} entries had more than 4 (> 4) missing values which will be removed")   #Including Q17 -> Q17 and three more or 4 other

# Rows with an unanswered Q17 are all removed, since Q17 is the basis of the target.
Q17_MINUS_INDEXS = df.index[df["Q17"] == float(-1)]

display(f"Q17 had {len(Q17_MINUS_INDEXS)} blanks which will be removed")




In [None]:
# Row labels to drop: rows with more than OVER_LIMIT unanswered questions,
# plus every row whose Q17 answer is missing.
rows_over_limit = [
    row_label
    for row_label, n_unanswered in NUMBER_OF_MISSING_PER_INDEX.items()
    if n_unanswered > OVER_LIMIT
]

# A row can meet both criteria; dict.fromkeys de-duplicates while preserving
# order, so the count reported below equals the number of rows actually dropped.
to_remove = list(dict.fromkeys([*rows_over_limit, *Q17_MINUS_INDEXS]))

display(f"{len(to_remove)} will be removed since they either contain more than 4 missing values and/or have Q17 missing")

# errors="ignore" makes the cell tolerant of labels that are already gone.
df = df.drop(index=to_remove, errors="ignore")




In [None]:
# Keep the per-column bookkeeping in sync with the rows just dropped:
# convert each stored Index to a plain list and discard the removed row labels,
# so later cells only see rows that are still in the dataset.

removed_rows = set(to_remove)

per_column_missing_value = {
    key: [row_label for row_label in missing_rows.to_list() if row_label not in removed_rows]
    for key, missing_rows in per_column_missing_value.items()
}


# per_column_missing_value



In [None]:
# Encode Q17 as a binary target and show the class distribution.
# pd.cut bins are right-closed: (0, 1] -> label 0 and (1, 5] -> label 1.
df["Binary_Q17"] = pd.cut(df["Q17"], bins=[0, 1, 5], labels=[0, 1])
BINARY_Q17_VALUE_COUNTS = df["Binary_Q17"].value_counts(normalize=True)

# The raw Q17 column is no longer needed once the binary target exists.
df = df.drop(columns="Q17", errors="ignore")
display(BINARY_Q17_VALUE_COUNTS)

class_labels = [0, 1]
class_shares = [BINARY_Q17_VALUE_COUNTS[label] for label in class_labels]
plt.pie(class_shares, labels=class_labels, autopct='%0.01f%%', startangle=90)
plt.show()






In [None]:
# Splitting the data set: 80/20 train/test, stratified on the binary target.
# TODO: the original author marked this cell "NEEDS TO BE CHANGED".
target = df["Binary_Q17"]

X_train, X_test, y_train, y_test = train_test_split(
    df[training_features],
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)


In [None]:
# Replace the sentinel values in the train split, the test split and df.
# Both replacement statistics are computed from the training split only.

training_set_question_modes = X_train.mode().drop("age", axis = 1) #TRAINING SET MODE
training_set_age_median = X_train.age.median() #TRAINING SET AGE MEDIAN

# DataFrame.mode() returns one row per tied mode (NaN-padded), so the frame can
# have several rows. Passing a multi-row column to np.where fails to broadcast;
# take the first row so each question has exactly one replacement value.
first_question_modes = training_set_question_modes.iloc[0]

#Replace values across 3 datasets, X_train, X_test, df
for dataset in X_train, X_test, df:

    for column in training_features[:-1]: # Skip age column (last entry)
        # -1 marks an unanswered question -> training-set mode of that question
        dataset[column] = np.where(dataset[column] == -1, first_question_modes[column], dataset[column])

    # Ages above 122 are treated as outliers -> training-set median age
    dataset["age"] = np.where(dataset.age > 122, training_set_age_median, dataset.age)


# # list(training_set_question_modes.values.ravel())
# list(testing_set_question_modes.values.ravel())

Exploratory Data Analysis

In [None]:
# Age analysis: compare the age distribution of the two target classes.
# NOTE(review): df contains both the train and the test rows - confirm that
# using the full frame for this exploratory test is intended.

age_class_0 = df[df["Binary_Q17"]==0]["age"]
age_class_1 = df[df["Binary_Q17"]==1]["age"]

# Two-sample Kolmogorov-Smirnov test between the two age samples.
kolmogorov_smirnov_results = ks_2samp(age_class_0, age_class_1)

fig, ax = plt.subplots()
for age_sample, legend_label in ((age_class_0, "Binary_Q17: 0"), (age_class_1, "Binary_Q17: 1")):
    sns.kdeplot(age_sample, label=legend_label, ax=ax)

ax.set_xlabel("Age")
ax.set_ylabel("Density")
ax.legend()
ax.set_title("Age distribtions specific to Q17_Binary")

# Annotate the figure with the test result (positions are in data coordinates).
ks_statistic = round(kolmogorov_smirnov_results.statistic, 3)
ks_pvalue = round(kolmogorov_smirnov_results.pvalue, 3)
plt.text(60, 0.035, "Kolmogorov-Smirnov test: ", ha='center', va='bottom', weight = "bold")
plt.text(60, 0.03, f"Statistic: {ks_statistic}", ha='left', va='bottom')
plt.text(60, 0.026, f"P-Value: {ks_pvalue}", ha='left', va='bottom')
plt.show()


# Will drop age column


In [None]:
# Question analysis: Spearman rank correlation of every question with the target.
# NOTE(review): df contains the test rows as well; the ranking below is later
# used to select features - confirm this is acceptable.

stats_results = []

for value in training_features[:-1]:  # every feature except the trailing 'age'
    correlation, pval = spearmanr(df[[value, "Binary_Q17"]])
    if not pval < 0.05:
        continue  # keep only correlations significant at the 5% level
    stats_results.append([value, correlation])

# Strongest absolute correlation first.
stats_results.sort(key=lambda entry: abs(entry[1]), reverse=True)

NUMBER_OF_GRAPHS = 2
DIMENSIONS = 4,4

fig, ax = plt.subplots(
    1,
    NUMBER_OF_GRAPHS,
    figsize=(DIMENSIONS[0]*NUMBER_OF_GRAPHS, DIMENSIONS[1]),
)

# One box plot per top-ranked question, annotated with its correlation.
for i, (value, correlation) in enumerate(stats_results[:NUMBER_OF_GRAPHS]):
    sns.boxplot(x = "Binary_Q17", y = value, data = df, ax = ax[i])
    ax[i].text(0, 5.2, f"Spearman: {round(correlation,3)} ", ha='center', va='bottom', weight = "bold")

plt.show()


# Author's note: the top 5 questions all have p-value < 0.05 (the original note
# said "absolute correlation over 5", presumably 0.5 - TODO confirm).

In [None]:

# For the selected questions, gather the rows whose answer was originally
# missing (and was therefore replaced by the training-set mode).
# NOTE(review): the question list is hard-coded; presumably it mirrors the top 5
# of stats_results - verify they agree.
selected_columns_replaced_values = [
    row_label
    for question in "Q21,Q1,Q25,Q5,Q29".split(",")
    for row_label in per_column_missing_value[question]
]

# row label -> number of selected questions that were replaced in that row
INDEXS_COUNT = Counter(selected_columns_replaced_values)

# number of replaced answers -> number of rows with exactly that many
amount = Counter(INDEXS_COUNT.values())

# Rows with more than one replaced answer among the selected questions.
TOTAL = sum(n_rows for n_replaced, n_rows in amount.items() if n_replaced > 1)

print(INDEXS_COUNT)

display(f"{TOTAL} needs to be removed from the dataset since these were replaced and since so few values its unlikely itll hinder statistical significance of previous tests")


In [None]:
# Remove rows with more than 1 replaced value (to reduce uncertainty) from both
# the train and the test split.
to_remove = []

for key, n_replaced in INDEXS_COUNT.items():
    if n_replaced > 1:
        print(key)
        to_remove.append(key)

print(to_remove)
print("Above indexes to be removed since they contained more than 1 replaced values")

# A given row label lives in only one of the two splits; errors="ignore" skips
# labels that are absent from the object being filtered.
X_train = X_train.drop(index=to_remove, errors="ignore")
y_train = y_train.drop(index=to_remove, errors="ignore")
X_test = X_test.drop(index=to_remove, errors="ignore")
y_test = y_test.drop(index=to_remove, errors="ignore")


# per_column_missing_value




In [None]:
# Keep only the five questions with the strongest significant correlation.
top_feature_names = [entry[0] for entry in stats_results[:5]]
X_train = X_train[top_feature_names]
pd.concat([X_train, y_train], axis=1).head()  # not the cell's last expression, so nothing is shown

# Fit the min-max scaler on the training data only (the question range should
# be the same in the test data anyway).
Scalar = MinMaxScaler()
Scalar.fit(X_train)


In [None]:

# Scale the training features with the scaler fitted on the training data.
# Scalar.transform returns a plain array, so column names and the original row
# labels are restored explicitly; keeping the index means X_train still lines
# up with y_train for any pandas operation (e.g. pd.concat along axis=1).
X_train = pd.DataFrame(
    Scalar.transform(X_train),
    columns = X_train.columns,
    index = X_train.index,
)
X_train.head(20)

In [None]:


def apply_dm(func, alpha = 0.5, hue = y_train, **kwargs):
    """Fit a dimensionality-reduction estimator on the training features.

    Parameters
    ----------
    func : callable
        Estimator class (e.g. ``PCA``); it is instantiated as ``func(**kwargs)``.
    alpha, hue
        Only used by the scatter plot that is currently commented out; kept so
        existing calls keep working.
    **kwargs
        Forwarded to ``func`` when the estimator is created.

    Returns
    -------
    The estimator instance after ``fit_transform`` on the global ``X_train``.
    """
    arg_func = func(**kwargs)

    # The estimator is fitted on X_train only. An earlier extra fit on the
    # global df (which also holds the target column) was removed: it was
    # immediately overwritten by the fit_transform call below.
    varimax_components = arg_func.fit_transform(X_train)

    # sns.scatterplot( x = varimax_components[:,0],hue=hue,y = varimax_components[:,1],alpha=alpha)
    # plt.show()
    print("-----")

    return arg_func

# Fit a PCA on the training features and show the cumulative share of variance
# explained by the components.
analysis = apply_dm(PCA, alpha=0.8)
analysis.explained_variance_ratio_.cumsum()




# apply_dm(FactorAnalysis, alpha = alpha, rotation = "varimax")

In [None]:
# Model training: coarse SVM grid search over three kernels.

C_values = [0.001, 0.1, 10, 1000]

# One sub-grid per kernel so that kernel-specific parameters (gamma, degree)
# are only combined with the kernel that uses them.
tuned_parameters = [
    dict(kernel=["rbf"], gamma=[1e2, 1e1, 1e-0, 1e-1, 1e-2], C=C_values, class_weight=["balanced"]),
    dict(kernel=["linear"], C=C_values, class_weight=["balanced"]),
    dict(kernel=["poly"], degree=[2, 3, 4], C=C_values, class_weight=["balanced"]),
]

# An integer cv with a classifier uses stratified folds by default, so no
# splitter is given. refit=False: only the CV results are needed here.
grid_search_svm = GridSearchCV(
    estimator=SVC(),
    param_grid=tuned_parameters,
    scoring="roc_auc",
    cv=10,
    return_train_score=True,
    n_jobs=-1,
    refit=False,
    verbose=1,
)

grid_search_svm.fit(X_train, y_train)




In [None]:

# Visualise the coarse SVM search: train vs. validation AUC.
grid_search_svm_results = pd.DataFrame(grid_search_svm.cv_results_)
score_columns = ['mean_test_score', 'mean_train_score']

# 1) Distribution of the scores per kernel.
scores_by_kernel = (
    grid_search_svm_results
    .melt(id_vars=["param_kernel"], value_vars=score_columns)
    .rename(columns={"value": "AUC_Score"})
)
sns.boxplot(data=scores_by_kernel, x="param_kernel", y="AUC_Score", hue="variable")
plt.show()

# 2) Linear kernel only: score as a function of C (log-scaled x axis).
linear_rows = grid_search_svm_results[grid_search_svm_results.param_kernel == "linear"]
linear_scores_by_c = (
    linear_rows
    .melt(id_vars=["param_C"], value_vars=score_columns)
    .rename(columns={"value": "AUC_Score"})
)

grid = sns.lineplot(data=linear_scores_by_c, x="param_C", y="AUC_Score", hue="variable")
grid.set(xscale="log")
grid.plot()
plt.show()





In [None]:
# Converging on C: finer grid search restricted to the linear kernel.

C_values = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]

tuned_parameters = [
    dict(kernel=["linear"], C=C_values, class_weight=["balanced"]),
]

# An integer cv with a classifier uses stratified folds by default.
# refit=False: best_params_ / best_score_ are read afterwards, no refit needed.
grid_search_svm = GridSearchCV(
    estimator=SVC(),
    param_grid=tuned_parameters,
    scoring="roc_auc",
    cv=10,
    return_train_score=True,
    n_jobs=-1,
    refit=False,
    verbose=1,
)

grid_search_svm.fit(X_train, y_train)




In [None]:
# Best linear-SVM parameter set and its mean cross-validated AUC.
display(grid_search_svm.best_params_, grid_search_svm.best_score_)

In [None]:

# Mean cross-validated AUC of the best SVM as a function of the max_iter cap.
x = list(range(100, 2000, 20))  # iteration caps that are tried
y = []                          # mean 10-fold AUC for each cap

for iterations in x:
    clf = SVC(**grid_search_svm.best_params_, max_iter=iterations)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=10, scoring="roc_auc")
    y.append(fold_scores.mean())



In [None]:
# Plot mean AUC against the iteration cap. x and y are passed by keyword:
# recent seaborn releases accept only `data` positionally, so
# sns.lineplot(x, y) raises a TypeError there.
fig = sns.lineplot(x=x, y=y)
fig.set_ylim(0.5,0.89)
fig.set(xlabel='iterations', ylabel='score')


--

In [None]:
# Random forest: coarse grid search.

grid_values = {"max_depth": list(range(1,30,6)),
                "n_estimators": [1,10,100,200,500],
                # min_samples_split must be an int >= 2 (or a float fraction);
                # the original range(1,30,5) began at the invalid value 1, so
                # every fit with it failed. Only that first grid point is
                # changed (1 -> 2); the other points are unchanged.
                'min_samples_split': [2,6,11,16,21,26],
              "min_samples_leaf": range(1,30,5),
              "bootstrap": [True],
              "class_weight": ["balanced"],
              "criterion": ["gini","entropy"]
              }


# random_state=42 keeps the forests reproducible across runs.
grid_search_rfc = GridSearchCV(RandomForestClassifier(random_state = 42), param_grid = grid_values, scoring = 'roc_auc',cv = 10, return_train_score=True, n_jobs = -1, verbose = 10)

grid_search_rfc.fit(X_train, y_train)


# [5,10,15,20,25,35]
# [4,5,6,7,8]



In [None]:
# Visualise the coarse random-forest search: one train-vs-validation AUC line
# plot per tuned hyper-parameter.
grid_search_rfc_results = pd.DataFrame(grid_search_rfc.cv_results_)

# Each parameter is listed once ("param_max_depth" used to appear twice, which
# drew the same figure a second time).
plotted_parameters = [
    "param_max_depth",
    "param_n_estimators",
    "param_min_samples_split",
    "param_min_samples_leaf",
]

for variable in plotted_parameters:

    temp = pd.melt(grid_search_rfc_results, id_vars=[variable], value_vars=['mean_test_score', 'mean_train_score']).rename(columns = {"value": "AUC_Score"})
    sns.lineplot(data = temp, x = variable, y = "AUC_Score", hue = "variable")
    plt.show()



In [None]:
# Random forest: refined grid search around the promising region.

grid_values = dict(
    max_depth=list(range(1, 10, 1)),
    n_estimators=[100, 200],
    min_samples_split=[10, 20],
    min_samples_leaf=[10, 20],
    bootstrap=[True],
    class_weight=["balanced"],
    criterion=["gini"],
)

# random_state=42 keeps the forests reproducible across runs.
grid_search_rfc = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=grid_values,
    scoring='roc_auc',
    cv=10,
    return_train_score=True,
    n_jobs=-1,
    verbose=10,
)

grid_search_rfc.fit(X_train, y_train)


# [5,10,15,20,25,35]
# [4,5,6,7,8]



In [None]:
# Best random-forest parameter set and its mean cross-validated AUC.
display(grid_search_rfc.best_params_, grid_search_rfc.best_score_)

In [None]:
# KNN: coarse grid search over neighbourhood size, Minkowski power and weighting.
knn = KNeighborsClassifier()

n_neighbors = list(range(1, 500, 50))
p = [1, 2]
weights = ["uniform", "distance"]

param_grid = {"n_neighbors": n_neighbors, "p": p, "weights": weights}

grid_k = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=10,
    scoring='roc_auc',
    return_train_score=True,
    verbose=10,
    n_jobs=-1,
)

grid_k.fit(X_train, y_train)



In [None]:
# Visualise the coarse KNN search: train vs. validation AUC.
grid_knn_results = pd.DataFrame(grid_k.cv_results_)
score_columns = ['mean_test_score', 'mean_train_score']

# 1) Score distribution per weighting scheme.
temp = grid_knn_results.melt(id_vars=["param_weights"], value_vars=score_columns)
sns.boxplot(data=temp, x="param_weights", y="value", hue="variable")
plt.show()

# 2) Uniform weighting only: score as a function of the number of neighbours.
uniform_only = grid_knn_results.param_weights == "uniform"
grid_knn_results = grid_knn_results[uniform_only]
temp = grid_knn_results.melt(id_vars=["param_n_neighbors"], value_vars=score_columns)

sns.lineplot(data=temp, x="param_n_neighbors", y="value", hue="variable")

In [None]:


# Disabled exploration (kept for reference): AUC vs. n_neighbors for
# uniform weights with p == 1.
# temp = grid_knn_results[grid_knn_results.param_weights == "uniform"]
# temp = temp[temp.param_p == 1]


# temp = pd.melt(temp, id_vars=["param_n_neighbors"], value_vars=['mean_test_score', 'mean_train_score']).rename(columns = {"value": "AUC_Score"})



# sns.lineplot(data = temp, x = "param_n_neighbors", y = "AUC_Score", hue = "variable")

# display(grid_k.best_params_)
# Best mean cross-validated AUC found by the coarse KNN search.
display(grid_k.best_score_)

In [None]:
# KNN: fine search over every neighbourhood size, with p=1 and uniform weights.
knn = KNeighborsClassifier()

n_neighbors = list(range(1, 500, 1))
p = [1]
weights = ["uniform"]

param_grid = {"n_neighbors": n_neighbors, "p": p, "weights": weights}

grid_k = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=10,
    scoring='roc_auc',
    return_train_score=True,
    verbose=10,
    n_jobs=-1,
)

grid_k.fit(X_train, y_train)



In [None]:
# Best KNN parameter set and its mean cross-validated AUC.
display(grid_k.best_params_, grid_k.best_score_)

In [None]:
# Scale the test features with the scaler fitted on the training data, using
# the same columns (and order) as X_train. The original row labels are kept so
# the frame stays aligned with y_test for any pandas operation.
Scaled_X_test = pd.DataFrame(
    Scalar.transform(X_test[X_train.columns]),
    columns = X_train.columns,
    index = X_test.index,
)


In [None]:

# Final models: refit each classifier with the best parameters found by its
# grid search, predict on the scaled test set and compare ROC curves.

# Parameter sets recorded from an earlier run (kept for reference):
# knn: {'n_neighbors': 89, 'p': 1, 'weights': 'uniform'}
# svm: {'C': 50, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'rbf'}
# rfc: {'bootstrap': True, 'class_weight': "balanced", 'criterion': 'gini',
#       'max_depth': 3, 'min_samples_leaf': 19, 'min_samples_split': 52,
#       'n_estimators': 200}
knn = KNeighborsClassifier(**grid_k.best_params_)
svm = SVC(**grid_search_svm.best_params_)
# random_state=42 matches the estimator used inside the grid search, so the
# final forest is reproducible like the rest of the notebook.
rfc = RandomForestClassifier(random_state = 42, **grid_search_rfc.best_params_)

rfc.fit(X_train, y_train)
svm.fit(X_train, y_train)
knn.fit(X_train, y_train)


knn_pred = knn.predict(Scaled_X_test)
svm_pred = svm.predict(Scaled_X_test)
rfc_pred = rfc.predict(Scaled_X_test)


# All three ROC curves share one axes.
ax = plt.gca()
svm_disp = RocCurveDisplay.from_estimator(svm, Scaled_X_test, y_test, ax=ax, alpha=0.8, color = "blue")
rfc_disp = RocCurveDisplay.from_estimator(rfc, Scaled_X_test, y_test, ax=ax, alpha=0.8, color = "orange")
knn_disp = RocCurveDisplay.from_estimator(knn, Scaled_X_test, y_test, ax=ax, alpha=0.8, color = "green")

plt.show()


In [None]:






def _report_table(y_true, y_pred):
    """Return precision/recall/f1 for class 0, class 1 and the macro average, rounded to 3 d.p."""
    report = classification_report(y_true, y_pred, output_dict=True)
    table = pd.DataFrame(report).transpose().round(3)
    return table.loc[["0", "1", "macro avg"], ["precision", "recall", "f1-score"]]


# One summary table per model, computed on the held-out test set.
KNN_CHART = _report_table(y_test, knn_pred)
SVM_CHART = _report_table(y_test, svm_pred)
RFC_CHART = _report_table(y_test, rfc_pred)

for caption, chart in (("KNN", KNN_CHART), ("SVM", SVM_CHART), ("RFC", RFC_CHART)):
    display(chart.style.set_caption(caption))
# display(confusion_matrix(y_test, rfc_pred))








In [None]:
# Precision-recall curves of the three final models on the test set.
# precision_recall_curve is already imported in the notebook's import cell.
fig, ax = plt.subplots()

# --- KNN -------------------------------------------------------------------
precision, recall, thresholds = precision_recall_curve(y_test, knn.predict_proba(Scaled_X_test)[:,1])
ax.plot(recall, precision, label = 'KNN', color = "blue")

# F1 = 2PR / (P + R) along the KNN curve. Points where P + R == 0 would be
# 0/0; they are left as NaN and skipped by nanargmax rather than corrupting
# the maximum.
pr_sum = precision + recall
out_arr = np.divide(
    2 * precision * recall,
    pr_sum,
    out = np.full(pr_sum.shape, np.nan, dtype = float),
    where = pr_sum > 0,
)
best_f1_position = np.nanargmax(out_arr)  # first position of the maximum F1

# Precision and recall of the KNN operating point with the highest F1.
print(precision[best_f1_position])
print(recall[best_f1_position])

# --- SVM (decision_function scores, no probabilities needed) ----------------
precision, recall, thresholds = precision_recall_curve(y_test, svm.decision_function(Scaled_X_test))
ax.plot(recall, precision, label = 'SVM', color = "red")

# --- Random forest -----------------------------------------------------------
precision, recall, thresholds = precision_recall_curve(y_test, rfc.predict_proba(Scaled_X_test)[:,1])
ax.plot(recall, precision, label = 'RFC', color = "green")

# Each curve carries its own model label, and the legend makes them readable.
ax.set(xlabel = 'recall', ylabel = 'precision', title = 'Precision-recall curves (test set)')
ax.legend()
plt.show()



# , plt.xlabel('recall'), plt.ylabel('precision'), plt.title('Knn(n_neighbors = 8) PRC curve'), plt.show()




In [None]:
# Rebuild the KNN model with the best parameters, fit it on the training data
# (fit returns the estimator itself) and predict the scaled test set.
knn = KNeighborsClassifier(**grid_k.best_params_).fit(X_train, y_train)
knn_pred = knn.predict(Scaled_X_test)
