In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

# Import own libraries with custom technical routines and own implementation of logistic regression
from custom_routines import custom_routines as cr
from logisticReg import logistic_regression

# Import Naive Bayes classifiers from sci-kit learn
from sklearn.naive_bayes import MultinomialNB, GaussianNB

## 1. Clasificación de spam

In [2]:
spam_data = pd.read_csv("~/Desktop/mcc20192/mlearning/tareas/tarea3/data/spam.csv", header=None, delimiter=" ").values
print(len(spam_data))
#Partición aleatoria del conjunto de datos, 70% para entrenamiento y 30% para validación
train_spam_data, test_spam_data = train_test_split(spam_data, test_size=0.30, random_state=0)
print(len(train_spam_data),len(test_spam_data))

5172
3620 1552


In [3]:
multinomial_classifier = MultinomialNB()

multinomial_classifier.fit(train_spam_data[0:,:-1], train_spam_data[0:,-1])
print(multinomial_classifier.score(train_spam_data[0:,:-1], train_spam_data[0:,-1]))
print(multinomial_classifier.score(test_spam_data[0:,:-1], test_spam_data[0:,-1]))
m = len(test_spam_data)
mpred = multinomial_classifier.predict(test_spam_data[0:,:-1])
fpmulti = dict()
tpmulti = dict()
areaucmulti= dict()

for i in range(0,m):
    fpmulti[i], tpmulti[i], _ = roc_curve(test_spam_data[0:,-1], mpred)
    areaucmulti[i] = auc(fpmulti[i], tpmulti[i])
    
# Compute micro-average ROC curve and ROC area
fpmulti["micro"], tpmulti["micro"], _ = roc_curve(test_spam_data[0:,-1].ravel(), mpred.ravel())#  y_test.ravel(), y_score.ravel())
areaucmulti["micro"] = auc(fpmulti["micro"], tpmulti["micro"])

print("Área bajo la curva ROC =", areauc[2])

0.9541436464088398
0.9478092783505154


NameError: name 'areauc' is not defined

In [4]:
# Train logistic regression 
logit_reg1 = logistic_regression()
logit_reg1.fit(train_spam_data[0:,:-1],train_spam_data[0:,-1],0)

KeyboardInterrupt: 

In [None]:
print(logit_reg1.score(train_spam_data[0:,:-1],train_spam_data[0:,-1]))
print(logit_reg1.score(test_spam_data[0:,:-1],test_spam_data[0:,-1]))

In [None]:
# Compute ROC curve and area under 
logit_reg1_predicted = logit_reg1.predict(test_spam_data[0:,:-1])

falsepositive = dict()
truepositive = dict()
areauc= dict()

for i in range(0,m):
    falsepositive[i], truepositive[i], _ = roc_curve(test_spam_data[0:,-1], logit_reg1_predicted)
    areauc[i] = auc(falsepositive[i], truepositive[i])
    
# Compute micro-average ROC curve and ROC area
falsepositive["micro"], truepositive["micro"], _ = roc_curve(test_spam_data[0:,-1].ravel(), logit_reg1_predicted.ravel())#  y_test.ravel(), y_score.ravel())
areauc["micro"] = auc(falsepositive["micro"], truepositive["micro"])

print("Área bajo la curva ROC =", areauc[2])

In [None]:
plt.figure(figsize=(12,6))
plt.title('Curva ROC para logit_reg1')
plt.plot(falsepositive[2], truepositive[2], color='darkorange', label='Logistic ROC')
plt.plot(fpmulti[2], tpmulti[2], color='navy', label='MultinomialNB ROC')
plt.plot(0,0,'ko', label = 'Área Regresión Logit ~ %0.2f' % areauc[2])
plt.plot(0,0,'bo', label = 'Área MultinomialNB ~ %0.2f' % areaucmulti[2])
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Proporción de Falsos Positivos')
plt.ylabel('Proporción de Verdaderos Positivos')

plt.legend(loc = 'lower right')
plt.show()

## 2. Clasificación de tumores de seno

In [None]:
cancer_data_raw = pd.read_csv("~/Desktop/mcc20192/mlearning/tareas/tarea3/data/breast-cancer-wisconsin.data", header=None).values[0:,1:]
print(cancer_data_raw)

In [None]:
#Sustituir valores desconocidos por 0 y cambiar las etiquetas por 0 y 1
n = len(cancer_data_raw)
l = len(cancer_data_raw[0])
cancer_data = np.ndarray(shape = (n,l))
type(cancer_data[0][0])

for i in range(0, n):
    for j in range(0, l-1):
        if cancer_data_raw[i][j] == '?':
            cancer_data[i][j] = 0
        else:
            cancer_data[i][j] = int(cancer_data_raw[i][j])
    cancer_data[i][-1] = int(cancer_data_raw[i][-1]/2 -1)

print(cancer_data)

#Partición aleatoria del conjunto de datos, 70% para entrenamiento y 30% para validación
train_cancer_data, test_cancer_data = train_test_split(cancer_data, test_size = 0.30, random_state=3)
print(len(train_cancer_data),len(test_cancer_data))

In [None]:
gaussian_classifier = GaussianNB()

# Entrenamos el clasificador
print(gaussian_classifier.fit(train_cancer_data[0:,:-1], train_cancer_data[0:,-1]))

In [None]:
gaussian_classifier.score(test_cancer_data[0:,:-1],test_cancer_data[0:,-1])
gpred = gaussian_classifier.predict(test_cancer_data[0:,:-1])
gpred

fpgauss = dict()
tpgauss = dict()
areaucgauss = dict()
for i in range(0,len(test_cancer_data)):
    fpgauss[i], tpgauss[i], _ = roc_curve(test_cancer_data[0:,-1], gpred)
    areaucgauss[i] = auc(fpgauss[i], tpgauss[i])

# Compute micro-average ROC curve and ROC area
fpgauss["micro"], tpgauss["micro"], _ = roc_curve(test_cancer_data[0:,-1].ravel(), gpred.ravel())#  y_test.ravel(), y_score.ravel())
areaucgauss["micro"] = auc(fpgauss["micro"], tpgauss["micro"])

print("Área bajo la curva ROC =", areaucgauss[2])

In [None]:
# Train logistic regression 
logit_reg2 = logistic_regression()
logit_reg2.fit(train_cancer_data[0:,:-1], train_cancer_data[0:,-1],0)

In [None]:
print(logit_reg2.score(train_cancer_data[0:,:-1], train_cancer_data[0:,-1]))
print(logit_reg2.score(test_cancer_data[0:,:-1], test_cancer_data[0:,-1]))

In [None]:
# Compute ROC curve and area under 
logit_reg2_predicted = logit_reg2.predict(test_cancer_data[0:,:-1])

falsepositive2 = dict()
truepositive2 = dict()
areauc2 = dict()
for i in range(0,len(test_cancer_data)):
    falsepositive2[i], truepositive2[i], _ = roc_curve(test_cancer_data[0:,-1], logit_reg2_predicted)
    areauc2[i] = auc(falsepositive2[i], truepositive2[i])

# Compute micro-average ROC curve and ROC area
falsepositive2["micro"], truepositive2["micro"], _ = roc_curve(test_cancer_data[0:,-1].ravel(), logit_reg2_predicted.ravel())#  y_test.ravel(), y_score.ravel())
areauc2["micro"] = auc(falsepositive2["micro"], truepositive2["micro"])

print("Área bajo la curva ROC =", areauc2[2])

In [None]:
plt.figure(figsize=(12,6))
plt.title('Curvas ROC')
plt.plot(falsepositive2[2], truepositive2[2], color='darkorange', label='Logistic ROC')
plt.plot(fpgauss[2], tpgauss[2], color='navy', label='GaussianNB ROC')
plt.plot(0,0,'ko', label = 'Área Regresión Logit ~ %0.2f' % areauc2[2])
plt.plot(0,0,'ko', label = 'Área GaussianNB ~ %0.2f' % areaucgauss[2])
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Proporción de Falsos Positivos')
plt.ylabel('Proporción de Veraderos Positivos')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
#for i in range(0,10):
#    train_data, test_data= train_test_split(data_enc, test_size=0.30, random_state=3*i)
#    clf = LogisticRegression(C = 100, random_state=0, fit_intercept = False, solver='lbfgs', multi_class='ovr').fit(train_data[0:,:-1], train_data[0:,-1])
#    print(clf.score(test_data[0:,:-1], test_data[0:,-1]))#data_enc[0:,:-1], data_enc[0:,-1])
clf = LogisticRegression(C = 100, random_state=0, fit_intercept = False, solver='lbfgs', multi_class='ovr').fit(train_cancer_data[0:,:-1], train_cancer_data[0:,-1])
pred = clf.predict(test_cancer_data[0:,:-1])

falsepositive3 = dict()
truepositive3 = dict()
areauc3 = dict()
for i in range(0,len(test_cancer_data)):
    falsepositive3[i], truepositive3[i], _ = roc_curve(test_cancer_data[0:,-1], pred)
    areauc3[i] = auc(falsepositive3[i], truepositive3[i])

# Compute micro-average ROC curve and ROC area
falsepositive3["micro"], truepositive3["micro"], _ = roc_curve(test_cancer_data[0:,-1].ravel(), pred.ravel())#  y_test.ravel(), y_score.ravel())
areauc2["micro"] = auc(falsepositive3["micro"], truepositive3["micro"])

print("Área bajo la curva ROC =", areauc3[2])


In [None]:
plt.title('Curva ROC para sklearn-clf')
plt.plot(falsepositive3[2], truepositive3[2], color='darkorange', label='Curva ROC')
plt.plot(0,0,'ko', label = 'Área bajo la curva ~ %0.2f' % areauc3[2])
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc = 'lower right')
plt.show()