QUESTION 1 -----

In [1]:
#HW2 Python 
#Efe Comu

import pandas as pd
import urllib.request
from io import StringIO

# Load data directly from the URL
link = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
f = urllib.request.urlopen(link)
myfile = f.read().decode('utf-8')

# Define column names
column_names = ['ID', 'Diagnosis'] + [
    f"{feature}_{stat}" for feature in [
        'radius', 'texture', 'perimeter', 'area', 'smoothness',
        'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dimension'
    ] for stat in ['mean', 'se', 'worst']
]

# Load the data into a pandas DataFrame
data = pd.read_csv(StringIO(myfile), header=None, names=column_names)

# Drop the ID column
data = data.drop(columns=['ID'])

# Summary statistics
print("Summary Statistics:")
print(data.describe())

# Convert the Diagnosis column to numeric (M -> 0, B -> 1)
data['Diagnosis'] = data['Diagnosis'].map({'M': 0, 'B': 1})

# Correlation matrix
print("\nCorrelation Matrix:")
print(data.corr())


Summary Statistics:
       radius_mean   radius_se  radius_worst  texture_mean  texture_se  \
count   569.000000  569.000000    569.000000    569.000000  569.000000   
mean     14.127292   19.289649     91.969033    654.889104    0.096360   
std       3.524049    4.301036     24.298981    351.914129    0.014064   
min       6.981000    9.710000     43.790000    143.500000    0.052630   
25%      11.700000   16.170000     75.170000    420.300000    0.086370   
50%      13.370000   18.840000     86.240000    551.100000    0.095870   
75%      15.780000   21.800000    104.100000    782.700000    0.105300   
max      28.110000   39.280000    188.500000   2501.000000    0.163400   

       texture_worst  perimeter_mean  perimeter_se  perimeter_worst  \
count     569.000000      569.000000    569.000000       569.000000   
mean        0.104341        0.088799      0.048919         0.181162   
std         0.052813        0.079720      0.038803         0.027414   
min         0.019380        0

QUESTION 2

In [5]:
# Import necessary libraries for modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Split the data into training and testing sets
X = data.drop('Diagnosis', axis=1)
y = data['Diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print('Label counts in y:', y.value_counts())# how many samples belong to each class before split
print('Label counts in y_train:',y_train.value_counts()) # occurnace of each classes in training set of target variable
print('Label counts in y_test:',y_test.value_counts()) # occurnace of each classes in test set of target variable

# Standardize the features (important for k-NN and Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# k-NN Model (k=3)
knn = KNeighborsClassifier(n_neighbors=3, p=2, metric='minkowski', n_jobs=-1, weights='distance')
knn.fit(X_train_scaled, y_train)

# Logistic Regression Model
logreg = LogisticRegression(multi_class='auto',
                                      C=1, # strong regulation 
                                      solver = 'lbfgs',
                                      max_iter=100)
logreg.fit(X_train, y_train)

# Coefficients of Logistic Regression
print("Logistic Regression Coefficients:", logreg.coef_)
print("Logistic Regression Intercept:", logreg.intercept_)


Label counts in y: Diagnosis
1    357
0    212
Name: count, dtype: int64
Label counts in y_train: Diagnosis
1    250
0    148
Name: count, dtype: int64
Label counts in y_test: Diagnosis
1    107
0     64
Name: count, dtype: int64
Logistic Regression Coefficients: [[ 1.79591926  0.03201459 -0.09401678  0.0105638  -0.07711261 -0.35435553
  -0.49069124 -0.1994849  -0.12908718 -0.02980321  0.07435899  0.74937425
   0.57087952 -0.08394849 -0.01133937 -0.0946256  -0.13000895 -0.02945253
  -0.04265046 -0.00953842  1.68208154 -0.2199121  -0.15592556 -0.03376549
  -0.13910817 -1.12239844 -1.37903499 -0.38703359 -0.40072344 -0.11216514]]
Logistic Regression Intercept: [0.31683073]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


QUESTION 3

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predictions for k-NN
y_pred_knn = knn.predict(X_test_scaled)

# Predictions for Logistic Regression
y_pred_logreg = logreg.predict(X_test)

# Confusion Matrix for k-NN
print("k-NN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

# Confusion Matrix for Logistic Regression
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))

# Accuracy, Precision, Recall, and F1-Score for k-NN
print("k-NN Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_knn):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_knn):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_knn):.4f}")

# Accuracy, Precision, Recall, and F1-Score for Logistic Regression
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_logreg):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_logreg):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_logreg):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_logreg):.4f}")

# Classification Report for both models
print("k-NN Classification Report:\n", classification_report(y_test, y_pred_knn, target_names=['Malignant', 'Benign']))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg, target_names=['Malignant', 'Benign']))


k-NN Confusion Matrix:
 [[ 56   8]
 [  0 107]]
Logistic Regression Confusion Matrix:
 [[ 56   8]
 [  2 105]]
k-NN Performance:
Accuracy: 0.9532
Precision: 0.9304
Recall: 1.0000
F1-Score: 0.9640
Logistic Regression Performance:
Accuracy: 0.9415
Precision: 0.9292
Recall: 0.9813
F1-Score: 0.9545
k-NN Classification Report:
               precision    recall  f1-score   support

   Malignant       1.00      0.88      0.93        64
      Benign       0.93      1.00      0.96       107

    accuracy                           0.95       171
   macro avg       0.97      0.94      0.95       171
weighted avg       0.96      0.95      0.95       171

Logistic Regression Classification Report:
               precision    recall  f1-score   support

   Malignant       0.97      0.88      0.92        64
      Benign       0.93      0.98      0.95       107

    accuracy                           0.94       171
   macro avg       0.95      0.93      0.94       171
weighted avg       0.94      0.94 

QUESTION 4
In the context of diagnosing breast cancer, Recall is often the most important metric to consider. Here's why:

Recall measures the ability to correctly identify all actual positive cases (malignant cancer). Missing malignant cases (false negatives) can have serious consequences, so it's important to minimize this.
F1-Score can also be used, as it balances both precision and recall, but in this case, Recall is more crucial as we prioritize catching malignant cases over false positives.

CROSS VALIDATION

In [11]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(logreg,                
                       X,  # whole dataset
                       y,          
                       cv=10)     #  10 fold 

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) #confidence interval (scores.std() * 2).
print(scores)

# F-1 Scores 

scores_f1=cross_val_score(logreg,           
                          X,  # whole dataset
                          y,          
                          cv=10,                
                          scoring='f1_macro') 

print("F1-score: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2)) 
print(scores_f1) 

scores = cross_val_score(logreg,              
                         X,           # whole dataset
                         y,         
                         cv=10,               
                         scoring='f1_macro') 

print(scores)
print("F1 score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy: 0.95 (+/- 0.03)
[0.92982456 0.92982456 0.92982456 0.92982456 0.94736842 0.96491228
 0.92982456 0.96491228 0.96491228 0.96428571]
F1-score: 0.94 (+/- 0.04)
[0.92460317 0.92297297 0.92297297 0.92460317 0.94286669 0.96230159
 0.92460317 0.96230159 0.96298701 0.96190476]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[0.92460317 0.92297297 0.92297297 0.92460317 0.94286669 0.96230159
 0.92460317 0.96230159 0.96298701 0.96190476]
F1 score: 0.94 (+/- 0.04)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt