<a href="https://colab.research.google.com/github/ginareigns/svm_model_cancer/blob/main/svmfo_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sklearn

from sklearn import datasets
# the sklearn module has inbuilt datasets so you can use them after importing the library


from sklearn import svm
# imports the SVM algorithm from the scikit-learn library in Python.
#  Once the SVM algorithm is imported, you can create an SVM model object and use it for classification or regression tasks, depending on the problem you are trying to solve.


from sklearn import metrics
# imports the module "metrics" from the scikit-learn library in Python.
# The "metrics" module contains various functions for evaluating the performance of machine learning models, such as classification accuracy, precision, recall, F1-score, confusion matrix, ROC curve, AUC (Area Under the Curve).

from sklearn import model_selection
# explicitly imports the "model_selection" submodule, which provides train_test_split for splitting the data into training and test sets.
# `import sklearn` on its own does not guarantee that submodules are loaded, so it is imported here instead of relying on a side effect of the other imports.

from sklearn.neighbors import KNeighborsClassifier
#  imports the K-Nearest Neighbors (KNN) classifier algorithm from the scikit-learn library in Python.
# we going to use k-nearest to compare which works best among the two models

In [2]:
# Load scikit-learn's built-in breast cancer dataset.
# The returned object exposes .data and .target, which the following cells read.
cancer = datasets.load_breast_cancer()


In [4]:
# Feature names: measurements of cell nuclei present in the digitized image of a
# breast mass, used to predict whether the mass is benign or malignant.
# Uncomment the next line to list them.
# print(cancer.feature_names)

# Class names in label order: target value 0 is 'malignant' and 1 is 'benign'.
print(cancer.target_names)


['malignant' 'benign']


In [8]:
# Pull the model inputs and labels out of the loaded dataset in one step:
#   x - the feature matrix (cancer.data): the values of all features for each instance
#   y - the target vector (cancer.target): the class label of each instance
x, y = cancer.data, cancer.target



In [10]:
# Randomly hold out 20% of the samples as a test set; the remaining 80% form the training set.
# The training set is used to fit the model, the test set to evaluate its performance.
# random_state pins the shuffle so that "Restart Kernel -> Run All" produces the same
# split (and therefore the same accuracy) on every run; without it the score varies per run.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [12]:
# Create a Support Vector Machine classifier (SVC) with a linear kernel.
# The `kernel` parameter selects the kernel function the classifier uses.
# The model is only constructed here; training happens when fit() is called on it.
clf = svm.SVC(kernel="linear")

In [13]:
# Train the SVM classifier on the training split (features x_train, labels y_train).
clf.fit(x_train, y_train)

In [14]:
# Use the trained classifier to predict a class label for every sample in the
# test features x_test; the predictions are stored in y_pred.
y_pred = clf.predict(x_test)

In [15]:
# Compute the classifier's accuracy on the test data by comparing the true
# labels (y_test) with the predicted labels (y_pred).
acc = metrics.accuracy_score(y_test, y_pred)

In [16]:
# Print the SVM classifier's accuracy on the test data.
print(acc)

0.9649122807017544
