<a href="https://colab.research.google.com/github/chandhu555/MachineLearning_Algorithms_Chandhu/blob/main/endsem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [4]:
# Load the dataset from a CSV file located in the current working directory.
data = pd.read_csv("Lung_Cancer_dataset.csv")

# Preview the first five rows.
print("Dataset Preview:")
print(data.head())

# Show column names, non-null counts, and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
data.info()

# Count the missing values in each column.
print("\nMissing values:")
print(data.isnull().sum())


Dataset Preview:
     Name      Surname  Age  Smokes  AreaQ  Alkhol  Result
0    John         Wick   35       3      5       4       1
1    John  Constantine   27      20      2       5       1
2  Camela     Anderson   30       0      5       2       0
3    Alex       Telles   28       0      8       1       0
4   Diego     Maradona   68       4      5       6       1

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     59 non-null     object
 1   Surname  59 non-null     object
 2   Age      59 non-null     int64 
 3   Smokes   59 non-null     int64 
 4   AreaQ    59 non-null     int64 
 5   Alkhol   59 non-null     int64 
 6   Result   59 non-null     int64 
dtypes: int64(5), object(2)
memory usage: 3.4+ KB
None

Missing values:
Name       0
Surname    0
Age        0
Smokes     0
AreaQ      0
Alkhol     0
Result     0
dtype: int64


In [9]:
# Replace missing values in numeric columns with that column's mean.
# mean(numeric_only=True) only yields means for numeric columns, so
# non-numeric columns are left as they are.
data = data.fillna(data.mean(numeric_only=True))

# Separate features and target. "Result" is the target; "Name" and "Surname"
# are dropped from the feature matrix together with it.
X = data.drop(columns=["Result", "Name", "Surname"])
y = data["Result"]

# Encode the target as integers if it is stored as strings.
if y.dtype == "object":
    le = LabelEncoder()
    y = le.fit_transform(y)

# Train-test split (80-20), stratified on the target so both splits keep the
# same class balance. The split happens BEFORE scaling: fitting the scaler on
# the full dataset would let test-set statistics leak into the training
# features and make the evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply the same transform to
# the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; kept so that
# later cells referring to X_scaled continue to work. Re-splitting it with the
# same test_size / random_state / stratify arguments selects the same rows as
# X_train / X_test above.
X_scaled = scaler.transform(X)

In [12]:
# Fit the baseline decision tree on the training split (seed fixed at 42)
# and predict labels for the held-out split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Report every metric from one table of (label, scoring function) pairs so the
# labels and the functions cannot drift apart; each function is called as
# fn(y_true, y_pred).
print("=== Baseline Decision Tree Performance ===")
baseline_reports = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
)
for report_label, report_fn in baseline_reports:
    print(report_label, report_fn(y_test, y_pred))



=== Baseline Decision Tree Performance ===
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0

Confusion Matrix:
 [[6 0]
 [0 6]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         6

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12



In [14]:
# Split the scaled features into training and testing sets first. PCA is a
# fitted transform, so fitting it on all rows before splitting would let the
# test rows influence the components (data leakage).
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Reduce dimensionality with PCA, keeping as many components as are needed to
# explain 95% of the variance. The components are learned from the training
# rows only; the test rows are projected with the already-fitted transform.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Projection of the full scaled matrix, kept so that X_pca stays available to
# any later cell that refers to it.
X_pca = pca.transform(X_scaled)

# Train a decision tree on the PCA-reduced training data.
dt_pca = DecisionTreeClassifier(random_state=42)
dt_pca.fit(X_train_pca, y_train)

# Predict labels for the PCA-reduced test data.
y_pred_pca = dt_pca.predict(X_test_pca)

# Evaluation
print("=== Decision Tree with PCA Performance ===")
print("Accuracy:", accuracy_score(y_test, y_pred_pca))
print("Precision:", precision_score(y_test, y_pred_pca))
print("Recall:", recall_score(y_test, y_pred_pca))
print("F1-Score:", f1_score(y_test, y_pred_pca))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_pca))
print("\nClassification Report:\n", classification_report(y_test, y_pred_pca))



=== Decision Tree with PCA Performance ===
Accuracy: 0.9166666666666666
Precision: 0.8571428571428571
Recall: 1.0
F1-Score: 0.9230769230769231

Confusion Matrix:
 [[5 1]
 [0 6]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.83      0.91         6
           1       0.86      1.00      0.92         6

    accuracy                           0.92        12
   macro avg       0.93      0.92      0.92        12
weighted avg       0.93      0.92      0.92        12

