<a href="https://colab.research.google.com/github/dreamingv-oid/CS290/blob/main/Penguins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
#pip install palmerpenguins
from palmerpenguins import load_penguins
import pandas as pd

# Fetch the Palmer Penguins data via the palmerpenguins package
penguins = load_penguins()

# Wrap the result in a pandas DataFrame and preview the first five rows
df = pd.DataFrame(penguins)
first_rows = df.head(n=5)
print(first_rows)



  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  year  
0       3750.0    male  2007  
1       3800.0  female  2007  
2       3250.0  female  2007  
3          NaN     NaN  2007  
4       3450.0  female  2007  


In [15]:
import numpy as np
import pandas as pd
from palmerpenguins import load_penguins
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Fetch the penguins data and wrap it in a DataFrame
penguins = load_penguins()
df = pd.DataFrame(penguins)

# Discard every row that has at least one missing value
df.dropna(inplace=True)

# The species column is the target; all remaining columns are features
y = df["species"]
X = df.drop(columns=["species"])

# One-hot encode the categorical feature columns (island and sex)
X_encoded = pd.get_dummies(X, columns=["island", "sex"])

# Hold out 30% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to calculate Euclidean distance
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    The inputs are subtracted element-wise, so they must support ``-`` with
    each other (e.g. NumPy arrays of the same shape).
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# k-NN classifier implementation
class KNN:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only memorises the training data. ``predict`` labels each query
    row with the most common label among its ``k`` closest training rows.
    Features and labels may be NumPy arrays, lists, or pandas objects; they
    are converted to NumPy arrays internally, so a pandas index never affects
    which label is looked up.

    Parameters
    ----------
    k : int, default=3
        Number of neighbours that take part in the vote. Must be at least 1.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        # k == 0 would otherwise only fail later, inside the vote, with an
        # unhelpful IndexError.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    @staticmethod
    def _as_matrix(X):
        """Convert ``X`` to a float array; a 1-D input becomes one column."""
        arr = np.asarray(X, dtype=float)
        return arr.reshape(-1, 1) if arr.ndim == 1 else arr

    def fit(self, X, y):
        """Memorise the training samples and their labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training labels, in the same row order as ``X``.

        Returns
        -------
        KNN
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        X = self._as_matrix(X)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("Cannot fit KNN on an empty training set")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X``.
        """
        # Converting first means a DataFrame is iterated row by row rather
        # than by column label.
        X = self._as_matrix(X)
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        """Return the majority label among the ``k`` nearest training rows."""
        # Distance from x to every training row in one vectorised step
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        # A stable sort keeps equally distant neighbours in training order,
        # which makes the result deterministic.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        # Majority vote; a tie goes to the label encountered first, i.e. the
        # one belonging to the nearer neighbour.
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

# Build a 5-nearest-neighbours classifier and give it the scaled training data
knn = KNN(k=5)
knn.fit(X_train_scaled, y_train)

# Classify the held-out rows
y_pred = knn.predict(X_test_scaled)

# Report the fraction of test rows that were labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000


In [16]:
# Import necessary libraries
import numpy as np
import pandas as pd
from palmerpenguins import load_penguins
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Fetch the penguins data and wrap it in a DataFrame
penguins = load_penguins()
df = pd.DataFrame(penguins)

# Discard every row that has at least one missing value
df.dropna(inplace=True)

# Binary target: 1 for Adelie, 0 for every other species
is_adelie = df['species'] == 'Adelie'
df['species'] = np.where(is_adelie, 1, 0)

# The (now binary) species column is the target; the rest are features
y = df["species"]
X = df.drop(columns=["species"])

# One-hot encode the categorical feature columns (island and sex)
X_encoded = pd.get_dummies(X, columns=["island", "sex"])

# Hold out 30% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to evaluate model performance
def evaluate_model(y_true, y_pred):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Precision, recall and F1 are computed with the scorers' default
    settings, which suits the binary target used in this cell.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    """
    accuracy, precision, recall, f1 = (
        metric(y_true, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # One print call with a newline separator emits the same five lines
    print(
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
        "Confusion Matrix:",
        sep="\n",
    )
    print(confusion_matrix(y_true, y_pred))

# (a) Logistic Regression: fit on the scaled training split, score on the test split
print("\nLogistic Regression")
log_reg = LogisticRegression(random_state=42)
y_pred_log_reg = log_reg.fit(X_train_scaled, y_train).predict(X_test_scaled)
evaluate_model(y_test, y_pred_log_reg)

# (b) Support Vector Machine (SVM) with a linear kernel, same protocol
print("\nSupport Vector Machine (SVM)")
svm_model = SVC(kernel='linear', random_state=42)
y_pred_svm = svm_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
evaluate_model(y_test, y_pred_svm)



Logistic Regression
Accuracy: 0.9900
Precision: 0.9796
Recall: 1.0000
F1 Score: 0.9897
Confusion Matrix:
[[51  1]
 [ 0 48]]

Support Vector Machine (SVM)
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[52  0]
 [ 0 48]]


In [18]:

import numpy as np
import pandas as pd
from palmerpenguins import load_penguins
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Fetch the penguins data and wrap it in a DataFrame
penguins = load_penguins()
df = pd.DataFrame(penguins)

# Discard every row that has at least one missing value
df.dropna(inplace=True)

# The species column is the target; all remaining columns are features
y = df["species"]
X = df.drop(columns=["species"])

# One-hot encode the categorical feature columns (island and sex)
X_encoded = pd.get_dummies(X, columns=["island", "sex"])

# Map the species names to integer class labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 30% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.3,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to evaluate model performance
def evaluate_model(y_true, y_pred):
    """Print accuracy, weighted precision/recall/F1 and the confusion matrix.

    Precision, recall and F1 are computed with ``average='weighted'``, so the
    function works for targets with more than two classes.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        score(y_true, y_pred, average='weighted')
        for score in (precision_score, recall_score, f1_score)
    )

    summary = (
        f"Accuracy: {accuracy:.4f}",
        f"Precision (weighted): {precision:.4f}",
        f"Recall (weighted): {recall:.4f}",
        f"F1 Score (weighted): {f1:.4f}",
        "Confusion Matrix:",
    )
    print("\n".join(summary))
    print(confusion_matrix(y_true, y_pred))

# (a) Softmax Regression (Multinomial Logistic Regression)
# With the lbfgs solver and more than two classes, LogisticRegression already
# fits a multinomial (softmax) model by default. The explicit `multi_class`
# argument is deprecated since scikit-learn 1.5 and scheduled for removal, so
# it is intentionally not passed; the fitted model is unchanged.
print("\nSoftmax Regression (Multinomial Logistic Regression)")
softmax_reg = LogisticRegression(solver='lbfgs', random_state=42, max_iter=500)
softmax_reg.fit(X_train_scaled, y_train)
y_pred_softmax = softmax_reg.predict(X_test_scaled)
evaluate_model(y_test, y_pred_softmax)

# (b) Stochastic Gradient Descent (SGD) Classifier
# loss='log_loss' trains a logistic-regression model with SGD (this spelling
# replaces the older 'log' name). For multi-class targets SGDClassifier
# combines one-vs-rest binary classifiers rather than a single softmax model.
print("\nStochastic Gradient Descent (SGD) Classifier")
sgd_clf = SGDClassifier(loss='log_loss', random_state=42, max_iter=1000, tol=1e-3)
sgd_clf.fit(X_train_scaled, y_train)
y_pred_sgd = sgd_clf.predict(X_test_scaled)
evaluate_model(y_test, y_pred_sgd)



Softmax Regression (Multinomial Logistic Regression)
Accuracy: 0.9900
Precision (weighted): 0.9902
Recall (weighted): 0.9900
F1 Score (weighted): 0.9899
Confusion Matrix:
[[48  0  0]
 [ 1 22  0]
 [ 0  0 29]]

Stochastic Gradient Descent (SGD) Classifier
Accuracy: 1.0000
Precision (weighted): 1.0000
Recall (weighted): 1.0000
F1 Score (weighted): 1.0000
Confusion Matrix:
[[48  0  0]
 [ 0 23  0]
 [ 0  0 29]]


