In [14]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the penguins data, discarding every row that has a missing value.
df = sns.load_dataset("penguins").dropna()

# Restrict the frame to the two species used in this binary problem.
selected_classes = ['Adelie', 'Chinstrap']
keep_rows = df['species'].isin(selected_classes)
df_filtered = df.loc[keep_rows].copy()  # independent copy, so adding a column below raises no warning

# Turn the species names into integer class labels and store them in the frame.
le = LabelEncoder()
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered['class_encoded'] = y_encoded

# Show each species name next to its integer code.
print(df_filtered[['species', 'class_encoded']])

# Features: everything except the two label columns and the categorical
# 'island' / 'sex' columns. Target: the integer class label.
non_feature_columns = ['species', 'island', 'sex', 'class_encoded']
X = df_filtered.drop(columns=non_feature_columns)
y = df_filtered['class_encoded']  # Target variable



       species  class_encoded
0       Adelie              0
1       Adelie              0
2       Adelie              0
4       Adelie              0
5       Adelie              0
..         ...            ...
215  Chinstrap              1
216  Chinstrap              1
217  Chinstrap              1
218  Chinstrap              1
219  Chinstrap              1

[214 rows x 2 columns]


---
1 What is the purpose of "y_encoded = le.fit_transform(df_filtered['species'])" ?

2 What is the purpose of "X = df_filtered.drop(['species', 'island', 'sex','class_encoded'], axis=1)" ?

3 Why can we not use the "island" and "sex" features?

---

1. The purpose of the line `y_encoded = le.fit_transform(df_filtered['species'])` is to encode the target variable `'species'` into numerical values. In many machine learning algorithms, the target variable should be numerical, not categorical. The LabelEncoder from scikit-learn converts the species names (categorical labels) into unique integer labels; as the output above shows, 'Adelie' becomes 0 and 'Chinstrap' becomes 1. This encoding lets us use the species information as the numerical target of a classification model such as logistic regression.


2. The purpose of the line `X = df_filtered.drop(['species', 'island', 'sex','class_encoded'], axis=1)` is to create the feature matrix X by removing the columns of `df_filtered` that are not intended to be used as features in the machine learning model. 'species' and 'class_encoded' are the target itself, so they must not appear among the features. 'island' and 'sex' are categorical columns that would need preprocessing before they can be used, because machine learning models typically require numerical features. After dropping these four columns, only the numerical measurement columns remain as features for training the model.

3. 
Both columns hold text labels rather than numbers, so they cannot be passed to the model as they are. 'Island' is a categorical feature representing the island on which a penguin was observed. To use it as a feature, you would typically perform one-hot encoding or another form of categorical encoding to convert it into numerical values.

'Sex' is another categorical feature representing the gender of the penguin. Similar to 'island', we would need to perform encoding to convert it into numerical values. The choice of encoding method (e.g., one-hot encoding or label encoding) depends on the specific modeling approach we use.


In [17]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 'saga' is a stochastic solver (it shuffles the training rows), so it is seeded
# to make the fitted coefficients repeatable from run to run.
# The features are passed in unscaled here.
logreg = LogisticRegression(solver='saga', random_state=42)
logreg.fit(X_train, y_train)

# Predict on the testing data
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Learned weights (one per feature column) and intercept
print(logreg.coef_, logreg.intercept_)

Accuracy: 0.5813953488372093
[[ 2.75700244e-03 -8.37987445e-05  4.49753068e-04 -2.85861156e-04]] [-8.54905722e-06]




Why is the accuracy low? Why does the "saga" solver perform poorly?

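The accuracy is low because the solver has not converged. The features are on very different scales (body mass is in the thousands of grams, while the bill measurements are in tens of millimetres), and the 'saga' solver only converges quickly when the features are on approximately the same scale. Within the default iteration limit the coefficients printed above are still close to zero, so the model has learned almost nothing. Data quality, imbalanced classes and model complexity can also lower accuracy in general, but here the unscaled features are the main cause.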

In [19]:
# Refit on the same unscaled split, this time with the 'liblinear' solver.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Score the predictions made for the held-out rows.
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Learned weights and intercept.
print(logreg.coef_, logreg.intercept_)

Accuracy: 1.0
[[ 1.59665154 -1.42501103 -0.15238046 -0.003951  ]] [-0.0755452]


Why is the accuracy high now? Why does the "liblinear" solver perform better than the "saga" solver?


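The "liblinear" solver uses a different optimization method (coordinate descent) that is much less sensitive to the scale of the features than "saga", so it converges on the unscaled data and reaches an accuracy of 1.0 on this test split. In general, which solver performs best also depends on factors like dataset size, regularization strength, and dataset characteristics.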

In [22]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model A: 'saga' trained and evaluated on the standardized features.
logreg_saga = LogisticRegression(solver='saga', random_state=42).fit(X_train_scaled, y_train)
y_pred_saga = logreg_saga.predict(X_test_scaled)
accuracy_saga = accuracy_score(y_test, y_pred_saga)

# Model B: 'liblinear' trained and evaluated on the raw, unscaled features.
logreg_liblinear = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)
y_pred_liblinear = logreg_liblinear.predict(X_test)
accuracy_liblinear = accuracy_score(y_test, y_pred_liblinear)

# Report both accuracies side by side.
print("Accuracy with 'saga' solver and scaled features:", accuracy_saga)
print("Accuracy with 'liblinear' solver and unscaled features:", accuracy_liblinear)


Accuracy with 'saga' solver and scaled features: 0.9767441860465116
Accuracy with 'liblinear' solver and unscaled features: 1.0


Now observe the accuracies for both the "liblinear" solver and the "saga" solver. Why has the accuracy of the "saga" solver increased?

The accuracy of the "saga" solver has increased to about 97.7% because of feature scaling: once all features are on a comparable scale, the solver can converge to a much better solution within the iteration limit.



In [1]:

import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Load the penguins dataset and remove every row that has a missing value
df = sns.load_dataset("penguins").dropna()

# Filter rows for 'Adelie' and 'Chinstrap' classes
selected_classes = ['Adelie', 'Chinstrap']
df_filtered = df[df['species'].isin(selected_classes)].copy()  # Make a copy to avoid the warning

# Encode the species column into the integer target
le = LabelEncoder()
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered['class_encoded'] = y_encoded

# ---------------------------------------------------------------
# Replace the string columns 'sex' and 'island' with integer codes.
# Assigning to an existing column overwrites it in place and keeps its
# position, so no separate drop() is needed beforehand.
# NOTE: integer codes impose an arbitrary order on the three 'island' values;
# one-hot encoding (pd.get_dummies) avoids that.

# encode sex column
le_1 = LabelEncoder()
sex_encoded = le_1.fit_transform(df_filtered['sex'])
df_filtered['sex'] = sex_encoded

# encode island column
le_2 = LabelEncoder()
island_encoded = le_2.fit_transform(df_filtered['island'])
df_filtered['island'] = island_encoded
# ---------------------------------------------------------------

# Features are every column except the two label columns
X = df_filtered.drop(['species', 'class_encoded'], axis=1)  # Choose features
y = df_filtered['class_encoded']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 'saga' is a stochastic solver, so it is seeded to make the fit repeatable.
# The features are passed in unscaled here.
logreg = LogisticRegression(solver='saga', random_state=42)
logreg.fit(X_train, y_train)

# Predict on the testing data
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Learned weights (one per feature column) and intercept
print(logreg.coef_, logreg.intercept_)

Accuracy: 0.5813953488372093
[[-2.47386886e-05  2.75968297e-03 -8.11810431e-05  4.65073280e-04
  -2.86632956e-04  1.00269118e-05]] [-8.42137777e-06]




1. What is the problem? Why can the algorithm not perform the classification?

Because there are two non-numerical columns ('island' and 'sex') in the input data frame. They cannot be used to train the model while they are stored as strings.

2. How to solve this issue?

We can either remove those columns or map them to numerical values; then we can run the model. However, according to the observations, the 'sex' and 'island' columns do not have much impact on the trained model.

In [2]:
#encode string columns into numeric columns
# logistic regression
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Load the penguins dataset and remove every row that has a missing value
df = sns.load_dataset("penguins").dropna()

# Filter rows for 'Adelie' and 'Chinstrap' classes
selected_classes = ['Adelie', 'Chinstrap']
df_filtered = df[df['species'].isin(selected_classes)].copy()  # Make a copy to avoid the warning

# Encode the species column into the integer target
le = LabelEncoder()
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered['class_encoded'] = y_encoded

# One-hot encode the categorical columns. drop_first=True omits the first
# category of each column, which is then represented by all-False dummies.
df_filtered = pd.get_dummies(df_filtered, columns=['island', 'sex'], drop_first=True)

# Features are every column except the two label columns
X = df_filtered.drop(['species', 'class_encoded'], axis=1)  # Choose features
y = df_filtered['class_encoded']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 'saga' is a stochastic solver, so it is seeded to make the fit repeatable.
# The features are passed in unscaled here.
logreg = LogisticRegression(solver='saga', random_state=42)
logreg.fit(X_train, y_train)

# Predict on the testing data
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Learned weights (one per feature column) and intercept
print(logreg.coef_, logreg.intercept_)



Accuracy: 0.5813953488372093
[[ 2.76514270e-03 -8.25155584e-05  4.71591095e-04 -2.87125287e-04
   1.84941694e-04 -1.05141515e-04  1.05127125e-05]] [-8.48042914e-06]




In [3]:
# Inspect the data: for each dummy column, print the first row of every group
# (i.e. one example per distinct value of that column).
dummy_columns = ['sex_Male', 'island_Torgersen', 'island_Dream']
for position, column in enumerate(dummy_columns):
    if position > 0:
        print()  # blank line between consecutive tables
    samples = df_filtered.groupby(column).head(1)
    print(samples)

  species  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0  Adelie            39.1           18.7              181.0       3750.0   
1  Adelie            39.5           17.4              186.0       3800.0   

   class_encoded  island_Dream  island_Torgersen  sex_Male  
0              0         False              True      True  
1              0         False              True     False  

   species  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0   Adelie            39.1           18.7              181.0       3750.0   
20  Adelie            37.8           18.3              174.0       3400.0   

    class_encoded  island_Dream  island_Torgersen  sex_Male  
0               0         False              True      True  
20              0         False             False     False  

   species  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0   Adelie            39.1           18.7              181.0       3750.0   
30  Adelie    

In [4]:
from sklearn.preprocessing import MaxAbsScaler

# Derive features and target together from the one-hot encoded frame, so this
# cell does not depend on an X left over from an earlier cell.
X = df_filtered.drop(['species', 'class_encoded'], axis=1)
y = df_filtered['class_encoded']  # Target variable
print(X.shape, y.shape)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale each feature by its maximum absolute value. The scaler is fitted on
# the training split only and then applied unchanged to the test split.
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 'saga' is a stochastic solver, so it is seeded to make the fit repeatable.
logreg = LogisticRegression(solver='saga', max_iter=150, random_state=42)
logreg.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred = logreg.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Learned weights (one per feature column) and intercept
print(logreg.coef_, logreg.intercept_)

(214, 7) (214,)
Accuracy: 1.0
[[ 3.63424554  0.16322352  0.6264067   0.10226686  2.59927497 -0.87716064
  -0.35907824]] [-5.99663294]


1. Why are we using the "MaxAbsScaler" scaler rather than the "StandardScaler"?

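MaxAbsScaler divides each feature by its maximum absolute value, so every value is mapped into the fixed range between -1 and 1. Unlike StandardScaler, it does not shift (centre) the data: StandardScaler subtracts the mean and divides by the standard deviation, which would turn the 0/1 dummy columns into other, non-binary values. Neither scaler changes the shape of a feature's distribution, but MaxAbsScaler keeps zeros as zeros and leaves the dummy columns as they are. Therefore, we use MaxAbsScaler rather than StandardScaler.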

In [6]:
# visualize the scaled data

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

# Fit each scaler on the training split and print the transformed test split:
# MaxAbsScaler first, then StandardScaler. After the loop, `scaler`,
# `X_train_scaled` and `X_test_scaled` hold the StandardScaler results.
for scaler_cls in (MaxAbsScaler, StandardScaler):
    scaler = scaler_cls()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print(X_test_scaled)

[[0.59655172 0.98139535 0.93396226 0.91666667 0.         1.
  1.        ]
 [0.8862069  0.88372093 0.94811321 0.82291667 1.         0.
  1.        ]
 [0.68275862 0.8        0.9245283  0.73958333 0.         1.
  0.        ]
 [0.87586207 0.86046512 0.94811321 0.92708333 1.         0.
  1.        ]
 [0.7137931  0.86046512 0.95283019 0.80729167 0.         1.
  1.        ]
 [0.64310345 0.95348837 0.93867925 0.78645833 0.         1.
  1.        ]
 [0.65172414 0.85116279 0.82075472 0.70833333 0.         0.
  0.        ]
 [0.5862069  0.79534884 0.87264151 0.70833333 1.         0.
  0.        ]
 [0.73965517 0.81860465 0.9245283  0.97916667 0.         1.
  1.        ]
 [0.62068966 0.79534884 0.88207547 0.77083333 1.         0.
  0.        ]
 [0.82068966 0.85116279 0.91981132 0.80208333 1.         0.
  0.        ]
 [0.80517241 0.83255814 0.91981132 0.6875     1.         0.
  0.        ]
 [0.63103448 0.85581395 0.86792453 0.72395833 1.         0.
  0.        ]
 [0.72586207 0.88837209 0.91981132 0.8

1. What can you observe in the values of the "island_Dream", "island_Torgersen" and "sex_Male" features before and after scaling?

Before scaling, these columns hold the boolean values True and False. After scaling with MaxAbsScaler, those values have been replaced with the numeric values 1 and 0:

True -> 1, False -> 0