In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

# Assume X_train, X_test, y_train, y_test are already defined and preprocessed
# NOTE(review): the thresholding below presumes a binary target encoded as 0/1 — confirm.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict probabilities: keep the second predict_proba column, i.e. the
# probability of model.classes_[1] (the positive class for 0/1 labels).
y_probs = model.predict_proba(X_test)[:, 1]

# Try different thresholds to see how the cut-off trades precision for recall
thresholds = [0.3, 0.5, 0.7]
for thresh in thresholds:
    # A sample is labelled positive when its probability reaches the threshold.
    y_pred_thresh = (y_probs >= thresh).astype(int)
    acc = accuracy_score(y_test, y_pred_thresh)
    # zero_division=0: a strict threshold can leave no predicted positives, so
    # precision becomes 0/0. Report 0 explicitly instead of relying on the
    # default, which emits an UndefinedMetricWarning for that case.
    prec = precision_score(y_test, y_pred_thresh, zero_division=0)
    rec = recall_score(y_test, y_pred_thresh, zero_division=0)
    print(f"Threshold {thresh}: Accuracy={acc:.2f}, Precision={prec:.2f}, Recall={rec:.2f}")

In [None]:
# Fit one independent classifier per target column and report accuracy at each
# decision threshold. Relies on `thresholds` defined in an earlier cell and on
# y_train / y_test being indexable by column name.
target_columns = ['target1', 'target2']  # example targets
for col in target_columns:
    # A fresh estimator per target, so nothing carries over between targets.
    model = LogisticRegression().fit(X_train, y_train[col])
    # Second predict_proba column = probability of model.classes_[1].
    y_probs = model.predict_proba(X_test)[:, 1]

    for thresh in thresholds:
        # Positive when the predicted probability reaches the threshold.
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_test[col], y_pred)
        print(f"{col} - Threshold {thresh}: Accuracy={acc:.2f}")


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Define feature types
numeric_features = ['num1', 'num2']
categorical_features = ['cat1', 'cat2']

# Preprocessing steps: standardize the numeric columns and one-hot encode the
# categorical ones.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    # handle_unknown='ignore': a category that appears only in the test split
    # (or in new data) is encoded as all zeros instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Create pipeline: preprocessing is fitted on the training data only, inside
# clf.fit, and then re-applied unchanged whenever clf predicts.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Train model
# random_state pins the shuffle so the split, and every metric computed from
# it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Score the held-out set with the fitted pipeline; the second predict_proba
# column is the probability of clf.classes_[1].
y_probs = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_probs)
fpr, tpr, _ = roc_curve(y_test, y_probs)

# Draw the ROC curve on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'LogReg AUC={auc_score:.2f}')
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Assume your data is in DataFrame `X`
# Standardize first so that no single feature dominates the distances KMeans uses.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

inertias = []
silhouettes = []
# Start at k=2: the silhouette score is undefined for a single cluster.
K = range(2, 11)

for k in K:
    # n_init is pinned explicitly: its default changed from 10 to 'auto' in
    # scikit-learn 1.4 (with a FutureWarning on 1.2/1.3), so leaving it unset
    # makes the curves depend on the installed version. Ten restarts also make
    # inertia less sensitive to one unlucky initialization.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_scaled, kmeans.labels_))

# Plot inertia and silhouette score side by side
plt.figure(figsize=(12, 5))

# Left panel: inertia, for the elbow heuristic.
plt.subplot(1, 2, 1)
plt.plot(K, inertias, marker='o')
plt.title('Inertia vs k')
plt.xlabel('k')
plt.ylabel('Inertia')

# Right panel: silhouette score, where higher means better-separated clusters.
plt.subplot(1, 2, 2)
plt.plot(K, silhouettes, marker='o')
plt.title('Silhouette Score vs k')
plt.xlabel('k')
plt.ylabel('Silhouette Score')

plt.tight_layout()
plt.show()


In [None]:
### 2. What if you don't scale your features?

If features are not scaled, those with larger numeric ranges will dominate the distance calculations used in clustering algorithms like KMeans. This leads to:

- **Biased clustering results**: One feature may disproportionately influence cluster assignment.
- **Misleading Silhouette and Inertia scores**: Both metrics are computed from the same distorted distances, so they mostly reflect the dominant feature rather than the true groupings.
- **Incorrect conclusions**: Features with smaller ranges are undervalued in the clustering process.

#### Example Comparison (With vs Without Scaling)

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# With scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans_scaled = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
score_scaled = silhouette_score(X_scaled, kmeans_scaled.labels_)

# Without scaling
kmeans_raw = KMeans(n_clusters=3, random_state=42).fit(X)
score_raw = silhouette_score(X, kmeans_raw.labels_)

print(f"Silhouette Score with scaling: {score_scaled:.2f}")
print(f"Silhouette Score without scaling: {score_raw:.2f}")
```


In [None]:
## Question 3: Is there a 'right' k? Why or why not?

In KMeans clustering, choosing the "right" number of clusters (`k`) is not always straightforward. There is **no universally correct value of k**, but you can use several techniques to estimate a suitable choice.

### 🔍 Common Methods to Choose k

1. **Elbow Method**:
   - Plot the inertia (within-cluster sum of squares) against different values of k.
   - Look for the "elbow" point where the decrease in inertia slows down — this suggests diminishing returns with higher k.

   ```python
   from sklearn.cluster import KMeans
   import matplotlib.pyplot as plt

   inertias = []
   K = range(1, 11)

   for k in K:
       kmeans = KMeans(n_clusters=k, random_state=42)
       kmeans.fit(X_scaled)
       inertias.append(kmeans.inertia_)

   plt.plot(K, inertias, marker='o')
   plt.title("Elbow Method: Inertia vs K")
   plt.xlabel("Number of Clusters (k)")
   plt.ylabel("Inertia")
   plt.show()
   ```


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Load dataset (replace with your actual file path)
df = pd.read_csv("food_nutrients.csv")
X = df[['Protein', 'Fat', 'Carbohydrate', 'Calories']]  # example features

# Scale features so that no single column dominates the distances KMeans uses.
# NOTE(review): the columns presumably sit on different scales — confirm against the data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Test different k values
inertias = []
silhouettes = []
# Start at k=2: the silhouette score is undefined for a single cluster.
k_range = range(2, 10)

for k in k_range:
    # n_init is pinned explicitly: its default changed from 10 to 'auto' in
    # scikit-learn 1.4 (with a FutureWarning on 1.2/1.3), so leaving it unset
    # makes the curves depend on the installed version. Ten restarts also make
    # inertia less sensitive to one unlucky initialization.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_scaled, kmeans.labels_))

# Plot results side by side
plt.figure(figsize=(12, 5))

# Left panel: inertia, for the elbow heuristic.
plt.subplot(1, 2, 1)
plt.plot(k_range, inertias, marker='o')
plt.title("Inertia vs K")
plt.xlabel("k")
plt.ylabel("Inertia")

# Right panel: silhouette score, where higher means better-separated clusters.
plt.subplot(1, 2, 2)
plt.plot(k_range, silhouettes, marker='o')
plt.title("Silhouette Score vs K")
plt.xlabel("k")
plt.ylabel("Silhouette Score")

plt.tight_layout()
plt.show()
