In [5]:
using Pkg;

Pkg.activate(mktempdir())
Pkg.update()
Pkg.add([
    "MLDatasets",
    "Statistics",
    "DataFrames"
])

[32m[1m  Activating[22m[39m new project at `C:\Users\79021\AppData\Local\Temp\jl_x3maeb`
[32m[1m    Updating[22m[39m registry at `C:\Users\79021\.julia\registries\General.toml`
[36m[1m     Project[22m[39m No packages added to or removed from `C:\Users\79021\AppData\Local\Temp\jl_x3maeb\Project.toml`
[36m[1m    Manifest[22m[39m No packages added to or removed from `C:\Users\79021\AppData\Local\Temp\jl_x3maeb\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `C:\Users\79021\AppData\Local\Temp\jl_x3maeb\Project.toml`
  [90m[a93c6f00] [39m[92m+ DataFrames v1.8.1[39m
  [90m[eb30cadb] [39m[92m+ MLDatasets v0.7.19[39m
  [90m[10745b16] [39m[92m+ Statistics v1.11.1[39m
[32m[1m    Updating[22m[39m `C:\Users\79021\AppData\Local\Temp\jl_x3maeb\Manifest.toml`
  [90m[7d9f7c33] [39m[92m+ Accessors v0.1.43[39m
  [90m[79e6a3ab] [39m[92m+ Adapt v4.4.0[39m
  [90m[66dad0bd] [39m[92m+ AliasTables v1.1.3[39m
  [90m

In [8]:
# Complete Implementation: Linear Classifier on Iris Dataset
# From scratch gradient descent

using MLDatasets
using Statistics
using Random
Random.seed!(42)

println("="^60)
println("IRIS CLASSIFICATION WITH GRADIENT DESCENT")
println("="^60)

# Load data
iris = Iris()
X_train = Float64.(iris.features)  # (4, 150) - 4 features, 150 samples
y_train = iris.targets             # (150,) - class labels 1,2,3

println("\nDataset loaded:")
println("  Features: $(size(X_train))")
println("  Labels: $(size(y_train))")
println("  Classes: $(unique(y_train))")

# Initialize model: linear classifier
# Input: 4 features → Output: 3 classes
W = randn(3, 4) * 0.01  # (3, 4) - small random weights
b = zeros(3)            # (3,) - biases start at zero

println("\nModel initialized:")
println("  W: $(size(W))")
println("  b: $(size(b))")

# Softmax function
function softmax(logits)
    # Subtract max for numerical stability
    logits_stable = logits .- maximum(logits, dims=1)
    exp_logits = exp.(logits_stable)
    return exp_logits ./ sum(exp_logits, dims=1)
end

# Cross-entropy loss
function cross_entropy_loss(probs, y_true)
    n_samples = length(y_true)
    loss = 0.0
    for i in 1:n_samples
        # Get probability of correct class
        correct_class = y_true[i]
        prob_correct = probs[correct_class, i]
        # Accumulate -log(probability)
        loss += -log(prob_correct + 1e-10)
    end
    return loss / n_samples
end

# Compute gradients
function compute_gradients(X, y_true, probs)
    n_samples = size(X, 2)
    
    # Create one-hot encoding of true labels
    one_hot = zeros(size(probs))
    for i in 1:n_samples
        one_hot[y_true[i], i] = 1.0
    end
    
    # Gradient w.r.t. logits (softmax + cross-entropy simplification!)
    dL_dlogits = (probs .- one_hot) ./ n_samples
    
    # Gradient w.r.t. weights: dL/dW = dL/dlogits * X^T
    dL_dW = dL_dlogits * X'
    
    # Gradient w.r.t. bias: sum over samples
    dL_db = sum(dL_dlogits, dims=2) |> vec
    
    return dL_dW, dL_db
end

# Compute accuracy
function accuracy(probs, y_true)
    predictions = argmax(probs, dims=1) |> vec
    return mean(predictions .== y_true)
end

# Training loop
learning_rate = 0.01
n_epochs = 100

println("\nTraining...")
println("-"^60)

for epoch in 1:n_epochs
    # Forward pass
    logits = W * X_train .+ b  # (3, 150)
    probs = softmax(logits)     # (3, 150)
    
    # Compute loss
    loss = cross_entropy_loss(probs, y_train)
    
    # Compute gradients
    dW, db = compute_gradients(X_train, y_train, probs)
    
    # Gradient descent update
    global W = W .- learning_rate .* dW
    global b = b .- learning_rate .* db
    
    # Print progress every 10 epochs
    if epoch % 10 == 0
        acc = accuracy(probs, y_train)
        println("Epoch $(lpad(epoch, 3)): Loss = $(round(loss, digits=4)), Accuracy = $(round(acc*100, digits=2))%")
    end
end

println("-"^60)

# Final evaluation
final_logits = W * X_train .+ b
final_probs = softmax(final_logits)
final_loss = cross_entropy_loss(final_probs, y_train)
final_acc = accuracy(final_probs, y_train)

println("\nFINAL RESULTS:")
println("  Loss: $(round(final_loss, digits=4))")
println("  Accuracy: $(round(final_acc*100, digits=2))%")

if final_acc > 0.95
    println("\n✓ SUCCESS! Model trained successfully!")
else
    println("\n⚠ Warning: Accuracy is lower than expected")
end

# Show learned parameters
println("\n" * "="^60)
println("LEARNED PARAMETERS")
println("="^60)
println("Weights W (how features contribute to each class):")
display(round.(W, digits=3))
println("\n\nBiases b:")
display(round.(b, digits=3))

println("\n\nInterpretation:")
println("  Each row of W corresponds to one class (setosa, versicolor, virginica)")
println("  Each column corresponds to one feature")
println("  W[i,j] = how much feature j contributes to predicting class i")

IRIS CLASSIFICATION WITH GRADIENT DESCENT

Dataset loaded:
  Features: (150, 4)
  Labels: (150, 1)
  Classes: 3×1 DataFrame
 Row │ class
     │ String15
─────┼─────────────────
   1 │ Iris-setosa
   2 │ Iris-versicolor
   3 │ Iris-virginica

Model initialized:
  W: (3, 4)
  b: (3,)

Training...
------------------------------------------------------------


MethodError: MethodError: no method matching *(::Matrix{Float64}, ::DataFrame)
The function `*` exists, but no method is defined for this combination of argument types.

Closest candidates are:
  *(::Any, ::Any, !Matched::Any, !Matched::Any...)
   @ Base operators.jl:642
  *(!Matched::ChainRulesCore.NotImplemented, ::Any)
   @ ChainRulesCore C:\Users\79021\.julia\packages\ChainRulesCore\Vsbj9\src\tangent_arithmetic.jl:37
  *(!Matched::ChainRulesCore.NoTangent, ::Any)
   @ ChainRulesCore C:\Users\79021\.julia\packages\ChainRulesCore\Vsbj9\src\tangent_arithmetic.jl:64
  ...
