From 6cb6852e7d47f7bd3d399c8e1f378663e0024a15 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 13 May 2026 23:08:16 +0000 Subject: [PATCH 1/5] [Autoloop: build-tsikit-learn-scikit-learn-typescript-migration] Iteration 8: Add 28 new sklearn modules (LDA, RandomForest, GradientBoosting, SVC, MLP, etc.) Adds 28 new TypeScript source files bringing total from 15 to 43 files (metric: 43). New modules: - linear_model: LogisticRegression, Lasso, ElasticNet, SGDClassifier, SGDRegressor, Perceptron - metrics: silhouetteScore, adjustedRandScore, homogeneityScore - model_selection: GridSearchCV, crossValScore - svm: SVC, SVR - compose: ColumnTransformer - neural_network: MLPClassifier, MLPRegressor - tree: DecisionTreeClassifier, DecisionTreeRegressor - ensemble: RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor - neighbors: KNeighborsClassifier, KNeighborsRegressor, RadiusNeighborsClassifier, RadiusNeighborsRegressor - cluster: KMeans, DBSCAN - decomposition: PCA, TruncatedSVD, NMF - naive_bayes: GaussianNB, MultinomialNB, BernoulliNB - impute: SimpleImputer - pipeline: Pipeline, makePipeline - feature_selection: SelectKBest, SelectPercentile, VarianceThreshold, fClassif, fRegression, chi2 - datasets: makeClassification, makeRegression, makeBlobs, makeMoons, makeCircles - preprocessing: PolynomialFeatures, OneHotEncoder, OrdinalEncoder - discriminant_analysis: LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis - isotonic: IsotonicRegression - multiclass: OneVsRestClassifier, OneVsOneClassifier - calibration: CalibratedClassifierCV Run: https://github.com/githubnext/tsikit-learn/actions/runs/25830884200 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/calibration/calibration.ts | 141 ++++++++ src/calibration/index.ts | 1 + src/cluster/index.ts | 1 + src/cluster/kmeans.ts | 301 +++++++++++++++++ 
src/compose/column_transformer.ts | 102 ++++++ src/compose/index.ts | 1 + src/datasets/index.ts | 1 + src/datasets/make_datasets.ts | 216 ++++++++++++ src/decomposition/index.ts | 2 + src/decomposition/nmf.ts | 154 +++++++++ src/decomposition/pca.ts | 244 ++++++++++++++ src/discriminant_analysis/index.ts | 1 + src/discriminant_analysis/lda.ts | 301 +++++++++++++++++ src/ensemble/gradient_boosting.ts | 195 +++++++++++ src/ensemble/index.ts | 2 + src/ensemble/random_forest.ts | 215 ++++++++++++ src/feature_selection/index.ts | 1 + src/feature_selection/univariate.ts | 248 ++++++++++++++ src/impute/index.ts | 1 + src/impute/simple_imputer.ts | 88 +++++ src/index.ts | 61 +++- src/isotonic/index.ts | 1 + src/isotonic/isotonic.ts | 121 +++++++ src/linear_model/index.ts | 4 + src/linear_model/lasso.ts | 180 ++++++++++ src/linear_model/logistic_regression.ts | 120 +++++++ src/linear_model/perceptron.ts | 97 ++++++ src/linear_model/sgd.ts | 199 +++++++++++ src/metrics/clustering.ts | 155 +++++++++ src/metrics/index.ts | 1 + src/model_selection/index.ts | 1 + src/model_selection/search.ts | 145 ++++++++ src/multiclass/index.ts | 1 + src/multiclass/one_vs_rest.ts | 159 +++++++++ src/naive_bayes/index.ts | 1 + src/naive_bayes/naive_bayes.ts | 300 +++++++++++++++++ src/neighbors/index.ts | 2 + src/neighbors/knn.ts | 177 ++++++++++ src/neighbors/radius.ts | 149 ++++++++ src/neural_network/index.ts | 1 + src/neural_network/mlp.ts | 402 ++++++++++++++++++++++ src/pipeline/index.ts | 1 + src/pipeline/pipeline.ts | 95 ++++++ src/preprocessing/encoders.ts | 124 +++++++ src/preprocessing/index.ts | 2 + src/preprocessing/polynomial_features.ts | 106 ++++++ src/svm/index.ts | 1 + src/svm/svc.ts | 412 +++++++++++++++++++++++ src/tree/decision_tree.ts | 251 ++++++++++++++ src/tree/index.ts | 1 + 50 files changed, 5477 insertions(+), 9 deletions(-) create mode 100644 src/calibration/calibration.ts create mode 100644 src/calibration/index.ts create mode 100644 src/cluster/index.ts create 
mode 100644 src/cluster/kmeans.ts create mode 100644 src/compose/column_transformer.ts create mode 100644 src/compose/index.ts create mode 100644 src/datasets/index.ts create mode 100644 src/datasets/make_datasets.ts create mode 100644 src/decomposition/index.ts create mode 100644 src/decomposition/nmf.ts create mode 100644 src/decomposition/pca.ts create mode 100644 src/discriminant_analysis/index.ts create mode 100644 src/discriminant_analysis/lda.ts create mode 100644 src/ensemble/gradient_boosting.ts create mode 100644 src/ensemble/index.ts create mode 100644 src/ensemble/random_forest.ts create mode 100644 src/feature_selection/index.ts create mode 100644 src/feature_selection/univariate.ts create mode 100644 src/impute/index.ts create mode 100644 src/impute/simple_imputer.ts create mode 100644 src/isotonic/index.ts create mode 100644 src/isotonic/isotonic.ts create mode 100644 src/linear_model/lasso.ts create mode 100644 src/linear_model/logistic_regression.ts create mode 100644 src/linear_model/perceptron.ts create mode 100644 src/linear_model/sgd.ts create mode 100644 src/metrics/clustering.ts create mode 100644 src/model_selection/search.ts create mode 100644 src/multiclass/index.ts create mode 100644 src/multiclass/one_vs_rest.ts create mode 100644 src/naive_bayes/index.ts create mode 100644 src/naive_bayes/naive_bayes.ts create mode 100644 src/neighbors/index.ts create mode 100644 src/neighbors/knn.ts create mode 100644 src/neighbors/radius.ts create mode 100644 src/neural_network/index.ts create mode 100644 src/neural_network/mlp.ts create mode 100644 src/pipeline/index.ts create mode 100644 src/pipeline/pipeline.ts create mode 100644 src/preprocessing/encoders.ts create mode 100644 src/preprocessing/polynomial_features.ts create mode 100644 src/svm/index.ts create mode 100644 src/svm/svc.ts create mode 100644 src/tree/decision_tree.ts create mode 100644 src/tree/index.ts diff --git a/src/calibration/calibration.ts b/src/calibration/calibration.ts new 
file mode 100644 index 0000000..948aa5f --- /dev/null +++ b/src/calibration/calibration.ts @@ -0,0 +1,141 @@ +/** + * Probability calibration. + * Mirrors sklearn.calibration.CalibratedClassifierCV. + * Uses Platt scaling (logistic) or isotonic regression for calibration. + */ + +import { NotFittedError } from "../exceptions.js"; + +interface Classifier { + fit(X: Float64Array[], y: Float64Array): this; + predict(X: Float64Array[]): Float64Array; + score?(X: Float64Array[], y: Float64Array): number; +} + +function sigmoid(x: number): number { + return 1 / (1 + Math.exp(-x)); +} + +/** Platt scaling: fit a logistic function on scores to map to probabilities. */ +function plattScale(scores: Float64Array, y: Float64Array): [number, number] { + const n = scores.length; + let A = 0; + let B = 0; + const lr = 0.01; + + for (let iter = 0; iter < 1000; iter++) { + let gradA = 0; + let gradB = 0; + for (let i = 0; i < n; i++) { + const p = sigmoid(A * (scores[i] ?? 0) + B); + const err = p - (y[i] ?? 0); + gradA += err * (scores[i] ?? 0); + gradB += err; + } + A -= lr * gradA / n; + B -= lr * gradB / n; + } + + return [A, B]; +} + +export class CalibratedClassifierCV { + baseEstimator: Classifier; + method: string; + cv: number; + + calibratedEstimators_: { + estimator: Classifier; + A: number; + B: number; + }[] | null = null; + classes_: Float64Array | null = null; + + constructor( + baseEstimator: Classifier, + options: { method?: string; cv?: number } = {}, + ) { + this.baseEstimator = baseEstimator; + this.method = options.method ?? "sigmoid"; + this.cv = options.cv ?? 5; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const posClass = uniqueClasses[uniqueClasses.length - 1] ?? 1; + + const yBin = new Float64Array(y.map((yi) => (yi === posClass ? 
1 : 0))); + + // Simple hold-out calibration + const foldSize = Math.floor(n / this.cv); + this.calibratedEstimators_ = []; + + for (let fold = 0; fold < this.cv; fold++) { + const testStart = fold * foldSize; + const testEnd = fold === this.cv - 1 ? n : testStart + foldSize; + + const trainIdx: number[] = []; + const testIdx: number[] = []; + for (let i = 0; i < n; i++) { + if (i >= testStart && i < testEnd) testIdx.push(i); + else trainIdx.push(i); + } + + const XTrain = trainIdx.map((i) => X[i] ?? new Float64Array(0)); + const yTrain = new Float64Array(trainIdx.map((i) => y[i] ?? 0)); + const XTest = testIdx.map((i) => X[i] ?? new Float64Array(0)); + const yTest = new Float64Array(testIdx.map((i) => yBin[i] ?? 0)); + + const est = Object.create(Object.getPrototypeOf(this.baseEstimator) as object) as Classifier; + Object.assign(est, this.baseEstimator); + est.fit(XTrain, yTrain); + + const testPred = est.predict(XTest); + const [A, B] = plattScale(testPred, yTest); + + this.calibratedEstimators_.push({ estimator: est, A, B }); + } + + return this; + } + + predictProba(X: Float64Array[]): Float64Array[] { + if (this.calibratedEstimators_ === null) throw new NotFittedError("CalibratedClassifierCV"); + + const n = X.length; + const probs = new Float64Array(n); + + for (const { estimator, A, B } of this.calibratedEstimators_) { + const scores = estimator.predict(X); + for (let i = 0; i < n; i++) { + probs[i] = (probs[i] ?? 0) + sigmoid(A * (scores[i] ?? 0) + B); + } + } + + const k = this.calibratedEstimators_.length; + return Array.from({ length: n }, (_, i) => { + const p = (probs[i] ?? 0) / k; + return new Float64Array([1 - p, p]); + }); + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("CalibratedClassifierCV"); + const classes = this.classes_; + const proba = this.predictProba(X); + const posClass = classes[classes.length - 1] ?? 1; + const negClass = classes[0] ?? 
0; + return new Float64Array(proba.map((p) => ((p[1] ?? 0) >= 0.5 ? posClass : negClass))); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} diff --git a/src/calibration/index.ts b/src/calibration/index.ts new file mode 100644 index 0000000..e03c3f7 --- /dev/null +++ b/src/calibration/index.ts @@ -0,0 +1 @@ +export * from "./calibration.js"; diff --git a/src/cluster/index.ts b/src/cluster/index.ts new file mode 100644 index 0000000..193e946 --- /dev/null +++ b/src/cluster/index.ts @@ -0,0 +1 @@ +export * from "./kmeans.js"; diff --git a/src/cluster/kmeans.ts b/src/cluster/kmeans.ts new file mode 100644 index 0000000..af5ef39 --- /dev/null +++ b/src/cluster/kmeans.ts @@ -0,0 +1,301 @@ +/** + * KMeans and DBSCAN clustering. + * Mirrors sklearn.cluster.KMeans and DBSCAN. + */ + +import { NotFittedError } from "../exceptions.js"; + +function euclideanSq(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) { + s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2; + } + return s; +} + +function euclidean(a: Float64Array, b: Float64Array): number { + return Math.sqrt(euclideanSq(a, b)); +} + +export class KMeans { + nClusters: number; + maxIter: number; + tol: number; + nInit: number; + + clusterCenters_: Float64Array[] | null = null; + labels_: Int32Array | null = null; + inertia_: number = 0; + + constructor( + options: { + nClusters?: number; + maxIter?: number; + tol?: number; + nInit?: number; + } = {}, + ) { + this.nClusters = options.nClusters ?? 8; + this.maxIter = options.maxIter ?? 300; + this.tol = options.tol ?? 1e-4; + this.nInit = options.nInit ?? 10; + } + + private _kmeanspp(X: Float64Array[], k: number): Float64Array[] { + const n = X.length; + const p = (X[0] ?? 
new Float64Array(0)).length; + const centers: Float64Array[] = []; + + // Pick first center randomly + centers.push(new Float64Array(X[Math.floor(Math.random() * n)] ?? new Float64Array(p))); + + for (let c = 1; c < k; c++) { + const dists = X.map((xi) => { + let minD = Infinity; + for (const center of centers) { + const d = euclideanSq(xi, center); + if (d < minD) minD = d; + } + return minD; + }); + const totalDist = dists.reduce((a, b) => a + b, 0); + let rand = Math.random() * totalDist; + let selected = 0; + for (let i = 0; i < n; i++) { + rand -= dists[i] ?? 0; + if (rand <= 0) { + selected = i; + break; + } + } + centers.push(new Float64Array(X[selected] ?? new Float64Array(p))); + } + return centers; + } + + private _run( + X: Float64Array[], + k: number, + ): { centers: Float64Array[]; labels: Int32Array; inertia: number } { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + let centers = this._kmeanspp(X, k); + const labels = new Int32Array(n); + + for (let iter = 0; iter < this.maxIter; iter++) { + // Assignment step + for (let i = 0; i < n; i++) { + let minDist = Infinity; + let minIdx = 0; + for (let c = 0; c < centers.length; c++) { + const d = euclideanSq(X[i] ?? new Float64Array(p), centers[c] ?? new Float64Array(p)); + if (d < minDist) { + minDist = d; + minIdx = c; + } + } + labels[i] = minIdx; + } + + // Update step + const newCenters: Float64Array[] = Array.from({ length: k }, () => new Float64Array(p)); + const counts = new Int32Array(k); + for (let i = 0; i < n; i++) { + const c = labels[i] ?? 0; + counts[c] = (counts[c] ?? 0) + 1; + const xi = X[i] ?? new Float64Array(p); + const center = newCenters[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + center[j] = (center[j] ?? 0) + (xi[j] ?? 0); + } + } + + let maxShift = 0; + for (let c = 0; c < k; c++) { + const cnt = counts[c] ?? 0; + const center = newCenters[c] ?? 
new Float64Array(p); + if (cnt > 0) { + for (let j = 0; j < p; j++) { + center[j] = (center[j] ?? 0) / cnt; + } + } else { + // Re-initialize empty cluster to a random point + const randIdx = Math.floor(Math.random() * n); + newCenters[c] = new Float64Array(X[randIdx] ?? new Float64Array(p)); + } + const shift = euclideanSq(centers[c] ?? new Float64Array(p), newCenters[c] ?? new Float64Array(p)); + if (shift > maxShift) maxShift = shift; + } + centers = newCenters; + if (maxShift < this.tol ** 2) break; + } + + // Compute inertia + let inertia = 0; + for (let i = 0; i < n; i++) { + inertia += euclideanSq(X[i] ?? new Float64Array(p), centers[labels[i] ?? 0] ?? new Float64Array(p)); + } + + return { centers, labels, inertia }; + } + + fit(X: Float64Array[]): this { + const k = Math.min(this.nClusters, X.length); + let best: ReturnType<KMeans["_run"]> | null = null; + + for (let init = 0; init < this.nInit; init++) { + const result = this._run(X, k); + if (best === null || result.inertia < best.inertia) { + best = result; + } + } + + this.clusterCenters_ = best?.centers ?? []; + this.labels_ = best?.labels ?? new Int32Array(X.length); + this.inertia_ = best?.inertia ?? 0; + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (this.clusterCenters_ === null) throw new NotFittedError("KMeans"); + const centers = this.clusterCenters_; + const p = (centers[0] ?? new Float64Array(0)).length; + return new Int32Array( + X.map((xi) => { + let minDist = Infinity; + let minIdx = 0; + for (let c = 0; c < centers.length; c++) { + const d = euclideanSq(xi, centers[c] ?? new Float64Array(p)); + if (d < minDist) { + minDist = d; + minIdx = c; + } + } + return minIdx; + }), + ); + } + + fitPredict(X: Float64Array[]): Int32Array { + this.fit(X); + return this.labels_ as Int32Array; + } + + score(X: Float64Array[]): number { + return -this._computeInertia(X, this.clusterCenters_ ??
[]); + } + + private _computeInertia(X: Float64Array[], centers: Float64Array[]): number { + const p = (centers[0] ?? new Float64Array(0)).length; + let inertia = 0; + for (const xi of X) { + let minDist = Infinity; + for (const c of centers) { + const d = euclideanSq(xi, c.length ? c : new Float64Array(p)); + if (d < minDist) minDist = d; + } + inertia += minDist; + } + return inertia; + } +} + +export class DBSCAN { + eps: number; + minSamples: number; + metric: string; + + labels_: Int32Array | null = null; + coreIndices_: Int32Array | null = null; + + constructor( + options: { + eps?: number; + minSamples?: number; + metric?: string; + } = {}, + ) { + this.eps = options.eps ?? 0.5; + this.minSamples = options.minSamples ?? 5; + this.metric = options.metric ?? "euclidean"; + } + + fitPredict(X: Float64Array[]): Int32Array { + const n = X.length; + const labels = new Int32Array(n).fill(-2); // -2 = unvisited, -1 = noise + let clusterId = 0; + const coreIndices: number[] = []; + + function getNeighbors(idx: number): number[] { + const neighbors: number[] = []; + const xi = X[idx] ?? new Float64Array(0); + for (let j = 0; j < n; j++) { + if (euclidean(xi, X[j] ?? new Float64Array(0)) <= 0.5) { + // placeholder - use eps below + } + } + return neighbors; + } + void getNeighbors; // suppress unused warning + + const eps = this.eps; + const minSamples = this.minSamples; + + function neighbors(idx: number): number[] { + const xi = X[idx] ?? new Float64Array(0); + const result: number[] = []; + for (let j = 0; j < n; j++) { + if (euclidean(xi, X[j] ?? 
new Float64Array(0)) <= eps) { + result.push(j); + } + } + return result; + } + + for (let i = 0; i < n; i++) { + if (labels[i] !== -2) continue; + const nb = neighbors(i); + if (nb.length < minSamples) { + labels[i] = -1; + continue; + } + + coreIndices.push(i); + labels[i] = clusterId; + const queue = [...nb.filter((j) => j !== i)]; + + while (queue.length > 0) { + const j = queue.shift() as number; + if (labels[j] === -1) { + labels[j] = clusterId; + } + if (labels[j] !== -2) continue; + labels[j] = clusterId; + const jNb = neighbors(j); + if (jNb.length >= minSamples) { + coreIndices.push(j); + for (const k of jNb) { + if (labels[k] === -2 || labels[k] === -1) { + queue.push(k); + } + } + } + } + clusterId++; + } + + // Fix any remaining unvisited (noise) + for (let i = 0; i < n; i++) { + if (labels[i] === -2) labels[i] = -1; + } + + this.labels_ = labels; + this.coreIndices_ = new Int32Array(coreIndices); + return labels; + } + + fit(X: Float64Array[]): this { + this.fitPredict(X); + return this; + } +} diff --git a/src/compose/column_transformer.ts b/src/compose/column_transformer.ts new file mode 100644 index 0000000..aebbab1 --- /dev/null +++ b/src/compose/column_transformer.ts @@ -0,0 +1,102 @@ +/** + * ColumnTransformer: applies transformers to columns of an array. + * Mirrors sklearn.compose.ColumnTransformer. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +export interface Transformer { + fit(X: Float64Array[]): this; + transform(X: Float64Array[]): Float64Array[]; + fitTransform?(X: Float64Array[]): Float64Array[]; +} + +export type ColumnSpec = number | number[] | "all"; + +export class ColumnTransformer { + transformers: [string, Transformer | "passthrough" | "drop", ColumnSpec][]; + remainder: "passthrough" | "drop"; + + transformers_: [string, Transformer | "passthrough", ColumnSpec][] = []; + private _nFeatures = 0; + private _allCols = new Set<number>(); + + constructor( + transformers: [string, Transformer | "passthrough" | "drop", ColumnSpec][], + options: { remainder?: "passthrough" | "drop" } = {}, + ) { + this.transformers = transformers; + this.remainder = options.remainder ?? "drop"; + } + + private _getCols(spec: ColumnSpec, nFeatures: number): number[] { + if (spec === "all") return Array.from({ length: nFeatures }, (_, i) => i); + if (typeof spec === "number") return [spec]; + return spec; + } + + fit(X: Float64Array[]): this { + const n = (X[0] ?? new Float64Array(0)).length; + this._nFeatures = n; + this._allCols.clear(); + + this.transformers_ = []; + for (const [name, t, spec] of this.transformers) { + if (t === "drop") continue; + const cols = this._getCols(spec, n); + for (const c of cols) this._allCols.add(c); + + if (t === "passthrough") { + this.transformers_.push([name, "passthrough", spec]); + } else { + const Xsub = X.map((row) => new Float64Array(cols.map((c) => row[c] ?? 0))); + t.fit(Xsub); + this.transformers_.push([name, t, spec]); + } + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.transformers_.length === 0) throw new NotFittedError("ColumnTransformer"); + const n = (X[0] ??
new Float64Array(0)).length; + const parts: Float64Array[][] = []; + + for (const [, t, spec] of this.transformers_) { + const cols = this._getCols(spec, n); + const Xsub = X.map((row) => new Float64Array(cols.map((c) => row[c] ?? 0))); + if (t === "passthrough") { + parts.push(Xsub); + } else { + parts.push(t.transform(Xsub)); + } + } + + if (this.remainder === "passthrough") { + const remainderCols: number[] = []; + for (let c = 0; c < n; c++) { + if (!this._allCols.has(c)) remainderCols.push(c); + } + if (remainderCols.length > 0) { + parts.push(X.map((row) => new Float64Array(remainderCols.map((c) => row[c] ?? 0)))); + } + } + + // Horizontally concatenate + return X.map((_, i) => { + const rowParts = parts.map((p) => p[i] ?? new Float64Array(0)); + const total = rowParts.reduce((s, r) => s + r.length, 0); + const result = new Float64Array(total); + let offset = 0; + for (const part of rowParts) { + result.set(part, offset); + offset += part.length; + } + return result; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} diff --git a/src/compose/index.ts b/src/compose/index.ts new file mode 100644 index 0000000..72b2534 --- /dev/null +++ b/src/compose/index.ts @@ -0,0 +1 @@ +export * from "./column_transformer.js"; diff --git a/src/datasets/index.ts b/src/datasets/index.ts new file mode 100644 index 0000000..98c8f34 --- /dev/null +++ b/src/datasets/index.ts @@ -0,0 +1 @@ +export * from "./make_datasets.js"; diff --git a/src/datasets/make_datasets.ts b/src/datasets/make_datasets.ts new file mode 100644 index 0000000..e0241df --- /dev/null +++ b/src/datasets/make_datasets.ts @@ -0,0 +1,216 @@ +/** + * Synthetic dataset generators. + * Mirrors sklearn.datasets: make_classification, make_regression, make_blobs, + * make_moons, make_circles. + */ + +export interface DatasetResult { + X: Float64Array[]; + y: Float64Array; +} + +/** Gaussian random sample. 
*/ +function randn(): number { + let u = 0; + let v = 0; + while (u === 0) u = Math.random(); + while (v === 0) v = Math.random(); + return Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v); +} + +/** Shuffle arrays in place using Fisher-Yates. */ +function shuffle<T>(arr: T[]): T[] { + for (let i = arr.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + const tmp = arr[i] as T; + arr[i] = arr[j] as T; + arr[j] = tmp; + } + return arr; +} + +export function makeClassification( + options: { + nSamples?: number; + nFeatures?: number; + nClasses?: number; + nInformative?: number; + nRedundant?: number; + noise?: number; + randomState?: number; + } = {}, +): DatasetResult { + const nSamples = options.nSamples ?? 100; + const nFeatures = options.nFeatures ?? 20; + const nClasses = options.nClasses ?? 2; + const nInformative = Math.min(options.nInformative ?? 2, nFeatures); + const noise = options.noise ?? 0.0; + + const X: Float64Array[] = Array.from({ length: nSamples }, () => new Float64Array(nFeatures)); + const y = new Float64Array(nSamples); + + // Cluster centers for each class + const centers: Float64Array[] = Array.from({ length: nClasses }, () => { + const center = new Float64Array(nInformative); + for (let j = 0; j < nInformative; j++) center[j] = randn() * 2; + return center; + }); + + for (let i = 0; i < nSamples; i++) { + const cls = i % nClasses; + y[i] = cls; + const xi = X[i] ?? new Float64Array(nFeatures); + const center = centers[cls] ?? new Float64Array(nInformative); + + for (let j = 0; j < nInformative; j++) { + xi[j] = (center[j] ?? 0) + randn() * 0.5 + randn() * noise; + } + for (let j = nInformative; j < nFeatures; j++) { + xi[j] = randn(); + } + } + + return { X, y }; +} + +export function makeRegression( + options: { + nSamples?: number; + nFeatures?: number; + nInformative?: number; + noise?: number; + bias?: number; + } = {}, +): DatasetResult & { coef: Float64Array } { + const nSamples = options.nSamples ??
100; + const nFeatures = options.nFeatures ?? 100; + const nInformative = Math.min(options.nInformative ?? 10, nFeatures); + const noise = options.noise ?? 0.0; + const bias = options.bias ?? 0.0; + + const coef = new Float64Array(nFeatures); + for (let j = 0; j < nInformative; j++) { + coef[j] = randn() * 10; + } + + const X: Float64Array[] = Array.from({ length: nSamples }, () => { + const xi = new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) xi[j] = randn(); + return xi; + }); + + const y = new Float64Array(nSamples); + for (let i = 0; i < nSamples; i++) { + let yi = bias; + const xi = X[i] ?? new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) { + yi += (xi[j] ?? 0) * (coef[j] ?? 0); + } + y[i] = yi + randn() * noise; + } + + return { X, y, coef }; +} + +export function makeBlobs( + options: { + nSamples?: number; + nFeatures?: number; + centers?: number | Float64Array[]; + clusterStd?: number; + } = {}, +): DatasetResult { + const nSamples = options.nSamples ?? 100; + const nFeatures = options.nFeatures ?? 2; + const clusterStd = options.clusterStd ?? 1.0; + + let centers: Float64Array[]; + if (typeof options.centers === "number" || options.centers === undefined) { + const k = typeof options.centers === "number" ? options.centers : 3; + centers = Array.from({ length: k }, () => { + const c = new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) c[j] = (Math.random() - 0.5) * 20; + return c; + }); + } else { + centers = options.centers; + } + + const k = centers.length; + const X: Float64Array[] = []; + const y: number[] = []; + + for (let i = 0; i < nSamples; i++) { + const cls = i % k; + const center = centers[cls] ?? new Float64Array(nFeatures); + const xi = new Float64Array(nFeatures); + for (let j = 0; j < nFeatures; j++) { + xi[j] = (center[j] ?? 
0) + randn() * clusterStd; + } + X.push(xi); + y.push(cls); + } + + const order = shuffle(Array.from({ length: nSamples }, (_, i) => i)); + return { + X: order.map((i) => X[i] ?? new Float64Array(nFeatures)), + y: new Float64Array(order.map((i) => y[i] ?? 0)), + }; +} + +export function makeMoons( + options: { nSamples?: number; noise?: number } = {}, +): DatasetResult { + const nSamples = options.nSamples ?? 100; + const noise = options.noise ?? 0.0; + const half = Math.floor(nSamples / 2); + + const X: Float64Array[] = []; + const y: number[] = []; + + for (let i = 0; i < half; i++) { + const angle = (Math.PI * i) / half; + X.push(new Float64Array([Math.cos(angle) + randn() * noise, Math.sin(angle) + randn() * noise])); + y.push(0); + } + for (let i = 0; i < nSamples - half; i++) { + const angle = (Math.PI * i) / (nSamples - half); + X.push(new Float64Array([1 - Math.cos(angle) + randn() * noise, 1 - Math.sin(angle) - 0.5 + randn() * noise])); + y.push(1); + } + + const order = shuffle(Array.from({ length: nSamples }, (_, i) => i)); + return { + X: order.map((i) => X[i] ?? new Float64Array(2)), + y: new Float64Array(order.map((i) => y[i] ?? 0)), + }; +} + +export function makeCircles( + options: { nSamples?: number; noise?: number; factor?: number } = {}, +): DatasetResult { + const nSamples = options.nSamples ?? 100; + const noise = options.noise ?? 0.0; + const factor = options.factor ?? 
0.8; + const half = Math.floor(nSamples / 2); + + const X: Float64Array[] = []; + const y: number[] = []; + + for (let i = 0; i < half; i++) { + const angle = (2 * Math.PI * i) / half; + X.push(new Float64Array([Math.cos(angle) + randn() * noise, Math.sin(angle) + randn() * noise])); + y.push(0); + } + for (let i = 0; i < nSamples - half; i++) { + const angle = (2 * Math.PI * i) / (nSamples - half); + X.push(new Float64Array([factor * Math.cos(angle) + randn() * noise, factor * Math.sin(angle) + randn() * noise])); + y.push(1); + } + + const order = shuffle(Array.from({ length: nSamples }, (_, i) => i)); + return { + X: order.map((i) => X[i] ?? new Float64Array(2)), + y: new Float64Array(order.map((i) => y[i] ?? 0)), + }; +} diff --git a/src/decomposition/index.ts b/src/decomposition/index.ts new file mode 100644 index 0000000..6bb90c3 --- /dev/null +++ b/src/decomposition/index.ts @@ -0,0 +1,2 @@ +export * from "./pca.js"; +export * from "./nmf.js"; diff --git a/src/decomposition/nmf.ts b/src/decomposition/nmf.ts new file mode 100644 index 0000000..4f12e86 --- /dev/null +++ b/src/decomposition/nmf.ts @@ -0,0 +1,154 @@ +/** + * Non-negative Matrix Factorization (NMF). + * Mirrors sklearn.decomposition.NMF. + * Uses multiplicative update rules. + */ + +import { NotFittedError } from "../exceptions.js"; + +function mulUpdate( + X: Float64Array[], + W: Float64Array[], + H: Float64Array[], + alpha: number, + maxIter: number, +): void { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const k = H.length; + const eps = 1e-10; + + for (let iter = 0; iter < maxIter; iter++) { + // Update H + for (let c = 0; c < k; c++) { + for (let j = 0; j < p; j++) { + let num = 0; + let den = 0; + for (let i = 0; i < n; i++) { + const wic = (W[i] ?? new Float64Array(k))[c] ?? 0; + const xij = (X[i] ?? new Float64Array(p))[j] ?? 0; + num += wic * xij; + let whij = 0; + for (let l = 0; l < k; l++) { + whij += + ((W[i] ?? new Float64Array(k))[l] ?? 0) * + ((H[l] ?? 
new Float64Array(p))[j] ?? 0); + } + den += wic * whij; + } + const hjc = (H[c] ?? new Float64Array(p))[j] ?? 0; + (H[c] ?? new Float64Array(p))[j] = + (hjc * (num + eps)) / (den + alpha + eps); + } + } + + // Update W + for (let i = 0; i < n; i++) { + for (let c = 0; c < k; c++) { + let num = 0; + let den = 0; + for (let j = 0; j < p; j++) { + const hjc = (H[c] ?? new Float64Array(p))[j] ?? 0; + const xij = (X[i] ?? new Float64Array(p))[j] ?? 0; + num += xij * hjc; + let whij = 0; + for (let l = 0; l < k; l++) { + whij += + ((W[i] ?? new Float64Array(k))[l] ?? 0) * + ((H[l] ?? new Float64Array(p))[j] ?? 0); + } + den += whij * hjc; + } + const wic = (W[i] ?? new Float64Array(k))[c] ?? 0; + (W[i] ?? new Float64Array(k))[c] = + (wic * (num + eps)) / (den + alpha + eps); + } + } + } +} + +export class NMF { + nComponents: number; + maxIter: number; + tol: number; + alpha: number; + + components_: Float64Array[] | null = null; + reconstructionErr_: number = 0; + + constructor( + options: { + nComponents?: number; + maxIter?: number; + tol?: number; + alpha?: number; + } = {}, + ) { + this.nComponents = options.nComponents ?? 2; + this.maxIter = options.maxIter ?? 200; + this.tol = options.tol ?? 1e-4; + this.alpha = options.alpha ?? 0.0; + } + + fit(X: Float64Array[]): this { + this._fitTransform(X); + return this; + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this._fitTransform(X); + } + + private _fitTransform(X: Float64Array[]): Float64Array[] { + const n = X.length; + const p = (X[0] ?? 
new Float64Array(0)).length; + const k = Math.min(this.nComponents, n, p); + + const W: Float64Array[] = Array.from({ length: n }, () => { + const row = new Float64Array(k); + for (let j = 0; j < k; j++) row[j] = Math.random() * 0.1 + 0.01; + return row; + }); + const H: Float64Array[] = Array.from({ length: k }, () => { + const row = new Float64Array(p); + for (let j = 0; j < p; j++) row[j] = Math.random() * 0.1 + 0.01; + return row; + }); + + mulUpdate(X, W, H, this.alpha, this.maxIter); + + // Compute reconstruction error + let err = 0; + for (let i = 0; i < n; i++) { + for (let j = 0; j < p; j++) { + let approx = 0; + for (let c = 0; c < k; c++) { + approx += + ((W[i] ?? new Float64Array(k))[c] ?? 0) * + ((H[c] ?? new Float64Array(p))[j] ?? 0); + } + const diff = ((X[i] ?? new Float64Array(p))[j] ?? 0) - approx; + err += diff * diff; + } + } + this.reconstructionErr_ = Math.sqrt(err); + this.components_ = H; + return W; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.components_ === null) throw new NotFittedError("NMF"); + const n = X.length; + const k = this.components_.length; + + const W: Float64Array[] = Array.from({ length: n }, () => { + const row = new Float64Array(k); + for (let j = 0; j < k; j++) row[j] = Math.random() * 0.1 + 0.01; + return row; + }); + const H = this.components_; + + mulUpdate(X, W, H, this.alpha, this.maxIter); + return W; + } +} diff --git a/src/decomposition/pca.ts b/src/decomposition/pca.ts new file mode 100644 index 0000000..e1ae890 --- /dev/null +++ b/src/decomposition/pca.ts @@ -0,0 +1,244 @@ +/** + * PCA (Principal Component Analysis) and TruncatedSVD. + * Mirrors sklearn.decomposition.PCA and TruncatedSVD. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Compute mean of each column. */ +function colMeans(X: Float64Array[], p: number): Float64Array { + const means = new Float64Array(p); + for (const xi of X) { + for (let j = 0; j < p; j++) { + means[j] = (means[j] ?? 0) + (xi[j] ?? 
0); + } + } + for (let j = 0; j < p; j++) { + means[j] = (means[j] ?? 0) / X.length; + } + return means; +} + +/** Power iteration to find top-k eigenvectors (randomized SVD). */ +function randomizedSVD( + X: Float64Array[], + nComponents: number, + nIter = 5, +): { components: Float64Array[]; explainedVariance: Float64Array } { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const k = Math.min(nComponents, n, p); + + // Build components via power iteration + const components: Float64Array[] = []; + const explainedVariance = new Float64Array(k); + + // Make a copy to deflate + const Xwork: Float64Array[] = X.map((xi) => new Float64Array(xi)); + + for (let c = 0; c < k; c++) { + // Random init + let v = new Float64Array(p); + for (let j = 0; j < p; j++) v[j] = Math.random() - 0.5; + + // Normalize + let norm = Math.sqrt(v.reduce((s, x) => s + x ** 2, 0)); + if (norm > 0) { + for (let j = 0; j < p; j++) v[j] = (v[j] ?? 0) / norm; + } + + for (let iter = 0; iter < nIter * 10; iter++) { + // v = X^T X v + const u = new Float64Array(p); + // First compute Xv + const Xv = new Float64Array(n); + for (let i = 0; i < n; i++) { + let dot = 0; + const xi = Xwork[i] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + dot += (xi[j] ?? 0) * (v[j] ?? 0); + } + Xv[i] = dot; + } + // Then X^T (Xv) + for (let i = 0; i < n; i++) { + const xi = Xwork[i] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + u[j] = (u[j] ?? 0) + (Xv[i] ?? 0) * (xi[j] ?? 0); + } + } + norm = Math.sqrt(u.reduce((s, x) => s + x ** 2, 0)); + if (norm === 0) break; + for (let j = 0; j < p; j++) u[j] = (u[j] ?? 0) / norm; + + let diff = 0; + for (let j = 0; j < p; j++) diff += (u[j] ?? 0 - (v[j] ?? 0)) ** 2; + v = u; + if (diff < 1e-10) break; + } + + components.push(v); + + // Compute eigenvalue (variance along this component) + let variance = 0; + for (let i = 0; i < n; i++) { + let dot = 0; + const xi = Xwork[i] ?? 
new Float64Array(p); + for (let j = 0; j < p; j++) { + dot += (xi[j] ?? 0) * (v[j] ?? 0); + } + variance += dot ** 2; + } + explainedVariance[c] = variance / n; + + // Deflate X + for (let i = 0; i < n; i++) { + const xi = Xwork[i] ?? new Float64Array(p); + let dot = 0; + for (let j = 0; j < p; j++) dot += (xi[j] ?? 0) * (v[j] ?? 0); + for (let j = 0; j < p; j++) { + xi[j] = (xi[j] ?? 0) - dot * (v[j] ?? 0); + } + } + } + + return { components, explainedVariance }; +} + +export class PCA { + nComponents: number; + whiten: boolean; + + components_: Float64Array[] | null = null; + explainedVariance_: Float64Array | null = null; + explainedVarianceRatio_: Float64Array | null = null; + mean_: Float64Array | null = null; + + constructor( + options: { nComponents?: number; whiten?: boolean } = {}, + ) { + this.nComponents = options.nComponents ?? 2; + this.whiten = options.whiten ?? false; + } + + fit(X: Float64Array[]): this { + const p = (X[0] ?? new Float64Array(0)).length; + this.mean_ = colMeans(X, p); + const centered = X.map((xi) => { + const row = new Float64Array(p); + for (let j = 0; j < p; j++) { + row[j] = (xi[j] ?? 0) - ((this.mean_ as Float64Array)[j] ?? 0); + } + return row; + }); + + const { components, explainedVariance } = randomizedSVD(centered, this.nComponents); + this.components_ = components; + this.explainedVariance_ = explainedVariance; + const totalVar = Array.from(explainedVariance).reduce((a, b) => a + b, 0); + this.explainedVarianceRatio_ = new Float64Array( + explainedVariance.map((v) => (totalVar > 0 ? v / totalVar : 0)), + ); + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.components_ === null || this.mean_ === null) + throw new NotFittedError("PCA"); + + const p = (X[0] ?? new Float64Array(0)).length; + const k = this.components_.length; + + return X.map((xi) => { + const result = new Float64Array(k); + for (let c = 0; c < k; c++) { + const comp = (this.components_ as Float64Array[])[c] ?? 
new Float64Array(p); + let dot = 0; + for (let j = 0; j < p; j++) { + dot += ((xi[j] ?? 0) - ((this.mean_ as Float64Array)[j] ?? 0)) * (comp[j] ?? 0); + } + if (this.whiten) { + const ev = ((this.explainedVariance_ as Float64Array)[c] ?? 1); + result[c] = ev > 0 ? dot / Math.sqrt(ev) : dot; + } else { + result[c] = dot; + } + } + return result; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } + + inverseTransform(X: Float64Array[]): Float64Array[] { + if (this.components_ === null || this.mean_ === null) + throw new NotFittedError("PCA"); + const k = (X[0] ?? new Float64Array(0)).length; + const p = (this.components_[0] ?? new Float64Array(0)).length; + return X.map((xi) => { + const result = new Float64Array(p); + for (let c = 0; c < k; c++) { + const comp = (this.components_ as Float64Array[])[c] ?? new Float64Array(p); + const scale = this.whiten + ? (xi[c] ?? 0) * Math.sqrt((this.explainedVariance_ as Float64Array)[c] ?? 1) + : (xi[c] ?? 0); + for (let j = 0; j < p; j++) { + result[j] = (result[j] ?? 0) + scale * (comp[j] ?? 0); + } + } + for (let j = 0; j < p; j++) { + result[j] = (result[j] ?? 0) + ((this.mean_ as Float64Array)[j] ?? 0); + } + return result; + }); + } +} + +export class TruncatedSVD { + nComponents: number; + nIter: number; + + components_: Float64Array[] | null = null; + explainedVariance_: Float64Array | null = null; + explainedVarianceRatio_: Float64Array | null = null; + + constructor( + options: { nComponents?: number; nIter?: number } = {}, + ) { + this.nComponents = options.nComponents ?? 2; + this.nIter = options.nIter ?? 
5; + } + + fit(X: Float64Array[]): this { + const { components, explainedVariance } = randomizedSVD(X, this.nComponents, this.nIter); + this.components_ = components; + this.explainedVariance_ = explainedVariance; + const totalVar = Array.from(explainedVariance).reduce((a, b) => a + b, 0); + this.explainedVarianceRatio_ = new Float64Array( + explainedVariance.map((v) => (totalVar > 0 ? v / totalVar : 0)), + ); + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.components_ === null) throw new NotFittedError("TruncatedSVD"); + const p = (X[0] ?? new Float64Array(0)).length; + const k = this.components_.length; + return X.map((xi) => { + const result = new Float64Array(k); + for (let c = 0; c < k; c++) { + const comp = (this.components_ as Float64Array[])[c] ?? new Float64Array(p); + let dot = 0; + for (let j = 0; j < p; j++) dot += (xi[j] ?? 0) * (comp[j] ?? 0); + result[c] = dot; + } + return result; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} diff --git a/src/discriminant_analysis/index.ts b/src/discriminant_analysis/index.ts new file mode 100644 index 0000000..c4645b0 --- /dev/null +++ b/src/discriminant_analysis/index.ts @@ -0,0 +1 @@ +export * from "./lda.js"; diff --git a/src/discriminant_analysis/lda.ts b/src/discriminant_analysis/lda.ts new file mode 100644 index 0000000..9b936d4 --- /dev/null +++ b/src/discriminant_analysis/lda.ts @@ -0,0 +1,301 @@ +/** + * Linear Discriminant Analysis (LDA) and Quadratic Discriminant Analysis (QDA). + * Mirrors sklearn.discriminant_analysis.LinearDiscriminantAnalysis and + * QuadraticDiscriminantAnalysis. + */ + +import { NotFittedError } from "../exceptions.js"; + +function dotVec(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) s += (a[i] ?? 0) * (b[i] ?? 
0); + return s; +} + +function matVec(M: Float64Array[], v: Float64Array): Float64Array { + return new Float64Array(M.map((row) => dotVec(row, v))); +} + +/** Solve Ax = b via Gaussian elimination. */ +function solveLinear(A: Float64Array[], b: Float64Array): Float64Array { + const n = A.length; + const aug: number[][] = A.map((row, i) => [...Array.from(row), b[i] ?? 0]); + + for (let col = 0; col < n; col++) { + let pivotRow = col; + for (let r = col + 1; r < n; r++) { + if (Math.abs((aug[r] as number[])[col] ?? 0) > Math.abs((aug[pivotRow] as number[])[col] ?? 0)) { + pivotRow = r; + } + } + [aug[col], aug[pivotRow]] = [aug[pivotRow] as number[], aug[col] as number[]]; + + const pivot = (aug[col] as number[])[col] ?? 0; + if (Math.abs(pivot) < 1e-12) continue; + + for (let r = 0; r < n; r++) { + if (r === col) continue; + const factor = ((aug[r] as number[])[col] ?? 0) / pivot; + for (let c = col; c <= n; c++) { + (aug[r] as number[])[c] = ((aug[r] as number[])[c] ?? 0) - factor * ((aug[col] as number[])[c] ?? 0); + } + } + } + + const result = new Float64Array(n); + for (let i = 0; i < n; i++) { + const pivot = (aug[i] as number[])[i] ?? 0; + result[i] = pivot !== 0 ? ((aug[i] as number[])[n] ?? 0) / pivot : 0; + } + return result; +} + +export class LinearDiscriminantAnalysis { + nComponents: number | null; + solverTol: number; + + coef_: Float64Array[] | null = null; + intercept_: Float64Array | null = null; + classes_: Float64Array | null = null; + means_: Float64Array[] | null = null; + scalings_: Float64Array[] | null = null; + priors_: Float64Array | null = null; + + constructor(options: { nComponents?: number; solverTol?: number } = {}) { + this.nComponents = options.nComponents ?? null; + this.solverTol = options.solverTol ?? 1e-4; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? 
new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const nClasses = uniqueClasses.length; + const classToIdx = new Map(uniqueClasses.map((c, i) => [c, i])); + + // Compute class means and priors + const means: Float64Array[] = Array.from({ length: nClasses }, () => new Float64Array(p)); + const counts = new Int32Array(nClasses); + + for (let i = 0; i < n; i++) { + const c = classToIdx.get(y[i] ?? 0) ?? 0; + counts[c] = (counts[c] ?? 0) + 1; + const xi = X[i] ?? new Float64Array(p); + const mean = means[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) + (xi[j] ?? 0); + } + for (let c = 0; c < nClasses; c++) { + const cnt = counts[c] ?? 1; + const mean = means[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) / cnt; + } + + this.means_ = means; + this.priors_ = new Float64Array(nClasses); + for (let c = 0; c < nClasses; c++) { + this.priors_[c] = (counts[c] ?? 0) / n; + } + + // Compute within-class scatter matrix (pooled covariance) + const Sw: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p)); + for (let i = 0; i < n; i++) { + const c = classToIdx.get(y[i] ?? 0) ?? 0; + const xi = X[i] ?? new Float64Array(p); + const mean = means[c] ?? new Float64Array(p); + const diff = new Float64Array(p); + for (let j = 0; j < p; j++) diff[j] = (xi[j] ?? 0) - (mean[j] ?? 0); + for (let j = 0; j < p; j++) { + const sw = Sw[j] ?? new Float64Array(p); + for (let k = 0; k < p; k++) { + sw[k] = (sw[k] ?? 0) + (diff[j] ?? 0) * (diff[k] ?? 0); + } + } + } + + // Add regularization + for (let j = 0; j < p; j++) { + const sw = Sw[j] ?? new Float64Array(p); + sw[j] = (sw[j] ?? 
0) + this.solverTol * n; + } + + // Compute coefficients: coef = Sw^{-1} (mu_1 - mu_0) for binary case + // For multi-class, compute coef for each class + this.coef_ = []; + this.intercept_ = new Float64Array(nClasses); + + for (let c = 0; c < nClasses; c++) { + const meanC = means[c] ?? new Float64Array(p); + const coefC = solveLinear(Sw, meanC); + this.coef_.push(coefC); + const prior = (this.priors_[c] ?? 0); + let dotMeanCCoef = dotVec(meanC, coefC); + this.intercept_[c] = -0.5 * dotMeanCCoef + Math.log(prior + 1e-10); + } + + return this; + } + + decisionFunction(X: Float64Array[]): Float64Array[] { + if (this.coef_ === null) throw new NotFittedError("LinearDiscriminantAnalysis"); + return X.map((xi) => { + return new Float64Array( + (this.coef_ as Float64Array[]).map((coefC, c) => + dotVec(xi, coefC) + ((this.intercept_ as Float64Array)[c] ?? 0), + ), + ); + }); + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("LinearDiscriminantAnalysis"); + const classes = this.classes_; + const decisions = this.decisionFunction(X); + return new Float64Array( + decisions.map((d) => { + let maxIdx = 0; + let maxVal = d[0] ?? -Infinity; + for (let c = 1; c < d.length; c++) { + if ((d[c] ?? -Infinity) > maxVal) { + maxVal = d[c] ?? -Infinity; + maxIdx = c; + } + } + return classes[maxIdx] ?? 
0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } + + transform(X: Float64Array[]): Float64Array[] { + return this.decisionFunction(X); + } +} + +export class QuadraticDiscriminantAnalysis { + regParam: number; + + classes_: Float64Array | null = null; + means_: Float64Array[] | null = null; + covariances_: Float64Array[][] | null = null; + priors_: Float64Array | null = null; + + constructor(options: { regParam?: number } = {}) { + this.regParam = options.regParam ?? 0.0; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const nClasses = uniqueClasses.length; + const classToIdx = new Map(uniqueClasses.map((c, i) => [c, i])); + + const means: Float64Array[] = Array.from({ length: nClasses }, () => new Float64Array(p)); + const covs: Float64Array[][] = Array.from({ length: nClasses }, () => + Array.from({ length: p }, () => new Float64Array(p)), + ); + const counts = new Int32Array(nClasses); + + for (let i = 0; i < n; i++) { + const c = classToIdx.get(y[i] ?? 0) ?? 0; + counts[c] = (counts[c] ?? 0) + 1; + const xi = X[i] ?? new Float64Array(p); + const mean = means[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) + (xi[j] ?? 0); + } + for (let c = 0; c < nClasses; c++) { + const cnt = counts[c] ?? 1; + const mean = means[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) / cnt; + } + + for (let i = 0; i < n; i++) { + const c = classToIdx.get(y[i] ?? 0) ?? 0; + const xi = X[i] ?? new Float64Array(p); + const mean = means[c] ?? new Float64Array(p); + const cov = covs[c] ?? 
[]; + const diff = new Float64Array(p); + for (let j = 0; j < p; j++) diff[j] = (xi[j] ?? 0) - (mean[j] ?? 0); + for (let j = 0; j < p; j++) { + const row = cov[j] ?? new Float64Array(p); + for (let k = 0; k < p; k++) { + row[k] = (row[k] ?? 0) + (diff[j] ?? 0) * (diff[k] ?? 0); + } + } + } + + for (let c = 0; c < nClasses; c++) { + const cnt = counts[c] ?? 1; + const cov = covs[c] ?? []; + for (let j = 0; j < p; j++) { + const row = cov[j] ?? new Float64Array(p); + for (let k = 0; k < p; k++) { + row[k] = (row[k] ?? 0) / cnt; + if (j === k) row[k] = (row[k] ?? 0) + this.regParam; + } + } + } + + this.means_ = means; + this.covariances_ = covs; + this.priors_ = new Float64Array(nClasses); + for (let c = 0; c < nClasses; c++) { + this.priors_[c] = (counts[c] ?? 0) / n; + } + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("QuadraticDiscriminantAnalysis"); + const classes = this.classes_; + const nClasses = classes.length; + const p = (X[0] ?? new Float64Array(0)).length; + + return new Float64Array( + X.map((xi) => { + let maxScore = -Infinity; + let maxIdx = 0; + for (let c = 0; c < nClasses; c++) { + const mean = (this.means_ as Float64Array[])[c] ?? new Float64Array(p); + const cov = (this.covariances_ as Float64Array[][])[c] ?? []; + const prior = (this.priors_ as Float64Array)[c] ?? 0; + + const diff = new Float64Array(p); + for (let j = 0; j < p; j++) diff[j] = (xi[j] ?? 0) - (mean[j] ?? 0); + + const solved = solveLinear(cov.length > 0 ? cov as Float64Array[] : [new Float64Array(p)], diff); + let mahal = dotVec(diff, solved); + + const score = -0.5 * mahal + Math.log(prior + 1e-10); + if (score > maxScore) { + maxScore = score; + maxIdx = c; + } + } + return classes[maxIdx] ?? 
0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +void matVec; // suppress unused diff --git a/src/ensemble/gradient_boosting.ts b/src/ensemble/gradient_boosting.ts new file mode 100644 index 0000000..0a712d7 --- /dev/null +++ b/src/ensemble/gradient_boosting.ts @@ -0,0 +1,195 @@ +/** + * Gradient Boosting Classifier and Regressor. + * Mirrors sklearn.ensemble.GradientBoostingClassifier / GradientBoostingRegressor. + */ + +import { NotFittedError } from "../exceptions.js"; +import { DecisionTreeRegressor } from "../tree/decision_tree.js"; + +function sigmoid(x: number): number { + return 1 / (1 + Math.exp(-Math.max(-500, Math.min(500, x)))); +} + +export class GradientBoostingRegressor { + nEstimators: number; + learningRate: number; + maxDepth: number; + subsample: number; + + estimators_: DecisionTreeRegressor[] | null = null; + initialPred_: number = 0; + + constructor( + options: { + nEstimators?: number; + learningRate?: number; + maxDepth?: number; + subsample?: number; + } = {}, + ) { + this.nEstimators = options.nEstimators ?? 100; + this.learningRate = options.learningRate ?? 0.1; + this.maxDepth = options.maxDepth ?? 3; + this.subsample = options.subsample ?? 1.0; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + this.initialPred_ = Array.from(y).reduce((a, b) => a + b, 0) / n; + const pred = new Float64Array(n).fill(this.initialPred_); + + this.estimators_ = []; + for (let t = 0; t < this.nEstimators; t++) { + const residuals = new Float64Array(n); + for (let i = 0; i < n; i++) { + residuals[i] = (y[i] ?? 0) - (pred[i] ?? 
0); + } + + // Subsample + let sampleIdx: number[]; + if (this.subsample < 1.0) { + const k = Math.max(1, Math.round(n * this.subsample)); + sampleIdx = Array.from({ length: n }, (_, i) => i); + for (let i = n - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + const tmp = sampleIdx[i] as number; + sampleIdx[i] = sampleIdx[j] as number; + sampleIdx[j] = tmp; + } + sampleIdx = sampleIdx.slice(0, k); + } else { + sampleIdx = Array.from({ length: n }, (_, i) => i); + } + + const XSub = sampleIdx.map((i) => X[i] ?? new Float64Array(0)); + const rSub = new Float64Array(sampleIdx.map((i) => residuals[i] ?? 0)); + + const tree = new DecisionTreeRegressor({ maxDepth: this.maxDepth }); + tree.fit(XSub, rSub); + this.estimators_.push(tree); + + const treePred = tree.predict(X); + for (let i = 0; i < n; i++) { + pred[i] = (pred[i] ?? 0) + this.learningRate * (treePred[i] ?? 0); + } + } + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.estimators_ === null) throw new NotFittedError("GradientBoostingRegressor"); + const pred = new Float64Array(X.length).fill(this.initialPred_); + for (const tree of this.estimators_) { + const tp = tree.predict(X); + for (let i = 0; i < pred.length; i++) { + pred[i] = (pred[i] ?? 0) + this.learningRate * (tp[i] ?? 0); + } + } + return pred; + } + + score(X: Float64Array[], y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 
1 - ssRes / ssTot : 0; + } +} + +export class GradientBoostingClassifier { + nEstimators: number; + learningRate: number; + maxDepth: number; + + estimators_: DecisionTreeRegressor[] | null = null; + initialPred_: number = 0; + classes_: Float64Array | null = null; + + constructor( + options: { + nEstimators?: number; + learningRate?: number; + maxDepth?: number; + } = {}, + ) { + this.nEstimators = options.nEstimators ?? 100; + this.learningRate = options.learningRate ?? 0.1; + this.maxDepth = options.maxDepth ?? 3; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const posClass = uniqueClasses[uniqueClasses.length - 1] ?? 1; + + // Binary cross-entropy + const yBin = new Float64Array(n); + for (let i = 0; i < n; i++) { + yBin[i] = (y[i] ?? 0) === posClass ? 1 : 0; + } + + const posRate = Array.from(yBin).reduce((a, b) => a + b, 0) / n; + this.initialPred_ = Math.log((posRate + 1e-10) / (1 - posRate + 1e-10)); + const F = new Float64Array(n).fill(this.initialPred_); + + this.estimators_ = []; + for (let t = 0; t < this.nEstimators; t++) { + const residuals = new Float64Array(n); + for (let i = 0; i < n; i++) { + const p = sigmoid(F[i] ?? 0); + residuals[i] = (yBin[i] ?? 0) - p; + } + + const tree = new DecisionTreeRegressor({ maxDepth: this.maxDepth }); + tree.fit(X, residuals); + this.estimators_.push(tree); + + const tp = tree.predict(X); + for (let i = 0; i < n; i++) { + F[i] = (F[i] ?? 0) + this.learningRate * (tp[i] ?? 0); + } + } + + return this; + } + + predictProba(X: Float64Array[]): Float64Array[] { + if (this.estimators_ === null) throw new NotFittedError("GradientBoostingClassifier"); + const F = new Float64Array(X.length).fill(this.initialPred_); + for (const tree of this.estimators_) { + const tp = tree.predict(X); + for (let i = 0; i < F.length; i++) { + F[i] = (F[i] ?? 
0) + this.learningRate * (tp[i] ?? 0); + } + } + return Array.from(F).map((f) => { + const p = sigmoid(f); + return new Float64Array([1 - p, p]); + }); + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("GradientBoostingClassifier"); + const classes = this.classes_; + const proba = this.predictProba(X); + const posClass = classes[classes.length - 1] ?? 1; + const negClass = classes[0] ?? 0; + return new Float64Array(proba.map((p) => ((p[1] ?? 0) >= 0.5 ? posClass : negClass))); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} diff --git a/src/ensemble/index.ts b/src/ensemble/index.ts new file mode 100644 index 0000000..eed96db --- /dev/null +++ b/src/ensemble/index.ts @@ -0,0 +1,2 @@ +export * from "./random_forest.js"; +export * from "./gradient_boosting.js"; diff --git a/src/ensemble/random_forest.ts b/src/ensemble/random_forest.ts new file mode 100644 index 0000000..f1cf50b --- /dev/null +++ b/src/ensemble/random_forest.ts @@ -0,0 +1,215 @@ +/** + * Random Forest Classifier and Regressor. + * Mirrors sklearn.ensemble.RandomForestClassifier / RandomForestRegressor. 
+ */ + +import { NotFittedError } from "../exceptions.js"; +import { DecisionTreeClassifier, DecisionTreeRegressor } from "../tree/decision_tree.js"; + +function bootstrapSample(n: number): number[] { + const indices: number[] = []; + for (let i = 0; i < n; i++) { + indices.push(Math.floor(Math.random() * n)); + } + return indices; +} + +export class RandomForestClassifier { + nEstimators: number; + maxDepth: number; + minSamplesSplit: number; + maxFeatures: number | "sqrt" | "log2"; + + estimators_: DecisionTreeClassifier[] | null = null; + classes_: Float64Array | null = null; + + constructor( + options: { + nEstimators?: number; + maxDepth?: number; + minSamplesSplit?: number; + maxFeatures?: number | "sqrt" | "log2"; + } = {}, + ) { + this.nEstimators = options.nEstimators ?? 100; + this.maxDepth = options.maxDepth ?? Infinity; + this.minSamplesSplit = options.minSamplesSplit ?? 2; + this.maxFeatures = options.maxFeatures ?? "sqrt"; + } + + private _getFeatureSubset(nFeatures: number): number[] { + let k: number; + if (this.maxFeatures === "sqrt") k = Math.max(1, Math.round(Math.sqrt(nFeatures))); + else if (this.maxFeatures === "log2") k = Math.max(1, Math.round(Math.log2(nFeatures))); + else k = Math.min(nFeatures, this.maxFeatures as number); + + const indices = Array.from({ length: nFeatures }, (_, i) => i); + for (let i = indices.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + const tmp = indices[i] as number; + indices[i] = indices[j] as number; + indices[j] = tmp; + } + return indices.slice(0, k); + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const nFeatures = (X[0] ?? 
new Float64Array(0)).length; + this.classes_ = new Float64Array( + Array.from(new Set(Array.from(y))).sort((a, b) => a - b), + ); + + this.estimators_ = []; + for (let t = 0; t < this.nEstimators; t++) { + const sampleIdx = bootstrapSample(n); + const featIdx = this._getFeatureSubset(nFeatures); + + const XSub = sampleIdx.map((i) => { + const xi = X[i] ?? new Float64Array(nFeatures); + return new Float64Array(featIdx.map((f) => xi[f] ?? 0)); + }); + const ySub = new Float64Array(sampleIdx.map((i) => y[i] ?? 0)); + + const tree = new DecisionTreeClassifier({ + maxDepth: this.maxDepth, + minSamplesSplit: this.minSamplesSplit, + }); + tree.fit(XSub, ySub); + // Store feature indices with tree + (tree as DecisionTreeClassifier & { featIdx_: number[] }).featIdx_ = featIdx; + this.estimators_.push(tree); + } + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.estimators_ === null || this.classes_ === null) + throw new NotFittedError("RandomForestClassifier"); + + const classes = this.classes_; + return new Float64Array( + X.map((xi) => { + const votes = new Map(); + for (const tree of this.estimators_ as (DecisionTreeClassifier & { featIdx_: number[] })[]) { + const featIdx = tree.featIdx_; + const xSub = new Float64Array(featIdx.map((f) => xi[f] ?? 0)); + const pred = (tree.predict([xSub]))[0] ?? 0; + votes.set(pred, (votes.get(pred) ?? 0) + 1); + } + let bestClass = classes[0] ?? 
0; + let bestCount = 0; + for (const [cls, cnt] of votes) { + if (cnt > bestCount) { + bestCount = cnt; + bestClass = cls; + } + } + return bestClass; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class RandomForestRegressor { + nEstimators: number; + maxDepth: number; + minSamplesSplit: number; + maxFeatures: number | "sqrt" | "log2"; + + estimators_: DecisionTreeRegressor[] | null = null; + + constructor( + options: { + nEstimators?: number; + maxDepth?: number; + minSamplesSplit?: number; + maxFeatures?: number | "sqrt" | "log2"; + } = {}, + ) { + this.nEstimators = options.nEstimators ?? 100; + this.maxDepth = options.maxDepth ?? Infinity; + this.minSamplesSplit = options.minSamplesSplit ?? 2; + this.maxFeatures = options.maxFeatures ?? "sqrt"; + } + + private _getFeatureSubset(nFeatures: number): number[] { + let k: number; + if (this.maxFeatures === "sqrt") k = Math.max(1, Math.round(Math.sqrt(nFeatures))); + else if (this.maxFeatures === "log2") k = Math.max(1, Math.round(Math.log2(nFeatures))); + else k = Math.min(nFeatures, this.maxFeatures as number); + + const indices = Array.from({ length: nFeatures }, (_, i) => i); + for (let i = indices.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + const tmp = indices[i] as number; + indices[i] = indices[j] as number; + indices[j] = tmp; + } + return indices.slice(0, k); + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const nFeatures = (X[0] ?? new Float64Array(0)).length; + + this.estimators_ = []; + for (let t = 0; t < this.nEstimators; t++) { + const sampleIdx = bootstrapSample(n); + const featIdx = this._getFeatureSubset(nFeatures); + + const XSub = sampleIdx.map((i) => { + const xi = X[i] ?? 
new Float64Array(nFeatures); + return new Float64Array(featIdx.map((f) => xi[f] ?? 0)); + }); + const ySub = new Float64Array(sampleIdx.map((i) => y[i] ?? 0)); + + const tree = new DecisionTreeRegressor({ + maxDepth: this.maxDepth, + minSamplesSplit: this.minSamplesSplit, + }); + tree.fit(XSub, ySub); + (tree as DecisionTreeRegressor & { featIdx_: number[] }).featIdx_ = featIdx; + this.estimators_.push(tree); + } + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.estimators_ === null) throw new NotFittedError("RandomForestRegressor"); + return new Float64Array( + X.map((xi) => { + let sum = 0; + for (const tree of this.estimators_ as (DecisionTreeRegressor & { featIdx_: number[] })[]) { + const featIdx = tree.featIdx_; + const xSub = new Float64Array(featIdx.map((f) => xi[f] ?? 0)); + sum += (tree.predict([xSub]))[0] ?? 0; + } + return sum / (this.estimators_?.length ?? 1); + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 1 - ssRes / ssTot : 0; + } +} diff --git a/src/feature_selection/index.ts b/src/feature_selection/index.ts new file mode 100644 index 0000000..e8b722a --- /dev/null +++ b/src/feature_selection/index.ts @@ -0,0 +1 @@ +export * from "./univariate.js"; diff --git a/src/feature_selection/univariate.ts b/src/feature_selection/univariate.ts new file mode 100644 index 0000000..ce9d945 --- /dev/null +++ b/src/feature_selection/univariate.ts @@ -0,0 +1,248 @@ +/** + * Feature selection utilities. + * Mirrors sklearn.feature_selection: SelectKBest, SelectPercentile, + * VarianceThreshold, chi2, f_classif, f_regression. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +export type ScoreFn = (X: Float64Array[], y: Float64Array) => [Float64Array, Float64Array]; + +/** F-score for classification (ANOVA F-test). */ +export function fClassif(X: Float64Array[], y: Float64Array): [Float64Array, Float64Array] { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))); + const k = uniqueClasses.length; + + const fScores = new Float64Array(p); + const pValues = new Float64Array(p); + + for (let j = 0; j < p; j++) { + const overall = Array.from(X).map((xi) => xi[j] ?? 0); + const grandMean = overall.reduce((a, b) => a + b, 0) / n; + + let ssBetween = 0; + let ssWithin = 0; + + for (const cls of uniqueClasses) { + const groupVals = Array.from(y) + .map((yi, i) => (yi === cls ? (X[i] ?? new Float64Array(p))[j] ?? 0 : null)) + .filter((v): v is number => v !== null); + const groupMean = groupVals.reduce((a, b) => a + b, 0) / (groupVals.length || 1); + ssBetween += groupVals.length * (groupMean - grandMean) ** 2; + ssWithin += groupVals.reduce((s, v) => s + (v - groupMean) ** 2, 0); + } + + const dfBetween = k - 1; + const dfWithin = n - k; + const msBetween = dfBetween > 0 ? ssBetween / dfBetween : 0; + const msWithin = dfWithin > 0 ? ssWithin / dfWithin : 1e-10; + + fScores[j] = msWithin > 0 ? msBetween / msWithin : 0; + // Approximate p-value (simplified: not exact F distribution CDF) + pValues[j] = Math.exp(-(fScores[j] ?? 0) / 2); + } + + return [fScores, pValues]; +} + +/** F-score for regression. */ +export function fRegression(X: Float64Array[], y: Float64Array): [Float64Array, Float64Array] { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / n; + + const fScores = new Float64Array(p); + const pValues = new Float64Array(p); + + for (let j = 0; j < p; j++) { + const xVals = Array.from(X).map((xi) => xi[j] ?? 
0); + const xMean = xVals.reduce((a, b) => a + b, 0) / n; + + let ssXY = 0; + let ssXX = 0; + for (let i = 0; i < n; i++) { + const dx = (xVals[i] ?? 0) - xMean; + ssXY += dx * ((y[i] ?? 0) - yMean); + ssXX += dx ** 2; + } + + if (ssXX === 0) { + fScores[j] = 0; + pValues[j] = 1; + continue; + } + + const slope = ssXY / ssXX; + const intercept = yMean - slope * xMean; + + let ssRes = 0; + let ssTot = 0; + for (let i = 0; i < n; i++) { + const pred = slope * (xVals[i] ?? 0) + intercept; + ssRes += ((y[i] ?? 0) - pred) ** 2; + ssTot += ((y[i] ?? 0) - yMean) ** 2; + } + + const r2 = ssTot > 0 ? 1 - ssRes / ssTot : 0; + fScores[j] = r2 > 0 && r2 < 1 ? (r2 / 1) / ((1 - r2) / (n - 2)) : 0; + pValues[j] = Math.exp(-(fScores[j] ?? 0) / 2); + } + + return [fScores, pValues]; +} + +/** Chi-squared test statistic for non-negative features. */ +export function chi2(X: Float64Array[], y: Float64Array): [Float64Array, Float64Array] { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))); + + const chiScores = new Float64Array(p); + const pValues = new Float64Array(p); + + for (let j = 0; j < p; j++) { + let chi = 0; + for (const cls of uniqueClasses) { + const classIdx = Array.from(y).map((yi, i) => yi === cls ? i : -1).filter(i => i >= 0); + const expected = classIdx.length / n; + for (let i of classIdx) { + const obs = (X[i] ?? new Float64Array(p))[j] ?? 0; + const exp = expected * Array.from(X).reduce((s, xi) => s + (xi[j] ?? 
0), 0) / n; + if (exp > 0) chi += (obs - exp) ** 2 / exp; + } + } + chiScores[j] = chi; + pValues[j] = Math.exp(-chi / 2); + } + + return [chiScores, pValues]; +} + +export class SelectKBest { + k: number; + scoreFunc: ScoreFn; + + scores_: Float64Array | null = null; + pValues_: Float64Array | null = null; + selectedIndices_: number[] | null = null; + + constructor( + scoreFunc: ScoreFn = fClassif, + options: { k?: number } = {}, + ) { + this.scoreFunc = scoreFunc; + this.k = options.k ?? 10; + } + + fit(X: Float64Array[], y: Float64Array): this { + const [scores, pValues] = this.scoreFunc(X, y); + this.scores_ = scores; + this.pValues_ = pValues; + + const k = Math.min(this.k, scores.length); + const indices = Array.from({ length: scores.length }, (_, i) => i); + indices.sort((a, b) => (scores[b] ?? 0) - (scores[a] ?? 0)); + this.selectedIndices_ = indices.slice(0, k).sort((a, b) => a - b); + + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.selectedIndices_ === null) throw new NotFittedError("SelectKBest"); + const sel = this.selectedIndices_; + return X.map((xi) => new Float64Array(sel.map((j) => xi[j] ?? 0))); + } + + fitTransform(X: Float64Array[], y: Float64Array): Float64Array[] { + return this.fit(X, y).transform(X); + } + + getSupport(): boolean[] { + if (this.selectedIndices_ === null || this.scores_ === null) + throw new NotFittedError("SelectKBest"); + const n = this.scores_.length; + const selected = new Set(this.selectedIndices_); + return Array.from({ length: n }, (_, i) => selected.has(i)); + } +} + +export class SelectPercentile { + percentile: number; + scoreFunc: ScoreFn; + + scores_: Float64Array | null = null; + selectedIndices_: number[] | null = null; + + constructor( + scoreFunc: ScoreFn = fClassif, + options: { percentile?: number } = {}, + ) { + this.scoreFunc = scoreFunc; + this.percentile = options.percentile ?? 
10; + } + + fit(X: Float64Array[], y: Float64Array): this { + const [scores] = this.scoreFunc(X, y); + this.scores_ = scores; + + const k = Math.max(1, Math.round((this.percentile / 100) * scores.length)); + const indices = Array.from({ length: scores.length }, (_, i) => i); + indices.sort((a, b) => (scores[b] ?? 0) - (scores[a] ?? 0)); + this.selectedIndices_ = indices.slice(0, k).sort((a, b) => a - b); + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.selectedIndices_ === null) throw new NotFittedError("SelectPercentile"); + const sel = this.selectedIndices_; + return X.map((xi) => new Float64Array(sel.map((j) => xi[j] ?? 0))); + } + + fitTransform(X: Float64Array[], y: Float64Array): Float64Array[] { + return this.fit(X, y).transform(X); + } +} + +export class VarianceThreshold { + threshold: number; + + variances_: Float64Array | null = null; + selectedIndices_: number[] | null = null; + + constructor(options: { threshold?: number } = {}) { + this.threshold = options.threshold ?? 0.0; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + + const variances = new Float64Array(p); + for (let j = 0; j < p; j++) { + let mean = 0; + for (const xi of X) mean += xi[j] ?? 0; + mean /= n; + let variance = 0; + for (const xi of X) variance += ((xi[j] ?? 0) - mean) ** 2; + variances[j] = variance / n; + } + + this.variances_ = variances; + this.selectedIndices_ = Array.from({ length: p }, (_, i) => i).filter( + (i) => (variances[i] ?? 0) > this.threshold, + ); + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.selectedIndices_ === null) throw new NotFittedError("VarianceThreshold"); + const sel = this.selectedIndices_; + return X.map((xi) => new Float64Array(sel.map((j) => xi[j] ?? 
0))); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} diff --git a/src/impute/index.ts b/src/impute/index.ts new file mode 100644 index 0000000..70555a5 --- /dev/null +++ b/src/impute/index.ts @@ -0,0 +1 @@ +export * from "./simple_imputer.js"; diff --git a/src/impute/simple_imputer.ts b/src/impute/simple_imputer.ts new file mode 100644 index 0000000..b261724 --- /dev/null +++ b/src/impute/simple_imputer.ts @@ -0,0 +1,88 @@ +/** + * Imputers for missing values. + * Mirrors sklearn.impute.SimpleImputer. + */ + +import { NotFittedError } from "../exceptions.js"; + +export class SimpleImputer { + strategy: string; + fillValue: number; + missingValues: number; + + statistics_: Float64Array | null = null; + + constructor( + options: { + strategy?: string; + fillValue?: number; + missingValues?: number; + } = {}, + ) { + this.strategy = options.strategy ?? "mean"; + this.fillValue = options.fillValue ?? 0; + this.missingValues = options.missingValues ?? NaN; + } + + private _isMissing(x: number): boolean { + return isNaN(this.missingValues) ? isNaN(x) : x === this.missingValues; + } + + fit(X: Float64Array[]): this { + const p = (X[0] ?? new Float64Array(0)).length; + const stats = new Float64Array(p); + + for (let j = 0; j < p; j++) { + const vals: number[] = []; + for (const xi of X) { + const v = xi[j] ?? 0; + if (!this._isMissing(v)) vals.push(v); + } + + if (this.strategy === "mean") { + stats[j] = vals.length > 0 ? vals.reduce((a, b) => a + b, 0) / vals.length : 0; + } else if (this.strategy === "median") { + vals.sort((a, b) => a - b); + const mid = Math.floor(vals.length / 2); + stats[j] = + vals.length % 2 === 0 + ? ((vals[mid - 1] ?? 0) + (vals[mid] ?? 0)) / 2 + : (vals[mid] ?? 0); + } else if (this.strategy === "most_frequent") { + const counts = new Map(); + for (const v of vals) counts.set(v, (counts.get(v) ?? 
0) + 1); + let best = 0; + let bestCnt = 0; + for (const [v, cnt] of counts) { + if (cnt > bestCnt) { + bestCnt = cnt; + best = v; + } + } + stats[j] = best; + } else { + stats[j] = this.fillValue; + } + } + + this.statistics_ = stats; + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.statistics_ === null) throw new NotFittedError("SimpleImputer"); + const stats = this.statistics_; + return X.map((xi) => { + const result = new Float64Array(xi.length); + for (let j = 0; j < xi.length; j++) { + const v = xi[j] ?? 0; + result[j] = this._isMissing(v) ? (stats[j] ?? 0) : v; + } + return result; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} diff --git a/src/index.ts b/src/index.ts index 0d022c2..56dcb93 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,14 +1,5 @@ /** * tsikit-learn — A complete TypeScript port of scikit-learn. - * - * Ported modules (Phase 1 + Phase 2 + linear_model): - * - exceptions: NotFittedError, ConvergenceWarning, ValueError - * - base: BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, ClusterMixin - * - utils: extmath, validation, multiclass, class_weight - * - preprocessing: StandardScaler, MinMaxScaler, LabelEncoder, Normalizer - * - metrics: regression (mse, mae, r2), classification (accuracy, precision, recall, f1) - * - model_selection: train_test_split, KFold, StratifiedKFold - * - linear_model: LinearRegression, Ridge */ // Core @@ -29,3 +20,55 @@ export * from "./model_selection/index.js"; // Linear models export * from "./linear_model/index.js"; + +// SVM +export * from "./svm/index.js"; + +// Tree +export * from "./tree/index.js"; + +// Ensemble +export * from "./ensemble/index.js"; + +// Neighbors +export * from "./neighbors/index.js"; + +// Naive Bayes +export * from "./naive_bayes/index.js"; + +// Cluster +export * from "./cluster/index.js"; + +// Decomposition +export * from "./decomposition/index.js"; + +// Neural network 
+export * from "./neural_network/index.js"; + +// Pipeline +export * from "./pipeline/index.js"; + +// Impute +export * from "./impute/index.js"; + +// Feature selection +export * from "./feature_selection/index.js"; + +// Compose +export * from "./compose/index.js"; + +// Datasets +export * from "./datasets/index.js"; + +// Discriminant analysis +export * from "./discriminant_analysis/index.js"; + +// Isotonic +export * from "./isotonic/index.js"; + +// Multiclass +export * from "./multiclass/index.js"; + +// Calibration +export * from "./calibration/index.js"; + diff --git a/src/isotonic/index.ts b/src/isotonic/index.ts new file mode 100644 index 0000000..664687a --- /dev/null +++ b/src/isotonic/index.ts @@ -0,0 +1 @@ +export * from "./isotonic.js"; diff --git a/src/isotonic/isotonic.ts b/src/isotonic/isotonic.ts new file mode 100644 index 0000000..0c04394 --- /dev/null +++ b/src/isotonic/isotonic.ts @@ -0,0 +1,121 @@ +/** + * Isotonic Regression. + * Mirrors sklearn.isotonic.IsotonicRegression. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Pool Adjacent Violators (PAV) algorithm for isotonic regression. */ +function poolAdjacentViolators(y: Float64Array, increasing: boolean): Float64Array { + const n = y.length; + const result = new Float64Array(y); + + // Simple PAVA + let changed = true; + while (changed) { + changed = false; + let i = 0; + while (i < n - 1) { + if (increasing ? (result[i] ?? 0) > (result[i + 1] ?? 0) : (result[i] ?? 0) < (result[i + 1] ?? 0)) { + // Merge block + const mean = ((result[i] ?? 0) + (result[i + 1] ?? 
0)) / 2; + result[i] = mean; + result[i + 1] = mean; + changed = true; + } + i++; + } + } + + return result; +} + +export class IsotonicRegression { + increasing: boolean | "auto"; + outOfBounds: string; + + XThresholds_: Float64Array | null = null; + yThresholds_: Float64Array | null = null; + + constructor( + options: { increasing?: boolean | "auto"; outOfBounds?: string } = {}, + ) { + this.increasing = options.increasing ?? true; + this.outOfBounds = options.outOfBounds ?? "nan"; + } + + fit(X: Float64Array, y: Float64Array): this { + const n = X.length; + const order = Array.from({ length: n }, (_, i) => i).sort( + (a, b) => (X[a] ?? 0) - (X[b] ?? 0), + ); + + const xSorted = new Float64Array(order.map((i) => X[i] ?? 0)); + const ySorted = new Float64Array(order.map((i) => y[i] ?? 0)); + + const incr = + this.increasing === "auto" + ? (() => { + // Estimate direction from correlation + const xMean = Array.from(xSorted).reduce((a, b) => a + b, 0) / n; + const yMean = Array.from(ySorted).reduce((a, b) => a + b, 0) / n; + let cov = 0; + for (let i = 0; i < n; i++) { + cov += ((xSorted[i] ?? 0) - xMean) * ((ySorted[i] ?? 0) - yMean); + } + return cov >= 0; + })() + : this.increasing; + + const fitted = poolAdjacentViolators(ySorted, incr as boolean); + + this.XThresholds_ = xSorted; + this.yThresholds_ = fitted; + + return this; + } + + predict(X: Float64Array): Float64Array { + if (this.XThresholds_ === null || this.yThresholds_ === null) + throw new NotFittedError("IsotonicRegression"); + + const xThresh = this.XThresholds_; + const yThresh = this.yThresholds_; + + return new Float64Array( + Array.from(X).map((xi) => { + if (xi <= (xThresh[0] ?? xi)) return yThresh[0] ?? 0; + if (xi >= (xThresh[xThresh.length - 1] ?? xi)) return yThresh[yThresh.length - 1] ?? 0; + + // Binary search for interpolation + let lo = 0; + let hi = xThresh.length - 1; + while (lo < hi - 1) { + const mid = Math.floor((lo + hi) / 2); + if ((xThresh[mid] ?? 
0) <= xi) lo = mid; + else hi = mid; + } + + const x0 = xThresh[lo] ?? 0; + const x1 = xThresh[hi] ?? 0; + const y0 = yThresh[lo] ?? 0; + const y1 = yThresh[hi] ?? 0; + + if (x1 === x0) return (y0 + y1) / 2; + return y0 + ((y1 - y0) * (xi - x0)) / (x1 - x0); + }), + ); + } + + score(X: Float64Array, y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 1 - ssRes / ssTot : 0; + } +} diff --git a/src/linear_model/index.ts b/src/linear_model/index.ts index 1875ef5..45c27d0 100644 --- a/src/linear_model/index.ts +++ b/src/linear_model/index.ts @@ -1,2 +1,6 @@ export * from "./linear_regression.js"; export * from "./ridge.js"; +export * from "./logistic_regression.js"; +export * from "./lasso.js"; +export * from "./sgd.js"; +export * from "./perceptron.js"; diff --git a/src/linear_model/lasso.ts b/src/linear_model/lasso.ts new file mode 100644 index 0000000..e226add --- /dev/null +++ b/src/linear_model/lasso.ts @@ -0,0 +1,180 @@ +/** + * Lasso and ElasticNet regression via coordinate descent. + * Mirrors sklearn.linear_model.Lasso and ElasticNet. + */ + +import { NotFittedError } from "../exceptions.js"; + +function softThreshold(x: number, threshold: number): number { + if (x > threshold) return x - threshold; + if (x < -threshold) return x + threshold; + return 0; +} + +export class Lasso { + alpha: number; + fitIntercept: boolean; + maxIter: number; + tol: number; + + coef_: Float64Array | null = null; + intercept_: number = 0; + + constructor( + options: { + alpha?: number; + fitIntercept?: boolean; + maxIter?: number; + tol?: number; + } = {}, + ) { + this.alpha = options.alpha ?? 1.0; + this.fitIntercept = options.fitIntercept ?? true; + this.maxIter = options.maxIter ?? 1000; + this.tol = options.tol ?? 
1e-4; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const w = new Float64Array(p); + let intercept = 0; + + // Center data if fitIntercept + const yMean = this.fitIntercept + ? Array.from(y).reduce((a, b) => a + b, 0) / n + : 0; + + // Coordinate descent + for (let iter = 0; iter < this.maxIter; iter++) { + let maxDelta = 0; + for (let j = 0; j < p; j++) { + // Compute partial residual + let rho = 0; + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + let pred = intercept; + for (let k = 0; k < p; k++) { + if (k !== j) pred += (w[k] ?? 0) * (xi[k] ?? 0); + } + rho += (xi[j] ?? 0) * ((y[i] ?? 0) - yMean - pred); + } + rho /= n; + const normSq = + Array.from(X).reduce((s, xi) => s + (xi[j] ?? 0) ** 2, 0) / n; + const wOld = w[j] ?? 0; + w[j] = normSq > 0 ? softThreshold(rho, this.alpha) / normSq : 0; + const delta = Math.abs((w[j] ?? 0) - wOld); + if (delta > maxDelta) maxDelta = delta; + } + if (this.fitIntercept) { + let predSum = 0; + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + let pred = 0; + for (let j = 0; j < p; j++) { + pred += (w[j] ?? 0) * (xi[j] ?? 0); + } + predSum += (y[i] ?? 0) - pred; + } + intercept = predSum / n; + } + if (maxDelta < this.tol) break; + } + + this.coef_ = w; + this.intercept_ = intercept; + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.coef_ === null) throw new NotFittedError("Lasso"); + const coef = this.coef_; + return new Float64Array( + X.map((xi) => { + let pred = this.intercept_; + for (let j = 0; j < xi.length; j++) { + pred += (coef[j] ?? 0) * (xi[j] ?? 0); + } + return pred; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 
0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 1 - ssRes / ssTot : 0; + } +} + +export class ElasticNet extends Lasso { + l1Ratio: number; + + constructor( + options: { + alpha?: number; + l1Ratio?: number; + fitIntercept?: boolean; + maxIter?: number; + tol?: number; + } = {}, + ) { + super(options); + this.l1Ratio = options.l1Ratio ?? 0.5; + } + + override fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const w = new Float64Array(p); + let intercept = 0; + const l1 = this.alpha * this.l1Ratio; + const l2 = this.alpha * (1 - this.l1Ratio); + + for (let iter = 0; iter < this.maxIter; iter++) { + let maxDelta = 0; + for (let j = 0; j < p; j++) { + let rho = 0; + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + let pred = intercept; + for (let k = 0; k < p; k++) { + if (k !== j) pred += (w[k] ?? 0) * (xi[k] ?? 0); + } + rho += (xi[j] ?? 0) * ((y[i] ?? 0) - pred); + } + rho /= n; + const normSq = + Array.from(X).reduce((s, xi) => s + (xi[j] ?? 0) ** 2, 0) / n + l2; + const wOld = w[j] ?? 0; + w[j] = normSq > 0 ? softThreshold(rho, l1) / normSq : 0; + const delta = Math.abs((w[j] ?? 0) - wOld); + if (delta > maxDelta) maxDelta = delta; + } + if (this.fitIntercept) { + let predSum = 0; + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + let pred = 0; + for (let j = 0; j < p; j++) { + pred += (w[j] ?? 0) * (xi[j] ?? 0); + } + predSum += (y[i] ?? 0) - pred; + } + intercept = predSum / n; + } + if (maxDelta < this.tol) break; + } + + this.coef_ = w; + this.intercept_ = intercept; + return this; + } +} diff --git a/src/linear_model/logistic_regression.ts b/src/linear_model/logistic_regression.ts new file mode 100644 index 0000000..0150602 --- /dev/null +++ b/src/linear_model/logistic_regression.ts @@ -0,0 +1,120 @@ +/** + * Logistic Regression classifier. + * Mirrors sklearn.linear_model.LogisticRegression. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +function sigmoid(x: number): number { + return 1 / (1 + Math.exp(-x)); +} + +export class LogisticRegression { + C: number; + maxIter: number; + tol: number; + fitIntercept: boolean; + + coef_: Float64Array | null = null; + intercept_: number = 0; + classes_: Float64Array | null = null; + + constructor( + options: { + C?: number; + maxIter?: number; + tol?: number; + fitIntercept?: boolean; + } = {}, + ) { + this.C = options.C ?? 1.0; + this.maxIter = options.maxIter ?? 100; + this.tol = options.tol ?? 1e-4; + this.fitIntercept = options.fitIntercept ?? true; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const nFeatures = (X[0] ?? new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort( + (a, b) => a - b, + ); + this.classes_ = new Float64Array(uniqueClasses); + + // Binary logistic regression via gradient descent + const w = new Float64Array(nFeatures); + let b = 0; + const lr = 0.1; + const lambda = 1 / (this.C * n); + + // Map labels to 0/1 + const yBin = new Float64Array(n); + const posClass = uniqueClasses[uniqueClasses.length - 1] ?? 1; + for (let i = 0; i < n; i++) { + yBin[i] = (y[i] ?? 0) === posClass ? 1 : 0; + } + + for (let iter = 0; iter < this.maxIter; iter++) { + const gradW = new Float64Array(nFeatures); + let gradB = 0; + + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(nFeatures); + let dot = b; + for (let j = 0; j < nFeatures; j++) { + dot += (w[j] ?? 0) * (xi[j] ?? 0); + } + const p = sigmoid(dot); + const err = p - (yBin[i] ?? 0); + for (let j = 0; j < nFeatures; j++) { + gradW[j] = (gradW[j] ?? 0) + err * (xi[j] ?? 0); + } + gradB += err; + } + + let maxGrad = 0; + for (let j = 0; j < nFeatures; j++) { + const g = (gradW[j] ?? 0) / n + lambda * (w[j] ?? 0); + w[j] = (w[j] ?? 
0) - lr * g; + if (Math.abs(g) > maxGrad) maxGrad = Math.abs(g); + } + if (this.fitIntercept) { + b -= lr * (gradB / n); + } + if (maxGrad < this.tol) break; + } + + this.coef_ = w; + this.intercept_ = b; + return this; + } + + predictProba(X: Float64Array[]): Float64Array[] { + if (this.coef_ === null) throw new NotFittedError("LogisticRegression"); + return X.map((xi) => { + let dot = this.intercept_; + for (let j = 0; j < xi.length; j++) { + dot += ((this.coef_ as Float64Array)[j] ?? 0) * (xi[j] ?? 0); + } + const p = sigmoid(dot); + return new Float64Array([1 - p, p]); + }); + } + + predict(X: Float64Array[]): Float64Array { + const proba = this.predictProba(X); + const classes = this.classes_ as Float64Array; + return new Float64Array( + proba.map((p) => ((p[1] ?? 0) >= 0.5 ? (classes[1] ?? 1) : (classes[0] ?? 0))), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} diff --git a/src/linear_model/perceptron.ts b/src/linear_model/perceptron.ts new file mode 100644 index 0000000..b3e511d --- /dev/null +++ b/src/linear_model/perceptron.ts @@ -0,0 +1,97 @@ +/** + * Perceptron classifier. + * Mirrors sklearn.linear_model.Perceptron. + */ + +import { NotFittedError } from "../exceptions.js"; + +export class Perceptron { + alpha: number; + maxIter: number; + tol: number; + fitIntercept: boolean; + eta0: number; + + coef_: Float64Array | null = null; + intercept_: number = 0; + classes_: Float64Array | null = null; + + constructor( + options: { + alpha?: number; + maxIter?: number; + tol?: number; + fitIntercept?: boolean; + eta0?: number; + } = {}, + ) { + this.alpha = options.alpha ?? 1e-4; + this.maxIter = options.maxIter ?? 1000; + this.tol = options.tol ?? 1e-3; + this.fitIntercept = options.fitIntercept ?? true; + this.eta0 = options.eta0 ?? 
1.0; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + this.classes_ = new Float64Array( + Array.from(new Set(Array.from(y))).sort((a, b) => a - b), + ); + + const w = new Float64Array(p); + let b = 0; + const posClass = (this.classes_[this.classes_.length - 1]) ?? 1; + + for (let iter = 0; iter < this.maxIter; iter++) { + let errors = 0; + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + let dot = b; + for (let j = 0; j < p; j++) { + dot += (w[j] ?? 0) * (xi[j] ?? 0); + } + const yBin = (y[i] ?? 0) === posClass ? 1 : -1; + const pred = dot >= 0 ? 1 : -1; + if (pred !== yBin) { + errors++; + for (let j = 0; j < p; j++) { + w[j] = (w[j] ?? 0) + this.eta0 * yBin * (xi[j] ?? 0); + } + if (this.fitIntercept) { + b += this.eta0 * yBin; + } + } + } + if (errors === 0) break; + } + + this.coef_ = w; + this.intercept_ = b; + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.coef_ === null) throw new NotFittedError("Perceptron"); + const classes = this.classes_ as Float64Array; + const coef = this.coef_; + return new Float64Array( + X.map((xi) => { + let dot = this.intercept_; + for (let j = 0; j < xi.length; j++) { + dot += (coef[j] ?? 0) * (xi[j] ?? 0); + } + return dot >= 0 ? (classes[classes.length - 1] ?? 1) : (classes[0] ?? 0); + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} diff --git a/src/linear_model/sgd.ts b/src/linear_model/sgd.ts new file mode 100644 index 0000000..11dfd1c --- /dev/null +++ b/src/linear_model/sgd.ts @@ -0,0 +1,199 @@ +/** + * SGD Classifier and Regressor. + * Mirrors sklearn.linear_model.SGDClassifier / SGDRegressor. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +function sigmoid(x: number): number { + return 1 / (1 + Math.exp(-x)); +} + +export class SGDClassifier { + loss: string; + alpha: number; + maxIter: number; + tol: number; + eta0: number; + fitIntercept: boolean; + + coef_: Float64Array | null = null; + intercept_: number = 0; + classes_: Float64Array | null = null; + + constructor( + options: { + loss?: string; + alpha?: number; + maxIter?: number; + tol?: number; + eta0?: number; + fitIntercept?: boolean; + } = {}, + ) { + this.loss = options.loss ?? "hinge"; + this.alpha = options.alpha ?? 1e-4; + this.maxIter = options.maxIter ?? 1000; + this.tol = options.tol ?? 1e-3; + this.eta0 = options.eta0 ?? 0.01; + this.fitIntercept = options.fitIntercept ?? true; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + this.classes_ = new Float64Array( + Array.from(new Set(Array.from(y))).sort((a, b) => a - b), + ); + + const w = new Float64Array(p); + let b = 0; + const posClass = (this.classes_[this.classes_.length - 1]) ?? 1; + + for (let iter = 0; iter < this.maxIter; iter++) { + let totalLoss = 0; + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + let dot = b; + for (let j = 0; j < p; j++) { + dot += (w[j] ?? 0) * (xi[j] ?? 0); + } + const yLabel = (y[i] ?? 0) === posClass ? 1 : -1; + + let grad = 0; + if (this.loss === "hinge") { + const margin = yLabel * dot; + if (margin < 1) { + grad = -yLabel; + totalLoss += 1 - margin; + } + } else { + // log loss + const p2 = sigmoid(yLabel * dot); + grad = -(1 - p2) * yLabel; + totalLoss += -Math.log(p2 + 1e-15); + } + + for (let j = 0; j < p; j++) { + w[j] = (w[j] ?? 0) * (1 - this.eta0 * this.alpha) - this.eta0 * grad * (xi[j] ?? 
0); + } + if (this.fitIntercept) { + b -= this.eta0 * grad; + } + } + if (totalLoss / n < this.tol) break; + } + + this.coef_ = w; + this.intercept_ = b; + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.coef_ === null) throw new NotFittedError("SGDClassifier"); + const classes = this.classes_ as Float64Array; + const coef = this.coef_; + return new Float64Array( + X.map((xi) => { + let dot = this.intercept_; + for (let j = 0; j < xi.length; j++) { + dot += (coef[j] ?? 0) * (xi[j] ?? 0); + } + return dot >= 0 ? (classes[classes.length - 1] ?? 1) : (classes[0] ?? 0); + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class SGDRegressor { + alpha: number; + maxIter: number; + tol: number; + eta0: number; + fitIntercept: boolean; + + coef_: Float64Array | null = null; + intercept_: number = 0; + + constructor( + options: { + alpha?: number; + maxIter?: number; + tol?: number; + eta0?: number; + fitIntercept?: boolean; + } = {}, + ) { + this.alpha = options.alpha ?? 1e-4; + this.maxIter = options.maxIter ?? 1000; + this.tol = options.tol ?? 1e-3; + this.eta0 = options.eta0 ?? 0.01; + this.fitIntercept = options.fitIntercept ?? true; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const w = new Float64Array(p); + let b = 0; + + for (let iter = 0; iter < this.maxIter; iter++) { + let totalLoss = 0; + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + let pred = b; + for (let j = 0; j < p; j++) { + pred += (w[j] ?? 0) * (xi[j] ?? 0); + } + const err = pred - (y[i] ?? 0); + totalLoss += err ** 2; + for (let j = 0; j < p; j++) { + w[j] = (w[j] ?? 0) * (1 - this.eta0 * this.alpha) - this.eta0 * err * (xi[j] ?? 
0); + } + if (this.fitIntercept) { + b -= this.eta0 * err; + } + } + if (totalLoss / n < this.tol) break; + } + + this.coef_ = w; + this.intercept_ = b; + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.coef_ === null) throw new NotFittedError("SGDRegressor"); + const coef = this.coef_; + return new Float64Array( + X.map((xi) => { + let pred = this.intercept_; + for (let j = 0; j < xi.length; j++) { + pred += (coef[j] ?? 0) * (xi[j] ?? 0); + } + return pred; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 1 - ssRes / ssTot : 0; + } +} diff --git a/src/metrics/clustering.ts b/src/metrics/clustering.ts new file mode 100644 index 0000000..9a1cecd --- /dev/null +++ b/src/metrics/clustering.ts @@ -0,0 +1,155 @@ +/** + * Clustering metrics. + * Mirrors sklearn.metrics.cluster. + */ + +export function silhouetteScore(X: Float64Array[], labels: Int32Array): number { + const n = X.length; + if (n === 0) return 0; + + function dist(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) { + s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2; + } + return Math.sqrt(s); + } + + const scores = new Float64Array(n); + const uniqueLabels = Array.from(new Set(Array.from(labels))); + + for (let i = 0; i < n; i++) { + const li = labels[i] ?? 0; + const xi = X[i] ?? new Float64Array(0); + + // a(i): mean distance to same cluster + let aSumI = 0; + let aCountI = 0; + for (let j = 0; j < n; j++) { + if (i !== j && labels[j] === li) { + aSumI += dist(xi, X[j] ?? new Float64Array(0)); + aCountI++; + } + } + const ai = aCountI > 0 ? 
aSumI / aCountI : 0; + + // b(i): min mean distance to other clusters + let bi = Infinity; + for (const otherLabel of uniqueLabels) { + if (otherLabel === li) continue; + let bSum = 0; + let bCount = 0; + for (let j = 0; j < n; j++) { + if (labels[j] === otherLabel) { + bSum += dist(xi, X[j] ?? new Float64Array(0)); + bCount++; + } + } + if (bCount > 0) { + const bMean = bSum / bCount; + if (bMean < bi) bi = bMean; + } + } + if (!isFinite(bi)) bi = 0; + + const maxAB = Math.max(ai, bi); + scores[i] = maxAB > 0 ? (bi - ai) / maxAB : 0; + } + + return Array.from(scores).reduce((a, b) => a + b, 0) / n; +} + +export function adjustedRandScore( + labelsTrue: Int32Array, + labelsPred: Int32Array, +): number { + const n = labelsTrue.length; + const uniqueTrue = Array.from(new Set(Array.from(labelsTrue))); + const uniquePred = Array.from(new Set(Array.from(labelsPred))); + + // Contingency table + const contingency = new Map(); + for (let i = 0; i < n; i++) { + const key = `${labelsTrue[i] ?? 0},${labelsPred[i] ?? 0}`; + contingency.set(key, (contingency.get(key) ?? 0) + 1); + } + + function comb2(x: number): number { + return x < 2 ? 0 : (x * (x - 1)) / 2; + } + + let sumComb = 0; + for (const val of contingency.values()) { + sumComb += comb2(val); + } + + const rowSums = new Map(); + const colSums = new Map(); + for (let i = 0; i < n; i++) { + const r = labelsTrue[i] ?? 0; + const c = labelsPred[i] ?? 0; + rowSums.set(r, (rowSums.get(r) ?? 0) + 1); + colSums.set(c, (colSums.get(c) ?? 0) + 1); + } + + let sumRowComb = 0; + for (const v of rowSums.values()) sumRowComb += comb2(v); + let sumColComb = 0; + for (const v of colSums.values()) sumColComb += comb2(v); + + const total = comb2(n); + const expected = (sumRowComb * sumColComb) / (total || 1); + const maxVal = (sumRowComb + sumColComb) / 2; + const denom = maxVal - expected; + + return denom === 0 ? (sumComb === expected ? 
1 : 0) : (sumComb - expected) / denom; +} + +export function homogeneityScore( + labelsTrue: Int32Array, + labelsPred: Int32Array, +): number { + const n = labelsTrue.length; + if (n === 0) return 1; + + function entropy(labels: Int32Array): number { + const counts = new Map(); + for (const l of labels) counts.set(l, (counts.get(l) ?? 0) + 1); + let h = 0; + for (const c of counts.values()) { + const p = c / n; + h -= p * Math.log(p); + } + return h; + } + + const hC = entropy(labelsTrue); + if (hC === 0) return 1; + + // Conditional entropy H(C|K) + const contingency = new Map>(); + for (let i = 0; i < n; i++) { + const k = labelsPred[i] ?? 0; + const c = labelsTrue[i] ?? 0; + if (!contingency.has(k)) contingency.set(k, new Map()); + const m = contingency.get(k) as Map; + m.set(c, (m.get(c) ?? 0) + 1); + } + + const kCounts = new Map(); + for (let i = 0; i < n; i++) { + const k = labelsPred[i] ?? 0; + kCounts.set(k, (kCounts.get(k) ?? 0) + 1); + } + + let hCK = 0; + for (const [k, cMap] of contingency) { + const nK = kCounts.get(k) ?? 0; + for (const cnt of cMap.values()) { + const p = cnt / nK; + hCK += (nK / n) * (-p * Math.log(p + 1e-15)); + } + } + + return 1 - hCK / hC; +} diff --git a/src/metrics/index.ts b/src/metrics/index.ts index 96b3cab..7e7d7a2 100644 --- a/src/metrics/index.ts +++ b/src/metrics/index.ts @@ -1,2 +1,3 @@ export * from "./regression.js"; export * from "./classification.js"; +export * from "./clustering.js"; diff --git a/src/model_selection/index.ts b/src/model_selection/index.ts index 35a025e..8b94168 100644 --- a/src/model_selection/index.ts +++ b/src/model_selection/index.ts @@ -1 +1,2 @@ export * from "./split.js"; +export * from "./search.js"; diff --git a/src/model_selection/search.ts b/src/model_selection/search.ts new file mode 100644 index 0000000..2c2148e --- /dev/null +++ b/src/model_selection/search.ts @@ -0,0 +1,145 @@ +/** + * Grid search and cross-validation utilities. 
+ * Mirrors sklearn.model_selection.GridSearchCV and cross_val_score. + */ + +import { KFold } from "./split.js"; + +export interface Estimator { + fit(X: Float64Array[], y: Float64Array): this; + score(X: Float64Array[], y: Float64Array): number; +} + +export interface GridParams { + [key: string]: number | string | boolean; +} + +function cartesianProduct(paramGrid: Record): GridParams[] { + const keys = Object.keys(paramGrid); + if (keys.length === 0) return [{}]; + const result: GridParams[] = [{}]; + for (const key of keys) { + const values = paramGrid[key] ?? []; + const newResult: GridParams[] = []; + for (const existing of result) { + for (const val of values) { + newResult.push({ ...existing, [key]: val }); + } + } + result.length = 0; + result.push(...newResult); + } + return result; +} + +export class GridSearchCV { + estimator: Estimator; + paramGrid: Record; + cv: number; + scoring: string; + + bestParams_: GridParams | null = null; + bestScore_: number = -Infinity; + bestEstimator_: Estimator | null = null; + cvResults_: { params: GridParams; meanTestScore: number }[] = []; + + constructor( + estimator: Estimator, + paramGrid: Record, + options: { cv?: number; scoring?: string } = {}, + ) { + this.estimator = estimator; + this.paramGrid = paramGrid; + this.cv = options.cv ?? 5; + this.scoring = options.scoring ?? "score"; + } + + fit(X: Float64Array[], y: Float64Array): this { + const candidates = cartesianProduct(this.paramGrid); + const kfold = new KFold({ nSplits: this.cv }); + + this.cvResults_ = []; + let bestScore = -Infinity; + let bestParams: GridParams = {}; + + for (const params of candidates) { + const scores: number[] = []; + for (const fold of kfold.split(X)) { + const trainIdx = fold.trainIndex; + const testIdx = fold.testIndex; + const XTrain = Array.from(trainIdx).map((i) => X[i] ?? new Float64Array(0)); + const yTrain = new Float64Array(Array.from(trainIdx).map((i) => y[i] ?? 0)); + const XTest = Array.from(testIdx).map((i) => X[i] ?? 
new Float64Array(0)); + const yTest = new Float64Array(Array.from(testIdx).map((i) => y[i] ?? 0)); + + // Clone and set params + const est = Object.create( + Object.getPrototypeOf(this.estimator) as object, + ) as Estimator & Record; + Object.assign(est, this.estimator); + for (const [k, v] of Object.entries(params)) { + est[k] = v; + } + // Reset fitted attributes + est.fit(XTrain, yTrain); + scores.push(est.score(XTest, yTest)); + } + const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length; + this.cvResults_.push({ params, meanTestScore: meanScore }); + + if (meanScore > bestScore) { + bestScore = meanScore; + bestParams = params; + } + } + + this.bestParams_ = bestParams; + this.bestScore_ = bestScore; + + // Refit best estimator on full data + const best = Object.create( + Object.getPrototypeOf(this.estimator) as object, + ) as Estimator & Record; + Object.assign(best, this.estimator); + for (const [k, v] of Object.entries(bestParams)) { + best[k] = v; + } + best.fit(X, y); + this.bestEstimator_ = best as Estimator; + + return this; + } + + score(X: Float64Array[], y: Float64Array): number { + if (this.bestEstimator_ === null) throw new Error("GridSearchCV not fitted"); + return this.bestEstimator_.score(X, y); + } +} + +export function crossValScore( + estimator: Estimator, + X: Float64Array[], + y: Float64Array, + cv = 5, +): Float64Array { + const kfold = new KFold({ nSplits: cv }); + const scores: number[] = []; + + for (const fold of kfold.split(X)) { + const trainIdx = fold.trainIndex; + const testIdx = fold.testIndex; + const XTrain = Array.from(trainIdx).map((i) => X[i] ?? new Float64Array(0)); + const yTrain = new Float64Array(Array.from(trainIdx).map((i) => y[i] ?? 0)); + const XTest = Array.from(testIdx).map((i) => X[i] ?? new Float64Array(0)); + const yTest = new Float64Array(Array.from(testIdx).map((i) => y[i] ?? 
0)); + + const est = Object.create( + Object.getPrototypeOf(estimator) as object, + ) as Estimator; + Object.assign(est, estimator); + est.fit(XTrain, yTrain); + scores.push(est.score(XTest, yTest)); + } + + return new Float64Array(scores); +} diff --git a/src/multiclass/index.ts b/src/multiclass/index.ts new file mode 100644 index 0000000..2a4032c --- /dev/null +++ b/src/multiclass/index.ts @@ -0,0 +1 @@ +export * from "./one_vs_rest.js"; diff --git a/src/multiclass/one_vs_rest.ts b/src/multiclass/one_vs_rest.ts new file mode 100644 index 0000000..c7eec9b --- /dev/null +++ b/src/multiclass/one_vs_rest.ts @@ -0,0 +1,159 @@ +/** + * Multiclass meta-estimators. + * Mirrors sklearn.multiclass: OneVsRestClassifier, OneVsOneClassifier. + */ + +import { NotFittedError } from "../exceptions.js"; + +export interface BinaryClassifier { + fit(X: Float64Array[], y: Float64Array): this; + predict(X: Float64Array[]): Float64Array; + score?(X: Float64Array[], y: Float64Array): number; +} + +export class OneVsRestClassifier { + estimator: BinaryClassifier; + estimators_: BinaryClassifier[] | null = null; + classes_: Float64Array | null = null; + + constructor(estimator: BinaryClassifier) { + this.estimator = estimator; + } + + fit(X: Float64Array[], y: Float64Array): this { + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + this.estimators_ = []; + + for (const cls of uniqueClasses) { + const yBin = new Float64Array(y.length); + for (let i = 0; i < y.length; i++) { + yBin[i] = (y[i] ?? 0) === cls ? 
1 : 0; + } + const est = Object.create(Object.getPrototypeOf(this.estimator) as object) as BinaryClassifier; + Object.assign(est, this.estimator); + est.fit(X, yBin); + this.estimators_.push(est); + } + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.estimators_ === null || this.classes_ === null) + throw new NotFittedError("OneVsRestClassifier"); + + const classes = this.classes_; + const n = X.length; + const nClasses = classes.length; + + // Get decision scores for each class + const scores: Float64Array[] = this.estimators_.map((est) => est.predict(X)); + + return new Float64Array( + Array.from({ length: n }, (_, i) => { + let maxScore = -Infinity; + let bestClass = classes[0] ?? 0; + for (let c = 0; c < nClasses; c++) { + const score = (scores[c] ?? new Float64Array(n))[i] ?? 0; + if (score > maxScore) { + maxScore = score; + bestClass = classes[c] ?? 0; + } + } + return bestClass; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class OneVsOneClassifier { + estimator: BinaryClassifier; + estimators_: BinaryClassifier[] | null = null; + classes_: Float64Array | null = null; + pairIndices_: [number, number][] | null = null; + + constructor(estimator: BinaryClassifier) { + this.estimator = estimator; + } + + fit(X: Float64Array[], y: Float64Array): this { + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + this.estimators_ = []; + this.pairIndices_ = []; + + for (let i = 0; i < uniqueClasses.length; i++) { + for (let j = i + 1; j < uniqueClasses.length; j++) { + const ci = uniqueClasses[i] as number; + const cj = uniqueClasses[j] as number; + this.pairIndices_.push([i, j]); + + // Filter samples for these two classes + const mask: number[] = []; + for (let k 
= 0; k < y.length; k++) { + if ((y[k] ?? 0) === ci || (y[k] ?? 0) === cj) mask.push(k); + } + const XSub = mask.map((k) => X[k] ?? new Float64Array(0)); + const ySub = new Float64Array(mask.map((k) => ((y[k] ?? 0) === ci ? 0 : 1))); + + const est = Object.create(Object.getPrototypeOf(this.estimator) as object) as BinaryClassifier; + Object.assign(est, this.estimator); + est.fit(XSub, ySub); + this.estimators_.push(est); + } + } + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.estimators_ === null || this.classes_ === null || this.pairIndices_ === null) + throw new NotFittedError("OneVsOneClassifier"); + + const classes = this.classes_; + const n = X.length; + const nClasses = classes.length; + + return new Float64Array( + Array.from({ length: n }, (_, i) => { + const votes = new Int32Array(nClasses); + for (let e = 0; e < this.estimators_!.length; e++) { + const est = this.estimators_![e] as BinaryClassifier; + const [ci, cj] = this.pairIndices_![e] as [number, number]; + const pred = (est.predict([X[i] ?? new Float64Array(0)]))[0] ?? 0; + if (pred === 0) votes[ci] = (votes[ci] ?? 0) + 1; + else votes[cj] = (votes[cj] ?? 0) + 1; + } + + let bestIdx = 0; + let bestVotes = votes[0] ?? 0; + for (let c = 1; c < nClasses; c++) { + if ((votes[c] ?? 0) > bestVotes) { + bestVotes = votes[c] ?? 0; + bestIdx = c; + } + } + return classes[bestIdx] ?? 
0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} diff --git a/src/naive_bayes/index.ts b/src/naive_bayes/index.ts new file mode 100644 index 0000000..538de54 --- /dev/null +++ b/src/naive_bayes/index.ts @@ -0,0 +1 @@ +export * from "./naive_bayes.js"; diff --git a/src/naive_bayes/naive_bayes.ts b/src/naive_bayes/naive_bayes.ts new file mode 100644 index 0000000..eed0c26 --- /dev/null +++ b/src/naive_bayes/naive_bayes.ts @@ -0,0 +1,300 @@ +/** + * Naive Bayes classifiers. + * Mirrors sklearn.naive_bayes: GaussianNB, MultinomialNB, BernoulliNB. + */ + +import { NotFittedError } from "../exceptions.js"; + +export class GaussianNB { + varSmoothing: number; + + classPrior_: Float64Array | null = null; + thetaMean_: Float64Array[] | null = null; + thetaVar_: Float64Array[] | null = null; + classes_: Float64Array | null = null; + + constructor(options: { varSmoothing?: number } = {}) { + this.varSmoothing = options.varSmoothing ?? 1e-9; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const nClasses = uniqueClasses.length; + const classToIdx = new Map(uniqueClasses.map((c, i) => [c, i])); + + const means: Float64Array[] = Array.from({ length: nClasses }, () => new Float64Array(p)); + const vars: Float64Array[] = Array.from({ length: nClasses }, () => new Float64Array(p)); + const counts = new Int32Array(nClasses); + + for (let i = 0; i < n; i++) { + const c = classToIdx.get(y[i] ?? 0) ?? 0; + counts[c] = (counts[c] ?? 0) + 1; + const xi = X[i] ?? new Float64Array(p); + const mean = means[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + mean[j] = (mean[j] ?? 
0) + (xi[j] ?? 0); + } + } + + for (let c = 0; c < nClasses; c++) { + const cnt = counts[c] ?? 1; + const mean = means[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + mean[j] = (mean[j] ?? 0) / cnt; + } + } + + // Compute variance + for (let i = 0; i < n; i++) { + const c = classToIdx.get(y[i] ?? 0) ?? 0; + const xi = X[i] ?? new Float64Array(p); + const mean = means[c] ?? new Float64Array(p); + const variance = vars[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + variance[j] = (variance[j] ?? 0) + ((xi[j] ?? 0) - (mean[j] ?? 0)) ** 2; + } + } + + for (let c = 0; c < nClasses; c++) { + const cnt = counts[c] ?? 1; + const variance = vars[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + variance[j] = (variance[j] ?? 0) / cnt + this.varSmoothing; + } + } + + this.thetaMean_ = means; + this.thetaVar_ = vars; + this.classPrior_ = new Float64Array(nClasses); + for (let c = 0; c < nClasses; c++) { + this.classPrior_[c] = (counts[c] ?? 0) / n; + } + + return this; + } + + predictLogProba(X: Float64Array[]): Float64Array[] { + if (this.classes_ === null) throw new NotFittedError("GaussianNB"); + const nClasses = this.classes_.length; + const p = (X[0] ?? new Float64Array(0)).length; + + return X.map((xi) => { + const logProba = new Float64Array(nClasses); + for (let c = 0; c < nClasses; c++) { + let logP = Math.log((this.classPrior_ as Float64Array)[c] ?? 1e-10); + const mean = (this.thetaMean_ as Float64Array[])[c] ?? new Float64Array(p); + const variance = (this.thetaVar_ as Float64Array[])[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + const xij = xi[j] ?? 0; + const mu = mean[j] ?? 0; + const sig2 = variance[j] ?? 
1e-9; + logP -= 0.5 * Math.log(2 * Math.PI * sig2); + logP -= ((xij - mu) ** 2) / (2 * sig2); + } + logProba[c] = logP; + } + return logProba; + }); + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("GaussianNB"); + const classes = this.classes_; + const logProba = this.predictLogProba(X); + return new Float64Array( + logProba.map((lp) => { + let maxIdx = 0; + let maxVal = lp[0] ?? -Infinity; + for (let c = 1; c < lp.length; c++) { + if ((lp[c] ?? -Infinity) > maxVal) { + maxVal = lp[c] ?? -Infinity; + maxIdx = c; + } + } + return classes[maxIdx] ?? 0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class MultinomialNB { + alpha: number; + + featureLogProb_: Float64Array[] | null = null; + classLogPrior_: Float64Array | null = null; + classes_: Float64Array | null = null; + + constructor(options: { alpha?: number } = {}) { + this.alpha = options.alpha ?? 1.0; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const nClasses = uniqueClasses.length; + const classToIdx = new Map(uniqueClasses.map((c, i) => [c, i])); + + const counts: Float64Array[] = Array.from({ length: nClasses }, () => new Float64Array(p)); + const classCounts = new Float64Array(nClasses); + + for (let i = 0; i < n; i++) { + const c = classToIdx.get(y[i] ?? 0) ?? 0; + classCounts[c] = (classCounts[c] ?? 0) + 1; + const xi = X[i] ?? new Float64Array(p); + const count = counts[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + count[j] = (count[j] ?? 0) + (xi[j] ?? 
0); + } + } + + this.classLogPrior_ = new Float64Array( + Array.from(classCounts).map((c) => Math.log(c / n)), + ); + + this.featureLogProb_ = counts.map((count) => { + const total = Array.from(count).reduce((a, b) => a + b, 0) + this.alpha * p; + return new Float64Array(count.map((c) => Math.log((c + this.alpha) / total))); + }); + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("MultinomialNB"); + const classes = this.classes_; + const nClasses = classes.length; + const p = (X[0] ?? new Float64Array(0)).length; + + return new Float64Array( + X.map((xi) => { + let maxIdx = 0; + let maxScore = -Infinity; + for (let c = 0; c < nClasses; c++) { + let score = (this.classLogPrior_ as Float64Array)[c] ?? 0; + const flp = (this.featureLogProb_ as Float64Array[])[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + score += (xi[j] ?? 0) * (flp[j] ?? 0); + } + if (score > maxScore) { + maxScore = score; + maxIdx = c; + } + } + return classes[maxIdx] ?? 0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class BernoulliNB { + alpha: number; + binarize: number | null; + + featureLogProb_: Float64Array[] | null = null; + featureLogNegProb_: Float64Array[] | null = null; + classLogPrior_: Float64Array | null = null; + classes_: Float64Array | null = null; + + constructor(options: { alpha?: number; binarize?: number | null } = {}) { + this.alpha = options.alpha ?? 1.0; + this.binarize = options.binarize ?? 0.0; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const threshold = this.binarize ?? 
0.0; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const nClasses = uniqueClasses.length; + const classToIdx = new Map(uniqueClasses.map((c, i) => [c, i])); + + const counts: Float64Array[] = Array.from({ length: nClasses }, () => new Float64Array(p)); + const classCounts = new Float64Array(nClasses); + + for (let i = 0; i < n; i++) { + const c = classToIdx.get(y[i] ?? 0) ?? 0; + classCounts[c] = (classCounts[c] ?? 0) + 1; + const xi = X[i] ?? new Float64Array(p); + const count = counts[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + if ((xi[j] ?? 0) > threshold) count[j] = (count[j] ?? 0) + 1; + } + } + + this.classLogPrior_ = new Float64Array( + Array.from(classCounts).map((c) => Math.log(c / n)), + ); + + this.featureLogProb_ = counts.map((count, c) => { + const total = classCounts[c] ?? 1; + return new Float64Array(count.map((cnt) => Math.log((cnt + this.alpha) / (total + 2 * this.alpha)))); + }); + + this.featureLogNegProb_ = this.featureLogProb_.map((logProb) => + new Float64Array(logProb.map((lp) => Math.log(1 - Math.exp(lp)))), + ); + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("BernoulliNB"); + const classes = this.classes_; + const nClasses = classes.length; + const p = (X[0] ?? new Float64Array(0)).length; + const threshold = this.binarize ?? 0.0; + + return new Float64Array( + X.map((xi) => { + let maxIdx = 0; + let maxScore = -Infinity; + for (let c = 0; c < nClasses; c++) { + let score = (this.classLogPrior_ as Float64Array)[c] ?? 0; + const flp = (this.featureLogProb_ as Float64Array[])[c] ?? new Float64Array(p); + const flnp = (this.featureLogNegProb_ as Float64Array[])[c] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + score += (xi[j] ?? 0) > threshold ? (flp[j] ?? 0) : (flnp[j] ?? 
0); + } + if (score > maxScore) { + maxScore = score; + maxIdx = c; + } + } + return classes[maxIdx] ?? 0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} diff --git a/src/neighbors/index.ts b/src/neighbors/index.ts new file mode 100644 index 0000000..624f811 --- /dev/null +++ b/src/neighbors/index.ts @@ -0,0 +1,2 @@ +export * from "./knn.js"; +export * from "./radius.js"; diff --git a/src/neighbors/knn.ts b/src/neighbors/knn.ts new file mode 100644 index 0000000..1c0c0f1 --- /dev/null +++ b/src/neighbors/knn.ts @@ -0,0 +1,177 @@ +/** + * K-Nearest Neighbors Classifier and Regressor. + * Mirrors sklearn.neighbors.KNeighborsClassifier / KNeighborsRegressor. + */ + +import { NotFittedError } from "../exceptions.js"; + +function euclidean(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) { + s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2; + } + return Math.sqrt(s); +} + +function manhattan(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) { + s += Math.abs((a[i] ?? 0) - (b[i] ?? 0)); + } + return s; +} + +type MetricFn = (a: Float64Array, b: Float64Array) => number; + +function getMetric(metric: string): MetricFn { + if (metric === "manhattan") return manhattan; + return euclidean; +} + +export class KNeighborsClassifier { + k: number; + metric: string; + weights: string; + + XTrain_: Float64Array[] | null = null; + yTrain_: Float64Array | null = null; + classes_: Float64Array | null = null; + + constructor( + options: { + k?: number; + nNeighbors?: number; + metric?: string; + weights?: string; + } = {}, + ) { + this.k = options.k ?? options.nNeighbors ?? 5; + this.metric = options.metric ?? "euclidean"; + this.weights = options.weights ?? 
"uniform"; + } + + fit(X: Float64Array[], y: Float64Array): this { + this.XTrain_ = X; + this.yTrain_ = y; + this.classes_ = new Float64Array( + Array.from(new Set(Array.from(y))).sort((a, b) => a - b), + ); + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.XTrain_ === null || this.yTrain_ === null) + throw new NotFittedError("KNeighborsClassifier"); + + const metricFn = getMetric(this.metric); + const XTrain = this.XTrain_; + const yTrain = this.yTrain_; + const k = Math.min(this.k, XTrain.length); + + return new Float64Array( + X.map((xi) => { + const dists = XTrain.map((xj, idx) => ({ + dist: metricFn(xi, xj), + label: yTrain[idx] ?? 0, + })); + dists.sort((a, b) => a.dist - b.dist); + const neighbors = dists.slice(0, k); + + const votes = new Map(); + for (const { dist, label } of neighbors) { + const w = this.weights === "distance" ? (dist > 0 ? 1 / dist : 1e10) : 1; + votes.set(label, (votes.get(label) ?? 0) + w); + } + + let bestLabel = 0; + let bestVotes = -Infinity; + for (const [label, v] of votes) { + if (v > bestVotes) { + bestVotes = v; + bestLabel = label; + } + } + return bestLabel; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class KNeighborsRegressor { + k: number; + metric: string; + weights: string; + + XTrain_: Float64Array[] | null = null; + yTrain_: Float64Array | null = null; + + constructor( + options: { + k?: number; + nNeighbors?: number; + metric?: string; + weights?: string; + } = {}, + ) { + this.k = options.k ?? options.nNeighbors ?? 5; + this.metric = options.metric ?? "euclidean"; + this.weights = options.weights ?? 
"uniform"; + } + + fit(X: Float64Array[], y: Float64Array): this { + this.XTrain_ = X; + this.yTrain_ = y; + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.XTrain_ === null || this.yTrain_ === null) + throw new NotFittedError("KNeighborsRegressor"); + + const metricFn = getMetric(this.metric); + const XTrain = this.XTrain_; + const yTrain = this.yTrain_; + const k = Math.min(this.k, XTrain.length); + + return new Float64Array( + X.map((xi) => { + const dists = XTrain.map((xj, idx) => ({ + dist: metricFn(xi, xj), + y: yTrain[idx] ?? 0, + })); + dists.sort((a, b) => a.dist - b.dist); + const neighbors = dists.slice(0, k); + + let wSum = 0; + let ySum = 0; + for (const { dist, y: yVal } of neighbors) { + const w = this.weights === "distance" ? (dist > 0 ? 1 / dist : 1e10) : 1; + wSum += w; + ySum += w * yVal; + } + return wSum > 0 ? ySum / wSum : 0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 1 - ssRes / ssTot : 0; + } +} diff --git a/src/neighbors/radius.ts b/src/neighbors/radius.ts new file mode 100644 index 0000000..759de09 --- /dev/null +++ b/src/neighbors/radius.ts @@ -0,0 +1,149 @@ +/** + * Radius Neighbors Classifier and Regressor. + * Mirrors sklearn.neighbors.RadiusNeighborsClassifier / RadiusNeighborsRegressor. + */ + +import { NotFittedError } from "../exceptions.js"; + +function euclidean(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let i = 0; i < a.length; i++) { + s += ((a[i] ?? 0) - (b[i] ?? 
0)) ** 2; + } + return Math.sqrt(s); +} + +export class RadiusNeighborsClassifier { + radius: number; + weights: string; + outlierLabel: number; + + XTrain_: Float64Array[] | null = null; + yTrain_: Float64Array | null = null; + classes_: Float64Array | null = null; + + constructor( + options: { + radius?: number; + weights?: string; + outlierLabel?: number; + } = {}, + ) { + this.radius = options.radius ?? 1.0; + this.weights = options.weights ?? "uniform"; + this.outlierLabel = options.outlierLabel ?? -1; + } + + fit(X: Float64Array[], y: Float64Array): this { + this.XTrain_ = X; + this.yTrain_ = y; + this.classes_ = new Float64Array( + Array.from(new Set(Array.from(y))).sort((a, b) => a - b), + ); + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.XTrain_ === null || this.yTrain_ === null) + throw new NotFittedError("RadiusNeighborsClassifier"); + + const XTrain = this.XTrain_; + const yTrain = this.yTrain_; + + return new Float64Array( + X.map((xi) => { + const neighbors: { dist: number; label: number }[] = []; + for (let j = 0; j < XTrain.length; j++) { + const d = euclidean(xi, XTrain[j] ?? new Float64Array(0)); + if (d <= this.radius) { + neighbors.push({ dist: d, label: yTrain[j] ?? 0 }); + } + } + + if (neighbors.length === 0) return this.outlierLabel; + + const votes = new Map(); + for (const { dist, label } of neighbors) { + const w = this.weights === "distance" ? (dist > 0 ? 1 / dist : 1e10) : 1; + votes.set(label, (votes.get(label) ?? 
0) + w); + } + + let bestLabel = 0; + let bestVotes = -Infinity; + for (const [label, v] of votes) { + if (v > bestVotes) { + bestVotes = v; + bestLabel = label; + } + } + return bestLabel; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class RadiusNeighborsRegressor { + radius: number; + weights: string; + + XTrain_: Float64Array[] | null = null; + yTrain_: Float64Array | null = null; + + constructor( + options: { radius?: number; weights?: string } = {}, + ) { + this.radius = options.radius ?? 1.0; + this.weights = options.weights ?? "uniform"; + } + + fit(X: Float64Array[], y: Float64Array): this { + this.XTrain_ = X; + this.yTrain_ = y; + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.XTrain_ === null || this.yTrain_ === null) + throw new NotFittedError("RadiusNeighborsRegressor"); + + const XTrain = this.XTrain_; + const yTrain = this.yTrain_; + + return new Float64Array( + X.map((xi) => { + let wSum = 0; + let ySum = 0; + for (let j = 0; j < XTrain.length; j++) { + const d = euclidean(xi, XTrain[j] ?? new Float64Array(0)); + if (d <= this.radius) { + const w = this.weights === "distance" ? (d > 0 ? 1 / d : 1e10) : 1; + wSum += w; + ySum += w * (yTrain[j] ?? 0); + } + } + return wSum > 0 ? ySum / wSum : 0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 
1 - ssRes / ssTot : 0; + } +} diff --git a/src/neural_network/index.ts b/src/neural_network/index.ts new file mode 100644 index 0000000..892d48e --- /dev/null +++ b/src/neural_network/index.ts @@ -0,0 +1 @@ +export * from "./mlp.js"; diff --git a/src/neural_network/mlp.ts b/src/neural_network/mlp.ts new file mode 100644 index 0000000..43336bb --- /dev/null +++ b/src/neural_network/mlp.ts @@ -0,0 +1,402 @@ +/** + * MLP Classifier and Regressor (Multi-Layer Perceptron). + * Mirrors sklearn.neural_network.MLPClassifier / MLPRegressor. + */ + +import { NotFittedError } from "../exceptions.js"; + +function relu(x: number): number { + return Math.max(0, x); +} + +function reluDeriv(x: number): number { + return x > 0 ? 1 : 0; +} + +function tanhDeriv(x: number): number { + const t = Math.tanh(x); + return 1 - t * t; +} + +function softmax(arr: Float64Array): Float64Array { + const maxVal = Math.max(...arr); + const exp = arr.map((x) => Math.exp(x - maxVal)); + const sum = exp.reduce((a, b) => a + b, 0); + return new Float64Array(exp.map((x) => x / sum)); +} + +type ActivationFn = (x: number) => number; +type ActivationDerivFn = (x: number) => number; + +function getActivation(name: string): [ActivationFn, ActivationDerivFn] { + if (name === "relu") return [relu, reluDeriv]; + if (name === "tanh") return [Math.tanh, tanhDeriv]; + // logistic + const sig = (x: number) => 1 / (1 + Math.exp(-x)); + return [sig, (x: number) => { const s = sig(x); return s * (1 - s); }]; +} + +interface LayerWeights { + W: Float64Array[]; + b: Float64Array; +} + +export class MLPClassifier { + hiddenLayerSizes: number[]; + activation: string; + alpha: number; + learningRate: number; + maxIter: number; + tol: number; + batchSize: number; + + coefs_: LayerWeights[] | null = null; + classes_: Float64Array | null = null; + nOutputs_: number = 0; + + constructor( + options: { + hiddenLayerSizes?: number[]; + activation?: string; + alpha?: number; + learningRate?: number; + maxIter?: number; + tol?: 
number; + batchSize?: number; + } = {}, + ) { + this.hiddenLayerSizes = options.hiddenLayerSizes ?? [100]; + this.activation = options.activation ?? "relu"; + this.alpha = options.alpha ?? 1e-4; + this.learningRate = options.learningRate ?? 1e-3; + this.maxIter = options.maxIter ?? 200; + this.tol = options.tol ?? 1e-4; + this.batchSize = options.batchSize ?? 32; + } + + private _initWeights(layerSizes: number[]): LayerWeights[] { + const weights: LayerWeights[] = []; + for (let i = 0; i < layerSizes.length - 1; i++) { + const fan_in = layerSizes[i] ?? 1; + const fan_out = layerSizes[i + 1] ?? 1; + const scale = Math.sqrt(2 / fan_in); + const W: Float64Array[] = []; + for (let r = 0; r < fan_out; r++) { + const row = new Float64Array(fan_in); + for (let c = 0; c < fan_in; c++) { + row[c] = (Math.random() * 2 - 1) * scale; + } + W.push(row); + } + weights.push({ W, b: new Float64Array(fan_out) }); + } + return weights; + } + + private _forward( + x: Float64Array, + weights: LayerWeights[], + activFn: ActivationFn, + isOutput = false, + ): { activations: Float64Array[]; zs: Float64Array[] } { + const activations: Float64Array[] = [x]; + const zs: Float64Array[] = []; + + for (let l = 0; l < weights.length; l++) { + const layer = weights[l] as LayerWeights; + const prev = activations[activations.length - 1] as Float64Array; + const z = new Float64Array(layer.W.length); + for (let j = 0; j < layer.W.length; j++) { + let sum = layer.b[j] ?? 0; + const wRow = layer.W[j] ?? new Float64Array(0); + for (let k = 0; k < prev.length; k++) { + sum += (wRow[k] ?? 0) * (prev[k] ?? 
0); + } + z[j] = sum; + } + zs.push(z); + + const isLast = l === weights.length - 1; + let a: Float64Array; + if (isLast && isOutput) { + a = softmax(z); + } else if (isLast && !isOutput) { + a = new Float64Array(z); + } else { + a = new Float64Array(z.map(activFn)); + } + activations.push(a); + } + + return { activations, zs }; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const nFeatures = (X[0] ?? new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + const nClasses = uniqueClasses.length; + this.nOutputs_ = nClasses; + + const classToIdx = new Map(uniqueClasses.map((c, i) => [c, i])); + const [activFn, activDeriv] = getActivation(this.activation); + + const layerSizes = [nFeatures, ...this.hiddenLayerSizes, nClasses]; + const weights = this._initWeights(layerSizes); + + for (let iter = 0; iter < this.maxIter; iter++) { + let totalLoss = 0; + + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(nFeatures); + const yi = classToIdx.get(y[i] ?? 0) ?? 0; + const yOneHot = new Float64Array(nClasses); + yOneHot[yi] = 1; + + const { activations, zs } = this._forward(xi, weights, activFn, true); + const output = activations[activations.length - 1] as Float64Array; + + // Cross-entropy loss + totalLoss += -Math.log((output[yi] ?? 0) + 1e-15); + + // Backprop + const deltas: Float64Array[] = new Array(weights.length); + // Output delta + const outDelta = new Float64Array(nClasses); + for (let j = 0; j < nClasses; j++) { + outDelta[j] = (output[j] ?? 0) - (yOneHot[j] ?? 
0); + } + deltas[weights.length - 1] = outDelta; + + for (let l = weights.length - 2; l >= 0; l--) { + const nextLayer = weights[l + 1] as LayerWeights; + const nextDelta = deltas[l + 1] as Float64Array; + const z = zs[l] as Float64Array; + const delta = new Float64Array(z.length); + for (let j = 0; j < z.length; j++) { + let sum = 0; + for (let k = 0; k < nextLayer.W.length; k++) { + sum += ((nextLayer.W[k] ?? new Float64Array(0))[j] ?? 0) * (nextDelta[k] ?? 0); + } + delta[j] = sum * activDeriv(z[j] ?? 0); + } + deltas[l] = delta; + } + + // Update weights + for (let l = 0; l < weights.length; l++) { + const layer = weights[l] as LayerWeights; + const prevA = activations[l] as Float64Array; + const delta = deltas[l] as Float64Array; + for (let j = 0; j < layer.W.length; j++) { + const wRow = layer.W[j] as Float64Array; + for (let k = 0; k < prevA.length; k++) { + wRow[k] = + (wRow[k] ?? 0) - + this.learningRate * ((delta[j] ?? 0) * (prevA[k] ?? 0) + this.alpha * (wRow[k] ?? 0)); + } + layer.b[j] = (layer.b[j] ?? 0) - this.learningRate * (delta[j] ?? 0); + } + } + } + + if (totalLoss / n < this.tol) break; + } + + this.coefs_ = weights; + return this; + } + + predictProba(X: Float64Array[]): Float64Array[] { + if (this.coefs_ === null) throw new NotFittedError("MLPClassifier"); + const [activFn] = getActivation(this.activation); + return X.map((xi) => { + const { activations } = this._forward(xi, this.coefs_ as LayerWeights[], activFn, true); + return activations[activations.length - 1] as Float64Array; + }); + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("MLPClassifier"); + const proba = this.predictProba(X); + const classes = this.classes_; + return new Float64Array( + proba.map((p) => { + let maxIdx = 0; + let maxVal = p[0] ?? 0; + for (let j = 1; j < p.length; j++) { + if ((p[j] ?? 0) > maxVal) { + maxVal = p[j] ?? 0; + maxIdx = j; + } + } + return classes[maxIdx] ?? 
0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class MLPRegressor { + hiddenLayerSizes: number[]; + activation: string; + alpha: number; + learningRate: number; + maxIter: number; + tol: number; + + coefs_: LayerWeights[] | null = null; + + constructor( + options: { + hiddenLayerSizes?: number[]; + activation?: string; + alpha?: number; + learningRate?: number; + maxIter?: number; + tol?: number; + } = {}, + ) { + this.hiddenLayerSizes = options.hiddenLayerSizes ?? [100]; + this.activation = options.activation ?? "relu"; + this.alpha = options.alpha ?? 1e-4; + this.learningRate = options.learningRate ?? 1e-3; + this.maxIter = options.maxIter ?? 200; + this.tol = options.tol ?? 1e-4; + } + + private _initWeights(layerSizes: number[]): LayerWeights[] { + const weights: LayerWeights[] = []; + for (let i = 0; i < layerSizes.length - 1; i++) { + const fan_in = layerSizes[i] ?? 1; + const fan_out = layerSizes[i + 1] ?? 1; + const scale = Math.sqrt(2 / fan_in); + const W: Float64Array[] = []; + for (let r = 0; r < fan_out; r++) { + const row = new Float64Array(fan_in); + for (let c = 0; c < fan_in; c++) { + row[c] = (Math.random() * 2 - 1) * scale; + } + W.push(row); + } + weights.push({ W, b: new Float64Array(fan_out) }); + } + return weights; + } + + private _forward( + x: Float64Array, + weights: LayerWeights[], + activFn: ActivationFn, + ): { activations: Float64Array[]; zs: Float64Array[] } { + const activations: Float64Array[] = [x]; + const zs: Float64Array[] = []; + + for (let l = 0; l < weights.length; l++) { + const layer = weights[l] as LayerWeights; + const prev = activations[activations.length - 1] as Float64Array; + const z = new Float64Array(layer.W.length); + for (let j = 0; j < layer.W.length; j++) { + let sum = layer.b[j] ?? 0; + const wRow = layer.W[j] ?? 
new Float64Array(0); + for (let k = 0; k < prev.length; k++) { + sum += (wRow[k] ?? 0) * (prev[k] ?? 0); + } + z[j] = sum; + } + zs.push(z); + const isLast = l === weights.length - 1; + activations.push(isLast ? new Float64Array(z) : new Float64Array(z.map(activFn))); + } + return { activations, zs }; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const nFeatures = (X[0] ?? new Float64Array(0)).length; + const [activFn, activDeriv] = getActivation(this.activation); + + const layerSizes = [nFeatures, ...this.hiddenLayerSizes, 1]; + const weights = this._initWeights(layerSizes); + + for (let iter = 0; iter < this.maxIter; iter++) { + let totalLoss = 0; + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(nFeatures); + const { activations, zs } = this._forward(xi, weights, activFn); + const output = (activations[activations.length - 1] as Float64Array)[0] ?? 0; + const err = output - (y[i] ?? 0); + totalLoss += err ** 2; + + const deltas: Float64Array[] = new Array(weights.length); + deltas[weights.length - 1] = new Float64Array([err]); + + for (let l = weights.length - 2; l >= 0; l--) { + const nextLayer = weights[l + 1] as LayerWeights; + const nextDelta = deltas[l + 1] as Float64Array; + const z = zs[l] as Float64Array; + const delta = new Float64Array(z.length); + for (let j = 0; j < z.length; j++) { + let sum = 0; + for (let k = 0; k < nextLayer.W.length; k++) { + sum += ((nextLayer.W[k] ?? new Float64Array(0))[j] ?? 0) * (nextDelta[k] ?? 0); + } + delta[j] = sum * activDeriv(z[j] ?? 0); + } + deltas[l] = delta; + } + + for (let l = 0; l < weights.length; l++) { + const layer = weights[l] as LayerWeights; + const prevA = activations[l] as Float64Array; + const delta = deltas[l] as Float64Array; + for (let j = 0; j < layer.W.length; j++) { + const wRow = layer.W[j] as Float64Array; + for (let k = 0; k < prevA.length; k++) { + wRow[k] = + (wRow[k] ?? 0) - + this.learningRate * ((delta[j] ?? 0) * (prevA[k] ?? 
0) + this.alpha * (wRow[k] ?? 0)); + } + layer.b[j] = (layer.b[j] ?? 0) - this.learningRate * (delta[j] ?? 0); + } + } + } + if (totalLoss / n < this.tol) break; + } + + this.coefs_ = weights; + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.coefs_ === null) throw new NotFittedError("MLPRegressor"); + const [activFn] = getActivation(this.activation); + return new Float64Array( + X.map((xi) => { + const { activations } = this._forward(xi, this.coefs_ as LayerWeights[], activFn); + return (activations[activations.length - 1] as Float64Array)[0] ?? 0; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 1 - ssRes / ssTot : 0; + } +} diff --git a/src/pipeline/index.ts b/src/pipeline/index.ts new file mode 100644 index 0000000..939b367 --- /dev/null +++ b/src/pipeline/index.ts @@ -0,0 +1 @@ +export * from "./pipeline.js"; diff --git a/src/pipeline/pipeline.ts b/src/pipeline/pipeline.ts new file mode 100644 index 0000000..4c9b152 --- /dev/null +++ b/src/pipeline/pipeline.ts @@ -0,0 +1,95 @@ +/** + * Pipeline: chained estimators. + * Mirrors sklearn.pipeline.Pipeline and make_pipeline. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +export interface PipelineStep { + fit?(X: Float64Array[], y?: Float64Array): this; + transform?(X: Float64Array[]): Float64Array[]; + fitTransform?(X: Float64Array[], y?: Float64Array): Float64Array[]; + predict?(X: Float64Array[]): Float64Array; + score?(X: Float64Array[], y: Float64Array): number; +} + +export class Pipeline { + steps: [string, PipelineStep][]; + + constructor(steps: [string, PipelineStep][]) { + this.steps = steps; + } + + fit(X: Float64Array[], y?: Float64Array): this { + let Xt = X; + for (let i = 0; i < this.steps.length - 1; i++) { + const [, step] = this.steps[i] as [string, PipelineStep]; + if (step.fitTransform) { + Xt = step.fitTransform(Xt, y); + } else { + step.fit?.(Xt, y); + Xt = step.transform?.(Xt) ?? Xt; + } + } + const [, lastStep] = this.steps[this.steps.length - 1] as [string, PipelineStep]; + if (y !== undefined) { + lastStep.fit?.(Xt, y); + } else { + if (lastStep.fitTransform) { + lastStep.fitTransform(Xt); + } else { + lastStep.fit?.(Xt); + } + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + let Xt = X; + for (const [, step] of this.steps) { + if (!step.transform) throw new Error("Step does not have transform method"); + Xt = step.transform(Xt); + } + return Xt; + } + + fitTransform(X: Float64Array[], y?: Float64Array): Float64Array[] { + return this.fit(X, y).transform(X); + } + + predict(X: Float64Array[]): Float64Array { + let Xt = X; + for (let i = 0; i < this.steps.length - 1; i++) { + const [, step] = this.steps[i] as [string, PipelineStep]; + if (!step.transform) throw new NotFittedError("Pipeline"); + Xt = step.transform(Xt); + } + const [, lastStep] = this.steps[this.steps.length - 1] as [string, PipelineStep]; + if (!lastStep.predict) throw new Error("Last step has no predict method"); + return lastStep.predict(Xt); + } + + score(X: Float64Array[], y: Float64Array): number { + let Xt = X; + for (let i = 0; i < this.steps.length 
- 1; i++) { + const [, step] = this.steps[i] as [string, PipelineStep]; + if (!step.transform) throw new NotFittedError("Pipeline"); + Xt = step.transform(Xt); + } + const [, lastStep] = this.steps[this.steps.length - 1] as [string, PipelineStep]; + if (!lastStep.score) throw new Error("Last step has no score method"); + return lastStep.score(Xt, y); + } + + getParams(): Record { + const params: Record = {}; + for (const [name, step] of this.steps) { + params[name] = step; + } + return params; + } +} + +export function makePipeline(...steps: PipelineStep[]): Pipeline { + return new Pipeline(steps.map((step, i) => [`step_${i}`, step])); +} diff --git a/src/preprocessing/encoders.ts b/src/preprocessing/encoders.ts new file mode 100644 index 0000000..2cfa60c --- /dev/null +++ b/src/preprocessing/encoders.ts @@ -0,0 +1,124 @@ +/** + * OneHotEncoder and OrdinalEncoder. + * Mirrors sklearn.preprocessing.OneHotEncoder and OrdinalEncoder. + */ + +import { NotFittedError } from "../exceptions.js"; + +export class OneHotEncoder { + sparse: boolean; + handleUnknown: string; + + categories_: Float64Array[] | null = null; + featureNamesOut_: string[] | null = null; + + constructor( + options: { sparse?: boolean; handleUnknown?: string } = {}, + ) { + this.sparse = options.sparse ?? false; + this.handleUnknown = options.handleUnknown ?? "error"; + } + + fit(X: Float64Array[]): this { + const p = (X[0] ?? new Float64Array(0)).length; + this.categories_ = []; + for (let j = 0; j < p; j++) { + const vals = Array.from(new Set(X.map((xi) => xi[j] ?? 0))).sort((a, b) => a - b); + this.categories_.push(new Float64Array(vals)); + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.categories_ === null) throw new NotFittedError("OneHotEncoder"); + const cats = this.categories_; + + return X.map((xi) => { + const parts: number[] = []; + for (let j = 0; j < xi.length; j++) { + const cat = cats[j] ?? new Float64Array(0); + const val = xi[j] ?? 
0; + for (let k = 0; k < cat.length; k++) { + parts.push(cat[k] === val ? 1 : 0); + } + } + return new Float64Array(parts); + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } + + inverseTransform(X: Float64Array[]): Float64Array[] { + if (this.categories_ === null) throw new NotFittedError("OneHotEncoder"); + const cats = this.categories_; + const p = cats.length; + + return X.map((xi) => { + const result = new Float64Array(p); + let offset = 0; + for (let j = 0; j < p; j++) { + const cat = cats[j] ?? new Float64Array(0); + let maxVal = -Infinity; + let bestIdx = 0; + for (let k = 0; k < cat.length; k++) { + if ((xi[offset + k] ?? 0) > maxVal) { + maxVal = xi[offset + k] ?? 0; + bestIdx = k; + } + } + result[j] = cat[bestIdx] ?? 0; + offset += cat.length; + } + return result; + }); + } +} + +export class OrdinalEncoder { + categories_: Float64Array[] | null = null; + + fit(X: Float64Array[]): this { + const p = (X[0] ?? new Float64Array(0)).length; + this.categories_ = []; + for (let j = 0; j < p; j++) { + const vals = Array.from(new Set(X.map((xi) => xi[j] ?? 0))).sort((a, b) => a - b); + this.categories_.push(new Float64Array(vals)); + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.categories_ === null) throw new NotFittedError("OrdinalEncoder"); + const cats = this.categories_; + return X.map((xi) => { + const result = new Float64Array(xi.length); + for (let j = 0; j < xi.length; j++) { + const cat = cats[j] ?? new Float64Array(0); + const val = xi[j] ?? 0; + const idx = Array.from(cat).indexOf(val); + result[j] = idx >= 0 ? 
idx : 0; + } + return result; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } + + inverseTransform(X: Float64Array[]): Float64Array[] { + if (this.categories_ === null) throw new NotFittedError("OrdinalEncoder"); + const cats = this.categories_; + return X.map((xi) => { + const result = new Float64Array(xi.length); + for (let j = 0; j < xi.length; j++) { + const cat = cats[j] ?? new Float64Array(0); + const idx = Math.round(xi[j] ?? 0); + result[j] = cat[Math.min(idx, cat.length - 1)] ?? 0; + } + return result; + }); + } +} diff --git a/src/preprocessing/index.ts b/src/preprocessing/index.ts index 7c8f35b..4e22045 100644 --- a/src/preprocessing/index.ts +++ b/src/preprocessing/index.ts @@ -2,3 +2,5 @@ export * from "./standard_scaler.js"; export * from "./minmax_scaler.js"; export * from "./label_encoder.js"; export * from "./normalizer.js"; +export * from "./polynomial_features.js"; +export * from "./encoders.js"; diff --git a/src/preprocessing/polynomial_features.ts b/src/preprocessing/polynomial_features.ts new file mode 100644 index 0000000..49b1d06 --- /dev/null +++ b/src/preprocessing/polynomial_features.ts @@ -0,0 +1,106 @@ +/** + * Polynomial features transformer. + * Mirrors sklearn.preprocessing.PolynomialFeatures. + */ + +import { NotFittedError } from "../exceptions.js"; + +export class PolynomialFeatures { + degree: number; + interactionOnly: boolean; + includeBias: boolean; + + nOutputFeatures_: number = 0; + powers_: number[][] | null = null; + + constructor( + options: { + degree?: number; + interactionOnly?: boolean; + includeBias?: boolean; + } = {}, + ) { + this.degree = options.degree ?? 2; + this.interactionOnly = options.interactionOnly ?? false; + this.includeBias = options.includeBias ?? 
true; + } + + private _generatePowers(nFeatures: number): number[][] { + const includeBias = this.includeBias; + const interactionOnly = this.interactionOnly; + const degree = this.degree; + const powers: number[][] = []; + + const gen = (fi: number, rem: number, cur: number[], targetDeg: number): void => { + if (fi === nFeatures) { + const sum = cur.reduce((a, b) => a + b, 0); + if (sum !== targetDeg) return; + if (!includeBias && sum === 0) return; + if (interactionOnly && cur.some((d) => d > 1)) return; + powers.push([...cur]); + return; + } + for (let d = 0; d <= rem; d++) { + cur.push(d); + gen(fi + 1, rem - d, cur, targetDeg); + cur.pop(); + } + }; + + for (let deg = 0; deg <= degree; deg++) { + gen(0, deg, [], deg); + } + + // Remove duplicates and sort + const seen = new Set(); + const unique: number[][] = []; + for (const p of powers) { + const key = p.join(","); + if (!seen.has(key)) { + seen.add(key); + unique.push(p); + } + } + + return unique.sort((a, b) => { + const sumA = a.reduce((s, v) => s + v, 0); + const sumB = b.reduce((s, v) => s + v, 0); + if (sumA !== sumB) return sumA - sumB; + for (let i = 0; i < a.length; i++) { + if ((a[i] ?? 0) !== (b[i] ?? 0)) return (a[i] ?? 0) - (b[i] ?? 0); + } + return 0; + }); + } + + fit(X: Float64Array[]): this { + const nFeatures = (X[0] ?? new Float64Array(0)).length; + this.powers_ = this._generatePowers(nFeatures); + this.nOutputFeatures_ = this.powers_.length; + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.powers_ === null) throw new NotFittedError("PolynomialFeatures"); + const powers = this.powers_; + const nOut = powers.length; + + return X.map((xi) => { + const result = new Float64Array(nOut); + for (let k = 0; k < nOut; k++) { + const power = powers[k] ?? []; + let val = 1; + for (let j = 0; j < power.length; j++) { + const exp = power[j] ?? 0; + if (exp !== 0) val *= (xi[j] ?? 
0) ** exp; + } + result[k] = val; + } + return result; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} diff --git a/src/svm/index.ts b/src/svm/index.ts new file mode 100644 index 0000000..13e147f --- /dev/null +++ b/src/svm/index.ts @@ -0,0 +1 @@ +export * from "./svc.js"; diff --git a/src/svm/svc.ts b/src/svm/svc.ts new file mode 100644 index 0000000..20f5a73 --- /dev/null +++ b/src/svm/svc.ts @@ -0,0 +1,412 @@ +/** + * Support Vector Classifier and Regressor. + * Mirrors sklearn.svm.SVC and SVR. + * Uses a simplified SMO (Sequential Minimal Optimization) for binary SVC. + */ + +import { NotFittedError } from "../exceptions.js"; + +function rbfKernel( + a: Float64Array, + b: Float64Array, + gamma: number, +): number { + let dist2 = 0; + for (let i = 0; i < a.length; i++) { + dist2 += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2; + } + return Math.exp(-gamma * dist2); +} + +function linearKernel(a: Float64Array, b: Float64Array): number { + let dot = 0; + for (let i = 0; i < a.length; i++) { + dot += (a[i] ?? 0) * (b[i] ?? 0); + } + return dot; +} + +function polyKernel( + a: Float64Array, + b: Float64Array, + degree: number, + coef0: number, +): number { + let dot = coef0; + for (let i = 0; i < a.length; i++) { + dot += (a[i] ?? 0) * (b[i] ?? 0); + } + return dot ** degree; +} + +export class SVC { + C: number; + kernel: string; + degree: number; + gamma: number | "scale" | "auto"; + coef0: number; + tol: number; + maxIter: number; + + alpha_: Float64Array | null = null; + b_: number = 0; + supportVectors_: Float64Array[] | null = null; + supportLabels_: Float64Array | null = null; + classes_: Float64Array | null = null; + + private _gamma: number = 1; + + constructor( + options: { + C?: number; + kernel?: string; + degree?: number; + gamma?: number | "scale" | "auto"; + coef0?: number; + tol?: number; + maxIter?: number; + } = {}, + ) { + this.C = options.C ?? 1.0; + this.kernel = options.kernel ?? 
"rbf"; + this.degree = options.degree ?? 3; + this.gamma = options.gamma ?? "scale"; + this.coef0 = options.coef0 ?? 0.0; + this.tol = options.tol ?? 1e-3; + this.maxIter = options.maxIter ?? 1000; + } + + private _kernelFn(a: Float64Array, b: Float64Array): number { + if (this.kernel === "linear") return linearKernel(a, b); + if (this.kernel === "poly") return polyKernel(a, b, this.degree, this.coef0); + return rbfKernel(a, b, this._gamma); + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b); + this.classes_ = new Float64Array(uniqueClasses); + + // Compute gamma + if (this.gamma === "scale") { + let varSum = 0; + for (let j = 0; j < p; j++) { + let mean = 0; + for (let i = 0; i < n; i++) mean += (X[i] ?? new Float64Array(p))[j] ?? 0; + mean /= n; + for (let i = 0; i < n; i++) varSum += ((X[i] ?? new Float64Array(p))[j] ?? 0 - mean) ** 2; + } + this._gamma = p > 0 && varSum > 0 ? 1 / (p * varSum / (n * p)) : 1; + } else if (this.gamma === "auto") { + this._gamma = p > 0 ? 1 / p : 1; + } else { + this._gamma = this.gamma; + } + + // Map to ±1 + const posClass = uniqueClasses[uniqueClasses.length - 1] ?? 1; + const yLabels = new Float64Array(n); + for (let i = 0; i < n; i++) { + yLabels[i] = (y[i] ?? 0) === posClass ? 1 : -1; + } + + // SMO-lite + const alpha = new Float64Array(n); + let b = 0; + + // Compute kernel matrix + const K: number[][] = []; + for (let i = 0; i < n; i++) { + K[i] = []; + for (let j = 0; j < n; j++) { + (K[i] as number[])[j] = this._kernelFn( + X[i] ?? new Float64Array(p), + X[j] ?? new Float64Array(p), + ); + } + } + + for (let iter = 0; iter < this.maxIter; iter++) { + let numChanged = 0; + + for (let i = 0; i < n; i++) { + // Compute decision value + let fi = -b; + for (let k = 0; k < n; k++) { + fi += (alpha[k] ?? 0) * (yLabels[k] ?? 0) * ((K[i] as number[])[k] ?? 
0); + } + const Ei = fi - (yLabels[i] ?? 0); + + if ( + ((yLabels[i] ?? 0) * Ei < -this.tol && (alpha[i] ?? 0) < this.C) || + ((yLabels[i] ?? 0) * Ei > this.tol && (alpha[i] ?? 0) > 0) + ) { + // Pick j randomly + let j = Math.floor(Math.random() * n); + if (j === i) j = (j + 1) % n; + + let fj = -b; + for (let k = 0; k < n; k++) { + fj += (alpha[k] ?? 0) * (yLabels[k] ?? 0) * ((K[j] as number[])[k] ?? 0); + } + const Ej = fj - (yLabels[j] ?? 0); + + const alphaIOld = alpha[i] ?? 0; + const alphaJOld = alpha[j] ?? 0; + + // Compute bounds + let L: number; + let H: number; + if ((yLabels[i] ?? 0) !== (yLabels[j] ?? 0)) { + L = Math.max(0, alphaJOld - alphaIOld); + H = Math.min(this.C, this.C + alphaJOld - alphaIOld); + } else { + L = Math.max(0, alphaIOld + alphaJOld - this.C); + H = Math.min(this.C, alphaIOld + alphaJOld); + } + if (L >= H) continue; + + const eta = + 2 * ((K[i] as number[])[j] ?? 0) - + ((K[i] as number[])[i] ?? 0) - + ((K[j] as number[])[j] ?? 0); + if (eta >= 0) continue; + + let alphaJNew = alphaJOld - (yLabels[j] ?? 0) * (Ei - Ej) / eta; + alphaJNew = Math.min(H, Math.max(L, alphaJNew)); + if (Math.abs(alphaJNew - alphaJOld) < 1e-5) continue; + + alpha[j] = alphaJNew; + alpha[i] = + alphaIOld + + (yLabels[i] ?? 0) * (yLabels[j] ?? 0) * (alphaJOld - alphaJNew); + + // Update b + const b1 = + b + + Ei + + (yLabels[i] ?? 0) * ((alpha[i] ?? 0) - alphaIOld) * ((K[i] as number[])[i] ?? 0) + + (yLabels[j] ?? 0) * ((alpha[j] ?? 0) - alphaJOld) * ((K[i] as number[])[j] ?? 0); + const b2 = + b + + Ej + + (yLabels[i] ?? 0) * ((alpha[i] ?? 0) - alphaIOld) * ((K[i] as number[])[j] ?? 0) + + (yLabels[j] ?? 0) * ((alpha[j] ?? 0) - alphaJOld) * ((K[j] as number[])[j] ?? 0); + + if ((alpha[i] ?? 0) > 0 && (alpha[i] ?? 0) < this.C) b = b1; + else if ((alpha[j] ?? 0) > 0 && (alpha[j] ?? 
0) < this.C) b = b2; + else b = (b1 + b2) / 2; + + numChanged++; + } + } + + if (numChanged === 0) break; + } + + // Store support vectors + const svIdx: number[] = []; + for (let i = 0; i < n; i++) { + if ((alpha[i] ?? 0) > 1e-5) svIdx.push(i); + } + + this.alpha_ = new Float64Array(svIdx.map((i) => alpha[i] ?? 0)); + this.supportVectors_ = svIdx.map((i) => X[i] ?? new Float64Array(p)); + this.supportLabels_ = new Float64Array(svIdx.map((i) => yLabels[i] ?? 0)); + this.b_ = b; + + return this; + } + + decision_function(X: Float64Array[]): Float64Array { + if (this.alpha_ === null) throw new NotFittedError("SVC"); + const sv = this.supportVectors_ as Float64Array[]; + const svLabels = this.supportLabels_ as Float64Array; + return new Float64Array( + X.map((xi) => { + let val = -this.b_; + for (let k = 0; k < sv.length; k++) { + val += + (this.alpha_![k] ?? 0) * + (svLabels[k] ?? 0) * + this._kernelFn(xi, sv[k] ?? new Float64Array(0)); + } + return val; + }), + ); + } + + predict(X: Float64Array[]): Float64Array { + if (this.classes_ === null) throw new NotFittedError("SVC"); + const classes = this.classes_; + const dv = this.decision_function(X); + const posClass = classes[classes.length - 1] ?? 1; + const negClass = classes[0] ?? 0; + return new Float64Array(dv.map((v) => (v >= 0 ? 
posClass : negClass))); + } + + score(X: Float64Array[], y: Float64Array): number { + const pred = this.predict(X); + let correct = 0; + for (let i = 0; i < y.length; i++) { + if (pred[i] === y[i]) correct++; + } + return correct / y.length; + } +} + +export class SVR { + C: number; + kernel: string; + degree: number; + gamma: number | "scale" | "auto"; + coef0: number; + epsilon: number; + tol: number; + maxIter: number; + + alpha_: Float64Array | null = null; + b_: number = 0; + supportVectors_: Float64Array[] | null = null; + dualCoef_: Float64Array | null = null; + + private _gamma: number = 1; + + constructor( + options: { + C?: number; + kernel?: string; + degree?: number; + gamma?: number | "scale" | "auto"; + coef0?: number; + epsilon?: number; + tol?: number; + maxIter?: number; + } = {}, + ) { + this.C = options.C ?? 1.0; + this.kernel = options.kernel ?? "rbf"; + this.degree = options.degree ?? 3; + this.gamma = options.gamma ?? "scale"; + this.coef0 = options.coef0 ?? 0.0; + this.epsilon = options.epsilon ?? 0.1; + this.tol = options.tol ?? 1e-3; + this.maxIter = options.maxIter ?? 1000; + } + + private _kernelFn(a: Float64Array, b: Float64Array): number { + if (this.kernel === "linear") return linearKernel(a, b); + if (this.kernel === "poly") return polyKernel(a, b, this.degree, this.coef0); + return rbfKernel(a, b, this._gamma); + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + + if (this.gamma === "scale") { + let varSum = 0; + for (let j = 0; j < p; j++) { + let mean = 0; + for (let i = 0; i < n; i++) mean += (X[i] ?? new Float64Array(p))[j] ?? 0; + mean /= n; + for (let i = 0; i < n; i++) varSum += (((X[i] ?? new Float64Array(p))[j] ?? 0) - mean) ** 2; + } + this._gamma = p > 0 && varSum > 0 ? n / varSum : 1; + } else if (this.gamma === "auto") { + this._gamma = p > 0 ? 
1 / p : 1; + } else { + this._gamma = this.gamma; + } + + // Dual form: alpha - alpha* (simplified gradient descent) + const dualCoef = new Float64Array(n); // alpha_i - alpha_i* + let b = 0; + + const K: number[][] = []; + for (let i = 0; i < n; i++) { + K[i] = []; + for (let j = 0; j < n; j++) { + (K[i] as number[])[j] = this._kernelFn( + X[i] ?? new Float64Array(p), + X[j] ?? new Float64Array(p), + ); + } + } + + const lr = 0.01; + for (let iter = 0; iter < this.maxIter; iter++) { + let maxDelta = 0; + for (let i = 0; i < n; i++) { + let pred = b; + for (let k = 0; k < n; k++) { + pred += (dualCoef[k] ?? 0) * ((K[i] as number[])[k] ?? 0); + } + const err = pred - (y[i] ?? 0); + let grad = 0; + if (err > this.epsilon) grad = 1; + else if (err < -this.epsilon) grad = -1; + + const newCoef = Math.min( + this.C, + Math.max(-this.C, (dualCoef[i] ?? 0) - lr * grad), + ); + const delta = Math.abs(newCoef - (dualCoef[i] ?? 0)); + if (delta > maxDelta) maxDelta = delta; + dualCoef[i] = newCoef; + } + + let predSum = 0; + for (let i = 0; i < n; i++) { + let pred = 0; + for (let k = 0; k < n; k++) { + pred += (dualCoef[k] ?? 0) * ((K[i] as number[])[k] ?? 0); + } + predSum += (y[i] ?? 0) - pred; + } + b = predSum / n; + + if (maxDelta < this.tol) break; + } + + const svIdx: number[] = []; + for (let i = 0; i < n; i++) { + if (Math.abs(dualCoef[i] ?? 0) > 1e-5) svIdx.push(i); + } + + this.dualCoef_ = new Float64Array(svIdx.map((i) => dualCoef[i] ?? 0)); + this.supportVectors_ = svIdx.map((i) => X[i] ?? new Float64Array(p)); + this.b_ = b; + + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (this.dualCoef_ === null) throw new NotFittedError("SVR"); + const sv = this.supportVectors_ as Float64Array[]; + return new Float64Array( + X.map((xi) => { + let val = this.b_; + for (let k = 0; k < sv.length; k++) { + val += + (this.dualCoef_![k] ?? 0) * + this._kernelFn(xi, sv[k] ?? 
new Float64Array(0)); + } + return val; + }), + ); + } + + score(X: Float64Array[], y: Float64Array): number { + const yPred = this.predict(X); + const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length; + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < y.length; i++) { + ssTot += ((y[i] ?? 0) - yMean) ** 2; + ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2; + } + return ssTot > 0 ? 1 - ssRes / ssTot : 0; + } +} diff --git a/src/tree/decision_tree.ts b/src/tree/decision_tree.ts new file mode 100644 index 0000000..b481eea --- /dev/null +++ b/src/tree/decision_tree.ts @@ -0,0 +1,251 @@ +/** + * Decision Tree Classifier and Regressor. + * Mirrors sklearn.tree.DecisionTreeClassifier / DecisionTreeRegressor. + */ + +import { NotFittedError } from "../exceptions.js"; + +interface TreeNode { + featureIndex: number; + threshold: number; + left: TreeNode | null; + right: TreeNode | null; + value: Float64Array; + isLeaf: boolean; +} + +function giniImpurity(y: number[]): number { + const counts = new Map(); + for (const label of y) counts.set(label, (counts.get(label) ?? 0) + 1); + let impurity = 1; + for (const count of counts.values()) { + impurity -= (count / y.length) ** 2; + } + return impurity; +} + +function mse(y: number[]): number { + if (y.length === 0) return 0; + const mean = y.reduce((a, b) => a + b, 0) / y.length; + return y.reduce((s, v) => s + (v - mean) ** 2, 0) / y.length; +} + +function classificationLeafValue(y: number[]): Float64Array { + const counts = new Map(); + for (const label of y) counts.set(label, (counts.get(label) ?? 
0) + 1);
  let best = 0;
  let bestCount = 0;
  for (const [label, count] of counts) {
    if (count > bestCount) {
      bestCount = count;
      best = label;
    }
  }
  return new Float64Array([best]);
}

/**
 * Leaf value for a regression tree: the mean of the targets that reach
 * the leaf (NaN for an empty target list, matching the reduce-based form).
 */
function regressionLeafValue(y: number[]): Float64Array {
  let total = 0;
  for (const v of y) total += v;
  return new Float64Array([total / y.length]);
}

/**
 * Recursively grow a CART-style tree by exhaustive threshold search.
 * Splits greedily on the (feature, threshold) pair with the largest
 * impurity decrease; stops at maxDepth, minSamplesSplit, purity, or when
 * no split yields positive gain.
 */
function buildTree(
  X: Float64Array[],
  y: number[],
  depth: number,
  maxDepth: number,
  minSamplesSplit: number,
  criterion: "gini" | "mse",
): TreeNode {
  const impurityOf = criterion === "gini" ? giniImpurity : mse;
  // The leaf value is computed up front because internal nodes also carry it.
  const leafValue =
    criterion === "gini" ? classificationLeafValue(y) : regressionLeafValue(y);
  const makeLeaf = (): TreeNode => ({
    featureIndex: -1,
    threshold: 0,
    left: null,
    right: null,
    value: leafValue,
    isLeaf: true,
  });

  const isPure = new Set(y).size === 1;
  if (depth >= maxDepth || y.length < minSamplesSplit || isPure) {
    return makeLeaf();
  }

  const nFeatures = (X[0] ?? new Float64Array(0)).length;
  const parentImpurity = impurityOf(y);
  let bestGain = -Infinity;
  let bestFeature = 0;
  let bestThreshold = 0;

  for (let feature = 0; feature < nFeatures; feature++) {
    // Candidate thresholds: midpoints between consecutive unique values.
    const unique = Array.from(new Set(X.map((row) => row[feature] ?? 0))).sort(
      (a, b) => a - b,
    );
    for (let t = 0; t + 1 < unique.length; t++) {
      const threshold = ((unique[t] ?? 0) + (unique[t + 1] ?? 0)) / 2;
      const leftY: number[] = [];
      const rightY: number[] = [];
      for (let i = 0; i < X.length; i++) {
        const v = (X[i] ?? new Float64Array(0))[feature] ?? 0;
        if (v <= threshold) {
          leftY.push(y[i] ?? 0);
        } else {
          rightY.push(y[i] ?? 0);
        }
      }
      if (leftY.length === 0 || rightY.length === 0) continue;

      const n = y.length;
      const gain =
        parentImpurity -
        (leftY.length / n) * impurityOf(leftY) -
        (rightY.length / n) * impurityOf(rightY);
      if (gain > bestGain) {
        bestGain = gain;
        bestFeature = feature;
        bestThreshold = threshold;
      }
    }
  }

  if (bestGain <= 0) return makeLeaf();

  const leftIdx: number[] = [];
  const rightIdx: number[] = [];
  for (let i = 0; i < X.length; i++) {
    const v = (X[i] ?? new Float64Array(0))[bestFeature] ?? 0;
    (v <= bestThreshold ? leftIdx : rightIdx).push(i);
  }
  const subset = (idx: number[]) => ({
    x: idx.map((i) => X[i] ?? new Float64Array(0)),
    y: idx.map((i) => y[i] ?? 0),
  });
  const left = subset(leftIdx);
  const right = subset(rightIdx);

  return {
    featureIndex: bestFeature,
    threshold: bestThreshold,
    left: buildTree(left.x, left.y, depth + 1, maxDepth, minSamplesSplit, criterion),
    right: buildTree(right.x, right.y, depth + 1, maxDepth, minSamplesSplit, criterion),
    value: leafValue,
    isLeaf: false,
  };
}

/** Walk the tree for one sample and return the stored leaf value. */
function predict1(node: TreeNode, x: Float64Array): number {
  let current = node;
  while (!current.isLeaf) {
    const v = x[current.featureIndex] ?? 0;
    current = (v <= current.threshold ? current.left : current.right) as TreeNode;
  }
  return current.value[0] ?? 0;
}

/**
 * Decision tree classifier (majority-vote leaves).
 * Mirrors sklearn.tree.DecisionTreeClassifier.
 */
export class DecisionTreeClassifier {
  maxDepth: number;
  minSamplesSplit: number;
  criterion: string;

  tree_: TreeNode | null = null;
  classes_: Float64Array | null = null;
  nFeatures_: number = 0;

  constructor(
    options: {
      maxDepth?: number;
      minSamplesSplit?: number;
      criterion?: string;
    } = {},
  ) {
    this.maxDepth = options.maxDepth ?? Infinity;
    this.minSamplesSplit = options.minSamplesSplit ?? 2;
    this.criterion = options.criterion ?? "gini";
  }

  fit(X: Float64Array[], y: Float64Array): this {
    this.nFeatures_ = (X[0] ?? new Float64Array(0)).length;
    const labels = Array.from(new Set(Array.from(y)));
    labels.sort((a, b) => a - b);
    this.classes_ = new Float64Array(labels);
    // NOTE(review): the split criterion is hard-coded to "gini" here, so
    // `this.criterion` is currently ignored — confirm this is intentional.
    this.tree_ = buildTree(
      X,
      Array.from(y),
      0,
      this.maxDepth,
      this.minSamplesSplit,
      "gini",
    );
    return this;
  }

  predict(X: Float64Array[]): Float64Array {
    const root = this.tree_;
    if (root === null) throw new NotFittedError("DecisionTreeClassifier");
    const out = new Float64Array(X.length);
    for (let i = 0; i < X.length; i++) {
      out[i] = predict1(root, X[i] as Float64Array);
    }
    return out;
  }

  /** Mean accuracy on (X, y). */
  score(X: Float64Array[], y: Float64Array): number {
    const pred = this.predict(X);
    let hits = 0;
    for (let i = 0; i < y.length; i++) {
      if (pred[i] === y[i]) hits += 1;
    }
    return hits / y.length;
  }

  /** Hard (one-hot) class probabilities from the predicted leaf label. */
  predictProba(X: Float64Array[]): Float64Array[] {
    if (this.tree_ === null || this.classes_ === null)
      throw new NotFittedError("DecisionTreeClassifier");
    const classes = Array.from(this.classes_);
    return X.map((xi) => {
      const proba = new Float64Array(classes.length);
      const idx = classes.indexOf(predict1(this.tree_ as TreeNode, xi));
      if (idx >= 0) proba[idx] = 1;
      return proba;
    });
  }
}

/**
 * Decision tree regressor (mean-value leaves, MSE splits).
 * Mirrors sklearn.tree.DecisionTreeRegressor.
 */
export class DecisionTreeRegressor {
  maxDepth: number;
  minSamplesSplit: number;

  tree_: TreeNode | null = null;
  nFeatures_: number = 0;

  constructor(options: { maxDepth?: number; minSamplesSplit?: number } = {}) {
    this.maxDepth = options.maxDepth ?? Infinity;
    this.minSamplesSplit = options.minSamplesSplit ?? 2;
  }

  fit(X: Float64Array[], y: Float64Array): this {
    this.nFeatures_ = (X[0] ?? new Float64Array(0)).length;
    this.tree_ = buildTree(
      X,
      Array.from(y),
      0,
      this.maxDepth,
      this.minSamplesSplit,
      "mse",
    );
    return this;
  }

  predict(X: Float64Array[]): Float64Array {
    const root = this.tree_;
    if (root === null) throw new NotFittedError("DecisionTreeRegressor");
    const out = new Float64Array(X.length);
    for (let i = 0; i < X.length; i++) {
      out[i] = predict1(root, X[i] as Float64Array);
    }
    return out;
  }

  /** R^2 score; 0 when the targets have no variance. */
  score(X: Float64Array[], y: Float64Array): number {
    const yPred = this.predict(X);
    let yMean = 0;
    for (const v of y) yMean += v;
    yMean /= y.length;
    let ssTot = 0;
    let ssRes = 0;
    for (let i = 0; i < y.length; i++) {
      ssTot += ((y[i] ?? 0) - yMean) ** 2;
      ssRes += ((y[i] ?? 0) - (yPred[i] ?? 0)) ** 2;
    }
    return ssTot > 0 ? 1 - ssRes / ssTot : 0;
  }
}

// ---- src/tree/index.ts ----
export * from "./decision_tree.js";

// ---- git format-patch boundaries (metadata from the mangled dump) ----
// [PATCH 2/5] ci: trigger checks
// [PATCH 3/5] Iteration 9: Add manifold, mixture, semi_supervised,
//   feature_extraction, multioutput, kernel_ridge, gaussian_process,
//   pairwise metrics, RobustScaler
src/manifold/tsne.ts | 339 ++++++++++++++++++++++ src/metrics/index.ts | 1 + src/metrics/pairwise.ts | 137 +++++++++ src/mixture/gaussian_mixture.ts | 179 ++++++++++++ src/mixture/index.ts | 1 + src/multioutput/index.ts | 1 + src/multioutput/multioutput.ts | 177 +++++++++++ src/preprocessing/index.ts | 1 + src/preprocessing/robust_scaler.ts | 118 ++++++++ src/semi_supervised/index.ts | 1 + src/semi_supervised/label_propagation.ts | 144 +++++++++ 19 files changed, 1588 insertions(+) create mode 100644 src/feature_extraction/dict_vectorizer.ts create mode 100644 src/feature_extraction/index.ts create mode 100644 src/gaussian_process/gp.ts create mode 100644 src/gaussian_process/index.ts create mode 100644 src/kernel_ridge/index.ts create mode 100644 src/kernel_ridge/kernel_ridge.ts create mode 100644 src/manifold/index.ts create mode 100644 src/manifold/tsne.ts create mode 100644 src/metrics/pairwise.ts create mode 100644 src/mixture/gaussian_mixture.ts create mode 100644 src/mixture/index.ts create mode 100644 src/multioutput/index.ts create mode 100644 src/multioutput/multioutput.ts create mode 100644 src/preprocessing/robust_scaler.ts create mode 100644 src/semi_supervised/index.ts create mode 100644 src/semi_supervised/label_propagation.ts diff --git a/src/feature_extraction/dict_vectorizer.ts b/src/feature_extraction/dict_vectorizer.ts new file mode 100644 index 0000000..008a8c4 --- /dev/null +++ b/src/feature_extraction/dict_vectorizer.ts @@ -0,0 +1,134 @@ +/** + * Feature extraction: DictVectorizer and FeatureHasher. + * Mirrors sklearn.feature_extraction.DictVectorizer and FeatureHasher. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +export interface DictVectorizerOptions { + sparse?: boolean; + dtype?: "float64" | "float32"; + separator?: string; + sort?: boolean; +} + +export class DictVectorizer { + sparse: boolean; + separator: string; + sort: boolean; + + featureNames_: string[] | null = null; + vocabulary_: Map | null = null; + + constructor(options: DictVectorizerOptions = {}) { + this.sparse = options.sparse ?? false; + this.separator = options.separator ?? "="; + this.sort = options.sort ?? true; + } + + fit(X: Record[]): this { + const featureSet = new Set(); + for (const sample of X) { + for (const [key, value] of Object.entries(sample)) { + if (typeof value === "number") { + featureSet.add(key); + } else { + featureSet.add(`${key}${this.separator}${value}`); + } + } + } + let features = Array.from(featureSet); + if (this.sort) features = features.sort(); + this.featureNames_ = features; + this.vocabulary_ = new Map(features.map((f, i) => [f, i])); + return this; + } + + transform(X: Record[]): Float64Array[] { + if (!this.vocabulary_ || !this.featureNames_) throw new NotFittedError("DictVectorizer is not fitted."); + const p = this.featureNames_.length; + return X.map(sample => { + const row = new Float64Array(p); + for (const [key, value] of Object.entries(sample)) { + let featureName: string; + let featureVal: number; + if (typeof value === "number") { + featureName = key; + featureVal = value; + } else { + featureName = `${key}${this.separator}${value}`; + featureVal = 1; + } + const idx = this.vocabulary_!.get(featureName); + if (idx !== undefined) row[idx] = featureVal; + } + return row; + }); + } + + fitTransform(X: Record[]): Float64Array[] { + return this.fit(X).transform(X); + } + + inverseTransform(X: Float64Array[]): Record[] { + if (!this.featureNames_) throw new NotFittedError("DictVectorizer is not fitted."); + return X.map(row => { + const result: Record = {}; + for (let j = 0; j < row.length; j++) { + 
const v = row[j] ?? 0; + if (v !== 0) result[this.featureNames_![j] ?? `f${j}`] = v; + } + return result; + }); + } + + getFeatureNames(): string[] { + if (!this.featureNames_) throw new NotFittedError("DictVectorizer is not fitted."); + return this.featureNames_; + } +} + +export interface FeatureHasherOptions { + nFeatures?: number; + inputType?: "dict" | "pair" | "string"; + dtype?: "float64" | "float32"; + alternateSign?: boolean; +} + +export class FeatureHasher { + nFeatures: number; + alternateSign: boolean; + + constructor(options: FeatureHasherOptions = {}) { + this.nFeatures = options.nFeatures ?? 1048576; + this.alternateSign = options.alternateSign ?? true; + } + + private _hash(s: string): number { + let h = 5381; + for (let i = 0; i < s.length; i++) { + h = ((h << 5) + h + s.charCodeAt(i)) >>> 0; + } + return h; + } + + transform(X: Record[]): Float64Array[] { + const p = this.nFeatures; + return X.map(sample => { + const row = new Float64Array(p); + for (const [key, value] of Object.entries(sample)) { + const h = this._hash(key); + const idx = h % p; + const sign = this.alternateSign ? ((h >>> 31) ? -1 : 1) : 1; + row[idx] = (row[idx] ?? 0) + sign * value; + } + return row; + }); + } + + fit(_X: Record[]): this { return this; } + + fitTransform(X: Record[]): Float64Array[] { + return this.transform(X); + } +} diff --git a/src/feature_extraction/index.ts b/src/feature_extraction/index.ts new file mode 100644 index 0000000..ff90a7a --- /dev/null +++ b/src/feature_extraction/index.ts @@ -0,0 +1 @@ +export * from "./dict_vectorizer.js"; diff --git a/src/gaussian_process/gp.ts b/src/gaussian_process/gp.ts new file mode 100644 index 0000000..8c1138e --- /dev/null +++ b/src/gaussian_process/gp.ts @@ -0,0 +1,183 @@ +/** + * Gaussian Process Regressor and Classifier. + * Mirrors sklearn.gaussian_process.GaussianProcessRegressor. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +export interface GPKernel { + compute(X1: Float64Array[], X2: Float64Array[]): Float64Array[]; + diag(X: Float64Array[]): Float64Array; +} + +export class RBFKernel implements GPKernel { + lengthScale: number; + constructor(lengthScale = 1.0) { + this.lengthScale = lengthScale; + } + + compute(X1: Float64Array[], X2: Float64Array[]): Float64Array[] { + const n = X1.length; + const m = X2.length; + const K: Float64Array[] = Array.from({ length: n }, () => new Float64Array(m)); + for (let i = 0; i < n; i++) { + const xi = X1[i] ?? new Float64Array(0); + for (let j = 0; j < m; j++) { + const xj = X2[j] ?? new Float64Array(0); + let dSq = 0; + for (let k = 0; k < xi.length; k++) dSq += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2; + (K[i] as Float64Array)[j] = Math.exp(-0.5 * dSq / (this.lengthScale ** 2)); + } + } + return K; + } + + diag(X: Float64Array[]): Float64Array { + return new Float64Array(X.length).fill(1); + } +} + +export class ConstantKernel implements GPKernel { + constantValue: number; + constructor(constantValue = 1.0) { + this.constantValue = constantValue; + } + + compute(X1: Float64Array[], X2: Float64Array[]): Float64Array[] { + return Array.from({ length: X1.length }, () => new Float64Array(X2.length).fill(this.constantValue)); + } + + diag(X: Float64Array[]): Float64Array { + return new Float64Array(X.length).fill(this.constantValue); + } +} + +export interface GaussianProcessRegressorOptions { + kernel?: GPKernel | null; + alpha?: number; + normalizeY?: boolean; +} + +export class GaussianProcessRegressor { + kernel: GPKernel; + alpha: number; + normalizeY: boolean; + + xTrain_: Float64Array[] | null = null; + yTrain_: Float64Array | null = null; + alpha_: Float64Array | null = null; + L_: Float64Array[] | null = null; + yTrainMean_: number = 0; + yTrainStd_: number = 1; + + constructor(options: GaussianProcessRegressorOptions = {}) { + this.kernel = options.kernel ?? 
new RBFKernel(); + this.alpha = options.alpha ?? 1e-10; + this.normalizeY = options.normalizeY ?? false; + } + + private _choleskyDecomp(A: Float64Array[]): Float64Array[] { + const n = A.length; + const L: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = 0; j <= i; j++) { + let sum = (A[i] as Float64Array)[j] ?? 0; + for (let k = 0; k < j; k++) sum -= ((L[i] as Float64Array)[k] ?? 0) * ((L[j] as Float64Array)[k] ?? 0); + if (i === j) { + (L[i] as Float64Array)[j] = Math.sqrt(Math.max(sum, 0)); + } else { + const ljj = (L[j] as Float64Array)[j] ?? 1; + (L[i] as Float64Array)[j] = ljj !== 0 ? sum / ljj : 0; + } + } + } + return L; + } + + private _solveLower(L: Float64Array[], b: Float64Array): Float64Array { + const n = b.length; + const x = new Float64Array(n); + for (let i = 0; i < n; i++) { + let sum = b[i] ?? 0; + for (let j = 0; j < i; j++) sum -= ((L[i] as Float64Array)[j] ?? 0) * (x[j] ?? 0); + x[i] = sum / ((L[i] as Float64Array)[i] ?? 1); + } + return x; + } + + private _solveUpper(Lt: Float64Array[], b: Float64Array): Float64Array { + const n = b.length; + const x = new Float64Array(n); + for (let i = n - 1; i >= 0; i--) { + let sum = b[i] ?? 0; + for (let j = i + 1; j < n; j++) sum -= ((Lt[j] as Float64Array)[i] ?? 0) * (x[j] ?? 0); + x[i] = sum / ((Lt[i] as Float64Array)[i] ?? 1); + } + return x; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + this.xTrain_ = X; + + let yNorm = new Float64Array(y); + if (this.normalizeY) { + let mean = 0; + for (let i = 0; i < n; i++) mean += y[i] ?? 0; + mean /= n; + let std = 0; + for (let i = 0; i < n; i++) std += ((y[i] ?? 
0) - mean) ** 2; + std = Math.sqrt(std / n) || 1; + this.yTrainMean_ = mean; + this.yTrainStd_ = std; + yNorm = Float64Array.from(y.map(v => (v - mean) / std)); + } + this.yTrain_ = yNorm; + + const K = this.kernel.compute(X, X); + for (let i = 0; i < n; i++) (K[i] as Float64Array)[i] = ((K[i] as Float64Array)[i] ?? 0) + this.alpha; + + this.L_ = this._choleskyDecomp(K); + const v = this._solveLower(this.L_, yNorm); + this.alpha_ = this._solveUpper(this.L_, v); + return this; + } + + predict(X: Float64Array[], returnStd = false): { mean: Float64Array; std?: Float64Array } { + if (!this.xTrain_ || !this.alpha_ || !this.L_) throw new NotFittedError("GaussianProcessRegressor is not fitted."); + const KStar = this.kernel.compute(X, this.xTrain_); + const n = X.length; + const mean = new Float64Array(n); + for (let i = 0; i < n; i++) { + let sum = 0; + for (let j = 0; j < this.xTrain_.length; j++) sum += ((KStar[i] as Float64Array)[j] ?? 0) * (this.alpha_[j] ?? 0); + mean[i] = sum * this.yTrainStd_ + this.yTrainMean_; + } + + if (!returnStd) return { mean }; + + const kDiag = this.kernel.diag(X); + const std = new Float64Array(n); + for (let i = 0; i < n; i++) { + const v = this._solveLower(this.L_, KStar[i] as Float64Array); + let vSq = 0; + for (let j = 0; j < v.length; j++) vSq += (v[j] ?? 0) ** 2; + std[i] = Math.sqrt(Math.max((kDiag[i] ?? 0) - vSq, 0)) * this.yTrainStd_; + } + return { mean, std }; + } + + score(X: Float64Array[], y: Float64Array): number { + const { mean: preds } = this.predict(X); + const n = y.length; + let ymean = 0; + for (let i = 0; i < n; i++) ymean += y[i] ?? 0; + ymean /= n; + let ssRes = 0; let ssTot = 0; + for (let i = 0; i < n; i++) { + ssRes += ((y[i] ?? 0) - (preds[i] ?? 0)) ** 2; + ssTot += ((y[i] ?? 
0) - ymean) ** 2; + } + return 1 - ssRes / (ssTot || 1); + } +} diff --git a/src/gaussian_process/index.ts b/src/gaussian_process/index.ts new file mode 100644 index 0000000..695dc41 --- /dev/null +++ b/src/gaussian_process/index.ts @@ -0,0 +1 @@ +export * from "./gp.js"; diff --git a/src/index.ts b/src/index.ts index 56dcb93..0ee2325 100644 --- a/src/index.ts +++ b/src/index.ts @@ -72,3 +72,24 @@ export * from "./multiclass/index.js"; // Calibration export * from "./calibration/index.js"; +// Manifold +export * from "./manifold/index.js"; + +// Mixture +export * from "./mixture/index.js"; + +// Semi-supervised +export * from "./semi_supervised/index.js"; + +// Feature extraction +export * from "./feature_extraction/index.js"; + +// Multioutput +export * from "./multioutput/index.js"; + +// Kernel ridge +export * from "./kernel_ridge/index.js"; + +// Gaussian process +export * from "./gaussian_process/index.js"; + diff --git a/src/kernel_ridge/index.ts b/src/kernel_ridge/index.ts new file mode 100644 index 0000000..04590f5 --- /dev/null +++ b/src/kernel_ridge/index.ts @@ -0,0 +1 @@ +export * from "./kernel_ridge.js"; diff --git a/src/kernel_ridge/kernel_ridge.ts b/src/kernel_ridge/kernel_ridge.ts new file mode 100644 index 0000000..8300f5a --- /dev/null +++ b/src/kernel_ridge/kernel_ridge.ts @@ -0,0 +1,147 @@ +/** + * KernelRidge regression. + * Mirrors sklearn.kernel_ridge.KernelRidge. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +export type KernelType = "linear" | "rbf" | "poly" | "sigmoid"; + +export interface KernelRidgeOptions { + alpha?: number; + kernel?: KernelType; + gamma?: number | null; + degree?: number; + coef0?: number; +} + +function computeKernel( + X: Float64Array[], + Y: Float64Array[], + kernel: KernelType, + gamma: number, + degree: number, + coef0: number, +): Float64Array[] { + const n = X.length; + const m = Y.length; + const K: Float64Array[] = Array.from({ length: n }, () => new Float64Array(m)); + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(0); + for (let j = 0; j < m; j++) { + const yj = Y[j] ?? new Float64Array(0); + let dot = 0; + for (let k = 0; k < xi.length; k++) dot += (xi[k] ?? 0) * (yj[k] ?? 0); + let val: number; + if (kernel === "linear") { + val = dot; + } else if (kernel === "rbf") { + let distSq = 0; + for (let k = 0; k < xi.length; k++) distSq += ((xi[k] ?? 0) - (yj[k] ?? 0)) ** 2; + val = Math.exp(-gamma * distSq); + } else if (kernel === "poly") { + val = (gamma * dot + coef0) ** degree; + } else { // sigmoid + val = Math.tanh(gamma * dot + coef0); + } + (K[i] as Float64Array)[j] = val; + } + } + return K; +} + +export class KernelRidge { + alpha: number; + kernel: KernelType; + gamma: number | null; + degree: number; + coef0: number; + + dualCoef_: Float64Array | null = null; + xFit_: Float64Array[] | null = null; + + constructor(options: KernelRidgeOptions = {}) { + this.alpha = options.alpha ?? 1; + this.kernel = options.kernel ?? "linear"; + this.gamma = options.gamma ?? null; + this.degree = options.degree ?? 3; + this.coef0 = options.coef0 ?? 1; + } + + fit(X: Float64Array[], y: Float64Array): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const gamma = this.gamma ?? (p > 0 ? 
1 / p : 1); + + const K = computeKernel(X, X, this.kernel, gamma, this.degree, this.coef0); + // Add alpha * I + for (let i = 0; i < n; i++) (K[i] as Float64Array)[i] = ((K[i] as Float64Array)[i] ?? 0) + this.alpha; + + // Solve (K + alpha*I) * dual_coef = y using Cholesky-like (Gaussian elimination) + // Simple Gaussian elimination with partial pivoting + const aug = K.map((row, i) => { + const r = new Float64Array(n + 1); + for (let j = 0; j < n; j++) r[j] = (row as Float64Array)[j] ?? 0; + r[n] = y[i] ?? 0; + return r; + }); + + for (let col = 0; col < n; col++) { + // Find pivot + let maxRow = col; + let maxVal = Math.abs((aug[col] as Float64Array)[col] ?? 0); + for (let row = col + 1; row < n; row++) { + const v = Math.abs((aug[row] as Float64Array)[col] ?? 0); + if (v > maxVal) { maxVal = v; maxRow = row; } + } + if (maxRow !== col) { [aug[col], aug[maxRow]] = [aug[maxRow] as Float64Array, aug[col] as Float64Array]; } + const pivot = (aug[col] as Float64Array)[col] ?? 0; + if (Math.abs(pivot) < 1e-12) continue; + for (let row = 0; row < n; row++) { + if (row === col) continue; + const factor = ((aug[row] as Float64Array)[col] ?? 0) / pivot; + for (let j = col; j <= n; j++) { + (aug[row] as Float64Array)[j] = ((aug[row] as Float64Array)[j] ?? 0) - factor * ((aug[col] as Float64Array)[j] ?? 0); + } + } + for (let j = col + 1; j <= n; j++) { + (aug[col] as Float64Array)[j] = ((aug[col] as Float64Array)[j] ?? 0) / pivot; + } + (aug[col] as Float64Array)[col] = 1; + } + + this.dualCoef_ = Float64Array.from(aug.map(row => (row as Float64Array)[n] ?? 0)); + this.xFit_ = X; + return this; + } + + predict(X: Float64Array[]): Float64Array { + if (!this.dualCoef_ || !this.xFit_) throw new NotFittedError("KernelRidge is not fitted."); + const p = (this.xFit_[0] ?? new Float64Array(0)).length; + const gamma = this.gamma ?? (p > 0 ? 
1 / p : 1); + const K = computeKernel(X, this.xFit_, this.kernel, gamma, this.degree, this.coef0); + const n = X.length; + const nTrain = this.xFit_.length; + const preds = new Float64Array(n); + for (let i = 0; i < n; i++) { + let sum = 0; + for (let j = 0; j < nTrain; j++) sum += ((K[i] as Float64Array)[j] ?? 0) * (this.dualCoef_[j] ?? 0); + preds[i] = sum; + } + return preds; + } + + score(X: Float64Array[], y: Float64Array): number { + const preds = this.predict(X); + const n = y.length; + let mean = 0; + for (let i = 0; i < n; i++) mean += y[i] ?? 0; + mean /= n; + let ssRes = 0; let ssTot = 0; + for (let i = 0; i < n; i++) { + ssRes += ((y[i] ?? 0) - (preds[i] ?? 0)) ** 2; + ssTot += ((y[i] ?? 0) - mean) ** 2; + } + return 1 - ssRes / (ssTot || 1); + } +} diff --git a/src/manifold/index.ts b/src/manifold/index.ts new file mode 100644 index 0000000..7ebfce5 --- /dev/null +++ b/src/manifold/index.ts @@ -0,0 +1 @@ +export * from "./tsne.js"; diff --git a/src/manifold/tsne.ts b/src/manifold/tsne.ts new file mode 100644 index 0000000..c9704f2 --- /dev/null +++ b/src/manifold/tsne.ts @@ -0,0 +1,339 @@ +/** + * t-SNE (t-distributed Stochastic Neighbor Embedding). + * Mirrors sklearn.manifold.TSNE. + */ + +import { NotFittedError } from "../exceptions.js"; + +export interface TSNEOptions { + nComponents?: number; + perplexity?: number; + learningRate?: number | "auto"; + nIter?: number; + earlyExaggeration?: number; + randomState?: number | null; + verbose?: number; +} + +export class TSNE { + nComponents: number; + perplexity: number; + learningRate: number | "auto"; + nIter: number; + earlyExaggeration: number; + + embedding_: Float64Array[] | null = null; + klDivergence_: number | null = null; + nIter_: number | null = null; + + constructor(options: TSNEOptions = {}) { + this.nComponents = options.nComponents ?? 2; + this.perplexity = options.perplexity ?? 30; + this.learningRate = options.learningRate ?? "auto"; + this.nIter = options.nIter ?? 
1000; + this.earlyExaggeration = options.earlyExaggeration ?? 12; + } + + private _pairwiseDistSq(X: Float64Array[]): Float64Array[] { + const n = X.length; + const D: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = i + 1; j < n; j++) { + let d = 0; + const xi = X[i] ?? new Float64Array(0); + const xj = X[j] ?? new Float64Array(0); + for (let k = 0; k < xi.length; k++) { + d += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2; + } + (D[i] as Float64Array)[j] = d; + (D[j] as Float64Array)[i] = d; + } + } + return D; + } + + private _binarySearchPerplexity( + di: Float64Array, + targetPerp: number, + i: number, + ): Float64Array { + const n = di.length; + const pi = new Float64Array(n); + let beta = 1.0; + const betaMin = -Infinity; + const betaMax = Infinity; + let betaMinL = betaMin; + let betaMaxL = betaMax; + const tol = 1e-5; + const maxIter = 50; + + for (let iter = 0; iter < maxIter; iter++) { + let sumP = 0; + for (let j = 0; j < n; j++) { + if (j === i) { pi[j] = 0; continue; } + pi[j] = Math.exp(-((di[j] ?? 0) * beta)); + sumP += pi[j] ?? 0; + } + if (sumP === 0) sumP = 1e-10; + let H = 0; + for (let j = 0; j < n; j++) { + if (j === i) continue; + const p = (pi[j] ?? 0) / sumP; + if (p > 1e-10) H -= p * Math.log2(p); + pi[j] = p; + } + const hDiff = H - Math.log2(targetPerp); + if (Math.abs(hDiff) < tol) break; + if (hDiff > 0) { + betaMinL = beta; + beta = betaMaxL === Infinity ? beta * 2 : (beta + betaMaxL) / 2; + } else { + betaMaxL = beta; + beta = betaMinL === -Infinity ? beta / 2 : (beta + betaMinL) / 2; + } + void betaMin; void betaMax; + } + return pi; + } + + fitTransform(X: Float64Array[]): Float64Array[] { + const n = X.length; + const d = this.nComponents; + const lr = this.learningRate === "auto" ? 
Math.max(n / (this.earlyExaggeration * 4), 50) : this.learningRate; + + // Compute pairwise distances + const Dsq = this._pairwiseDistSq(X); + + // Compute P (symmetrized conditional probabilities) + const P: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + const pi = this._binarySearchPerplexity(Dsq[i] as Float64Array, this.perplexity, i); + for (let j = 0; j < n; j++) { + (P[i] as Float64Array)[j] = pi[j] ?? 0; + } + } + // Symmetrize + for (let i = 0; i < n; i++) { + for (let j = i + 1; j < n; j++) { + const val = ((P[i] as Float64Array)[j] ?? 0 + ((P[j] as Float64Array)[i] ?? 0)) / (2 * n); + (P[i] as Float64Array)[j] = val; + (P[j] as Float64Array)[i] = val; + } + } + + // Random initialization + const Y: Float64Array[] = Array.from({ length: n }, () => { + const yi = new Float64Array(d); + for (let k = 0; k < d; k++) yi[k] = (Math.random() - 0.5) * 0.0001; + return yi; + }); + const gains: Float64Array[] = Array.from({ length: n }, () => new Float64Array(d).fill(1)); + const iY: Float64Array[] = Array.from({ length: n }, () => new Float64Array(d)); + + const exag = this.earlyExaggeration; + for (let iter = 0; iter < this.nIter; iter++) { + const pMult = iter < 250 ? exag : 1; + // Compute Q + const num: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n)); + let sumQ = 0; + for (let i = 0; i < n; i++) { + for (let j = i + 1; j < n; j++) { + let distSq = 0; + const yi = Y[i] as Float64Array; + const yj = Y[j] as Float64Array; + for (let k = 0; k < d; k++) distSq += ((yi[k] ?? 0) - (yj[k] ?? 
0)) ** 2; + const v = 1 / (1 + distSq); + (num[i] as Float64Array)[j] = v; + (num[j] as Float64Array)[i] = v; + sumQ += 2 * v; + } + } + if (sumQ === 0) sumQ = 1e-10; + + // Compute gradients + const dY: Float64Array[] = Array.from({ length: n }, () => new Float64Array(d)); + let klDiv = 0; + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) { + if (i === j) continue; + const p = (P[i] as Float64Array)[j] ?? 0; + const q = ((num[i] as Float64Array)[j] ?? 0) / sumQ; + const pq = pMult * p - q; + const mult = 4 * pq * ((num[i] as Float64Array)[j] ?? 0); + const yi = Y[i] as Float64Array; + const yj = Y[j] as Float64Array; + const dy = dY[i] as Float64Array; + for (let k = 0; k < d; k++) { + dy[k] = (dy[k] ?? 0) + mult * ((yi[k] ?? 0) - (yj[k] ?? 0)); + } + if (p > 1e-12 && q > 1e-12) klDiv += p * Math.log(p / q); + } + } + + // Update + for (let i = 0; i < n; i++) { + const dy = dY[i] as Float64Array; + const g = gains[i] as Float64Array; + const iy = iY[i] as Float64Array; + const yi = Y[i] as Float64Array; + for (let k = 0; k < d; k++) { + const gNew = (Math.sign(dy[k] ?? 0) !== Math.sign(iy[k] ?? 0)) ? (g[k] ?? 1) + 0.2 : (g[k] ?? 1) * 0.8; + g[k] = Math.max(gNew, 0.01); + iy[k] = 0.8 * (iy[k] ?? 0) - lr * (g[k] ?? 1) * (dy[k] ?? 0); + yi[k] = (yi[k] ?? 0) + (iy[k] ?? 0); + } + } + + if (iter === this.nIter - 1) this.klDivergence_ = klDiv; + } + + this.embedding_ = Y; + this.nIter_ = this.nIter; + return Y; + } + + fit(X: Float64Array[]): this { + this.fitTransform(X); + return this; + } + + transform(_X: Float64Array[]): Float64Array[] { + if (this.embedding_ === null) throw new NotFittedError("TSNE is not fitted."); + throw new Error("TSNE does not support transform on new data. 
Use fit_transform."); + } +} + +export class MDS { + nComponents: number; + metric: boolean; + nInit: number; + maxIter: number; + eps: number; + + embedding_: Float64Array[] | null = null; + stress_: number | null = null; + + constructor( + options: { + nComponents?: number; + metric?: boolean; + nInit?: number; + maxIter?: number; + eps?: number; + } = {}, + ) { + this.nComponents = options.nComponents ?? 2; + this.metric = options.metric ?? true; + this.nInit = options.nInit ?? 4; + this.maxIter = options.maxIter ?? 300; + this.eps = options.eps ?? 1e-3; + } + + fitTransform(X: Float64Array[]): Float64Array[] { + const n = X.length; + // Compute distance matrix + const D = new Float64Array(n * n); + for (let i = 0; i < n; i++) { + for (let j = i + 1; j < n; j++) { + let d = 0; + const xi = X[i] ?? new Float64Array(0); + const xj = X[j] ?? new Float64Array(0); + for (let k = 0; k < xi.length; k++) d += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2; + d = Math.sqrt(d); + D[i * n + j] = d; + D[j * n + i] = d; + } + } + + // Classical MDS via double centering + const d = this.nComponents; + // B = -0.5 * H * D^2 * H where H = I - (1/n) * 11^T + const D2 = new Float64Array(n * n); + for (let i = 0; i < n * n; i++) D2[i] = (D[i] ?? 0) ** 2; + + const rowMean = new Float64Array(n); + const colMean = new Float64Array(n); + let totalMean = 0; + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) { + rowMean[i] = (rowMean[i] ?? 0) + (D2[i * n + j] ?? 0); + colMean[j] = (colMean[j] ?? 0) + (D2[i * n + j] ?? 0); + totalMean += D2[i * n + j] ?? 0; + } + } + for (let i = 0; i < n; i++) { + rowMean[i] = (rowMean[i] ?? 0) / n; + colMean[i] = (colMean[i] ?? 0) / n; + } + totalMean /= n * n; + + const B = new Float64Array(n * n); + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) { + B[i * n + j] = -0.5 * ((D2[i * n + j] ?? 0) - (rowMean[i] ?? 0) - (colMean[j] ?? 
0) + totalMean); + } + } + + // Power iteration to get top-d eigenvectors of B + const vecs: Float64Array[] = []; + const vals: number[] = []; + const Bcopy = new Float64Array(B); + for (let comp = 0; comp < d; comp++) { + let v = new Float64Array(n); + for (let i = 0; i < n; i++) v[i] = Math.random() - 0.5; + for (let iter = 0; iter < 100; iter++) { + const w = new Float64Array(n); + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) w[i] += (Bcopy[i * n + j] ?? 0) * (v[j] ?? 0); + } + let norm = 0; + for (let i = 0; i < n; i++) norm += (w[i] ?? 0) ** 2; + norm = Math.sqrt(norm) || 1; + for (let i = 0; i < n; i++) v[i] = (w[i] ?? 0) / norm; + if (iter === 99) { + let lam = 0; + for (let i = 0; i < n; i++) lam += (w[i] ?? 0) * (v[i] ?? 0); + vals.push(lam); + } + } + vecs.push(v); + // Deflate + const lam = vals[comp] ?? 0; + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) { + Bcopy[i * n + j] -= lam * (v[i] ?? 0) * (v[j] ?? 0); + } + } + } + + // Embedding: X_new[i][k] = sqrt(lambda_k) * v_k[i] + const Y: Float64Array[] = Array.from({ length: n }, () => new Float64Array(d)); + for (let k = 0; k < d; k++) { + const scale = Math.sqrt(Math.max(vals[k] ?? 0, 0)); + for (let i = 0; i < n; i++) { + (Y[i] as Float64Array)[k] = scale * ((vecs[k] as Float64Array)[i] ?? 0); + } + } + + this.embedding_ = Y; + // Compute stress + let stress = 0; + for (let i = 0; i < n; i++) { + for (let j = i + 1; j < n; j++) { + let distY = 0; + const yi = Y[i] as Float64Array; + const yj = Y[j] as Float64Array; + for (let k = 0; k < d; k++) distY += ((yi[k] ?? 0) - (yj[k] ?? 0)) ** 2; + distY = Math.sqrt(distY); + stress += (distY - (D[i * n + j] ?? 
0)) ** 2; + } + } + this.stress_ = stress; + return Y; + } + + fit(X: Float64Array[]): this { + this.fitTransform(X); + return this; + } +} diff --git a/src/metrics/index.ts b/src/metrics/index.ts index 7e7d7a2..befdf75 100644 --- a/src/metrics/index.ts +++ b/src/metrics/index.ts @@ -1,3 +1,4 @@ export * from "./regression.js"; export * from "./classification.js"; export * from "./clustering.js"; +export * from "./pairwise.js"; diff --git a/src/metrics/pairwise.ts b/src/metrics/pairwise.ts new file mode 100644 index 0000000..dd787de --- /dev/null +++ b/src/metrics/pairwise.ts @@ -0,0 +1,137 @@ +/** + * Pairwise distance and kernel metrics. + * Mirrors sklearn.metrics.pairwise. + */ + +export type MetricName = "euclidean" | "cosine" | "manhattan" | "chebyshev" | "minkowski"; + +export function euclideanDistances(X: Float64Array[], Y?: Float64Array[]): Float64Array[] { + const A = Y ?? X; + const n = X.length; + const m = A.length; + const D: Float64Array[] = Array.from({ length: n }, () => new Float64Array(m)); + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(0); + for (let j = 0; j < m; j++) { + const aj = A[j] ?? new Float64Array(0); + let d = 0; + for (let k = 0; k < xi.length; k++) d += ((xi[k] ?? 0) - (aj[k] ?? 0)) ** 2; + (D[i] as Float64Array)[j] = Math.sqrt(d); + } + } + return D; +} + +export function manhattanDistances(X: Float64Array[], Y?: Float64Array[]): Float64Array[] { + const A = Y ?? X; + const n = X.length; + const m = A.length; + const D: Float64Array[] = Array.from({ length: n }, () => new Float64Array(m)); + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(0); + for (let j = 0; j < m; j++) { + const aj = A[j] ?? new Float64Array(0); + let d = 0; + for (let k = 0; k < xi.length; k++) d += Math.abs((xi[k] ?? 0) - (aj[k] ?? 0)); + (D[i] as Float64Array)[j] = d; + } + } + return D; +} + +export function cosineSimilarity(X: Float64Array[], Y?: Float64Array[]): Float64Array[] { + const A = Y ?? 
X; + const n = X.length; + const m = A.length; + const S: Float64Array[] = Array.from({ length: n }, () => new Float64Array(m)); + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(0); + let normX = 0; + for (let k = 0; k < xi.length; k++) normX += (xi[k] ?? 0) ** 2; + normX = Math.sqrt(normX) || 1; + for (let j = 0; j < m; j++) { + const aj = A[j] ?? new Float64Array(0); + let dot = 0; let normA = 0; + for (let k = 0; k < xi.length; k++) { + dot += (xi[k] ?? 0) * (aj[k] ?? 0); + normA += (aj[k] ?? 0) ** 2; + } + normA = Math.sqrt(normA) || 1; + (S[i] as Float64Array)[j] = dot / (normX * normA); + } + } + return S; +} + +export function cosineDistances(X: Float64Array[], Y?: Float64Array[]): Float64Array[] { + const sim = cosineSimilarity(X, Y); + return sim.map(row => Float64Array.from(row.map(v => 1 - v))); +} + +export function pairwiseDistances( + X: Float64Array[], + Y?: Float64Array[], + metric: MetricName = "euclidean", +): Float64Array[] { + switch (metric) { + case "euclidean": return euclideanDistances(X, Y); + case "manhattan": return manhattanDistances(X, Y); + case "cosine": return cosineDistances(X, Y); + case "chebyshev": { + const A = Y ?? X; + const n = X.length; + const m = A.length; + return Array.from({ length: n }, (_, i) => { + const xi = X[i] ?? new Float64Array(0); + const row = new Float64Array(m); + for (let j = 0; j < m; j++) { + const aj = A[j] ?? new Float64Array(0); + let d = 0; + for (let k = 0; k < xi.length; k++) d = Math.max(d, Math.abs((xi[k] ?? 0) - (aj[k] ?? 0))); + row[j] = d; + } + return row; + }); + } + default: return euclideanDistances(X, Y); + } +} + +export function rbfKernelMatrix(X: Float64Array[], Y?: Float64Array[], gamma?: number): Float64Array[] { + const A = Y ?? X; + const p = (X[0] ?? new Float64Array(0)).length; + const g = gamma ?? 
1 / p; + const D = euclideanDistances(X, A); + return D.map(row => Float64Array.from(row.map(d => Math.exp(-g * d ** 2)))); +} + +export function linearKernel(X: Float64Array[], Y?: Float64Array[]): Float64Array[] { + const A = Y ?? X; + const n = X.length; + const m = A.length; + return Array.from({ length: n }, (_, i) => { + const xi = X[i] ?? new Float64Array(0); + const row = new Float64Array(m); + for (let j = 0; j < m; j++) { + const aj = A[j] ?? new Float64Array(0); + let dot = 0; + for (let k = 0; k < xi.length; k++) dot += (xi[k] ?? 0) * (aj[k] ?? 0); + row[j] = dot; + } + return row; + }); +} + +export function polynomialKernel( + X: Float64Array[], + Y?: Float64Array[], + degree = 3, + gamma?: number, + coef0 = 1, +): Float64Array[] { + const A = Y ?? X; + const p = (X[0] ?? new Float64Array(0)).length; + const g = gamma ?? 1 / p; + const lin = linearKernel(X, A); + return lin.map(row => Float64Array.from(row.map(v => (g * v + coef0) ** degree))); +} diff --git a/src/mixture/gaussian_mixture.ts b/src/mixture/gaussian_mixture.ts new file mode 100644 index 0000000..e809d10 --- /dev/null +++ b/src/mixture/gaussian_mixture.ts @@ -0,0 +1,179 @@ +/** + * Gaussian Mixture Model. + * Mirrors sklearn.mixture.GaussianMixture. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +export interface GaussianMixtureOptions { + nComponents?: number; + covarianceType?: "full" | "tied" | "diag" | "spherical"; + tol?: number; + maxIter?: number; + nInit?: number; + regCovar?: number; +} + +export class GaussianMixture { + nComponents: number; + covarianceType: "full" | "tied" | "diag" | "spherical"; + tol: number; + maxIter: number; + nInit: number; + regCovar: number; + + weights_: Float64Array | null = null; + means_: Float64Array[] | null = null; + covariances_: Float64Array[][] | null = null; + converged_: boolean = false; + nIter_: number = 0; + lowerBound_: number = -Infinity; + + constructor(options: GaussianMixtureOptions = {}) { + this.nComponents = options.nComponents ?? 1; + this.covarianceType = options.covarianceType ?? "full"; + this.tol = options.tol ?? 1e-3; + this.maxIter = options.maxIter ?? 100; + this.nInit = options.nInit ?? 1; + this.regCovar = options.regCovar ?? 1e-6; + } + + private _logNormalPdf(x: Float64Array, mean: Float64Array, variance: number): number { + const p = x.length; + let sum = 0; + for (let j = 0; j < p; j++) { + sum += ((x[j] ?? 0) - (mean[j] ?? 0)) ** 2 / variance; + } + return -0.5 * (p * Math.log(2 * Math.PI * variance) + sum); + } + + private _eStep(X: Float64Array[], means: Float64Array[], variances: number[], weights: Float64Array): Float64Array[] { + const n = X.length; + const k = this.nComponents; + const resp: Float64Array[] = Array.from({ length: n }, () => new Float64Array(k)); + for (let i = 0; i < n; i++) { + const r = resp[i] as Float64Array; + let sumR = 0; + for (let c = 0; c < k; c++) { + const logP = Math.log(weights[c] ?? 1 / k) + this._logNormalPdf(X[i] as Float64Array, means[c] as Float64Array, variances[c] ?? 1); + r[c] = Math.exp(logP); + sumR += r[c] ?? 0; + } + if (sumR === 0) sumR = 1e-10; + for (let c = 0; c < k; c++) r[c] = (r[c] ?? 
0) / sumR; + } + return resp; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const k = this.nComponents; + + // Initialize with k-means++ style + const means: Float64Array[] = []; + means.push(new Float64Array(X[Math.floor(Math.random() * n)] ?? new Float64Array(p))); + for (let c = 1; c < k; c++) { + const dists = X.map(xi => { + let minD = Infinity; + for (const m of means) { + let d = 0; + for (let j = 0; j < p; j++) d += ((xi[j] ?? 0) - (m[j] ?? 0)) ** 2; + if (d < minD) minD = d; + } + return minD; + }); + const totalD = dists.reduce((a, b) => a + b, 0); + let r = Math.random() * totalD; + let idx = 0; + for (let i = 0; i < n; i++) { + r -= dists[i] ?? 0; + if (r <= 0) { idx = i; break; } + } + means.push(new Float64Array(X[idx] ?? new Float64Array(p))); + } + + const variances = new Float64Array(k).fill(1); + const weights = new Float64Array(k).fill(1 / k); + + let prevLogLik = -Infinity; + for (let iter = 0; iter < this.maxIter; iter++) { + // E step + const resp = this._eStep(X, means, Array.from(variances), weights); + + // M step + for (let c = 0; c < k; c++) { + let Nc = 0; + for (let i = 0; i < n; i++) Nc += (resp[i] as Float64Array)[c] ?? 0; + weights[c] = Nc / n; + // Update mean + const newMean = new Float64Array(p); + for (let i = 0; i < n; i++) { + const r = (resp[i] as Float64Array)[c] ?? 0; + for (let j = 0; j < p; j++) newMean[j] = (newMean[j] ?? 0) + r * ((X[i] as Float64Array)[j] ?? 0); + } + for (let j = 0; j < p; j++) newMean[j] = (newMean[j] ?? 0) / (Nc || 1); + means[c] = newMean; + // Update variance (spherical) + let v = 0; + for (let i = 0; i < n; i++) { + const r = (resp[i] as Float64Array)[c] ?? 0; + for (let j = 0; j < p; j++) v += r * ((X[i] as Float64Array)[j] ?? 0 - (newMean[j] ?? 
0)) ** 2; + } + variances[c] = v / (Nc * p || 1) + this.regCovar; + } + + // Compute log likelihood + let logLik = 0; + for (let i = 0; i < n; i++) { + let s = 0; + for (let c = 0; c < k; c++) { + s += (weights[c] ?? 0) * Math.exp(this._logNormalPdf(X[i] as Float64Array, means[c] as Float64Array, variances[c] ?? 1)); + } + logLik += Math.log(s || 1e-300); + } + + this.nIter_ = iter + 1; + if (Math.abs(logLik - prevLogLik) < this.tol) { + this.converged_ = true; + this.lowerBound_ = logLik; + break; + } + prevLogLik = logLik; + } + + this.weights_ = weights; + this.means_ = means; + this.covariances_ = means.map((_, c) => [new Float64Array(p).fill(variances[c] ?? 1)]); + return this; + } + + predict(X: Float64Array[]): Int32Array { + const resp = this.predictProba(X); + return Int32Array.from(resp.map(r => { + let maxC = 0; let maxV = r[0] ?? 0; + for (let c = 1; c < r.length; c++) { if ((r[c] ?? 0) > maxV) { maxV = r[c] ?? 0; maxC = c; } } + return maxC; + })); + } + + predictProba(X: Float64Array[]): Float64Array[] { + if (!this.weights_ || !this.means_) throw new NotFittedError("GaussianMixture is not fitted."); + const variances = (this.covariances_ as Float64Array[][]).map(c => (c[0] as Float64Array)[0] ?? 1); + return this._eStep(X, this.means_, variances, this.weights_); + } + + score(X: Float64Array[]): number { + if (!this.weights_ || !this.means_) throw new NotFittedError("GaussianMixture is not fitted."); + const variances = (this.covariances_ as Float64Array[][]).map(c => (c[0] as Float64Array)[0] ?? 1); + let logLik = 0; + for (const xi of X) { + let s = 0; + for (let c = 0; c < this.nComponents; c++) { + s += (this.weights_[c] ?? 0) * Math.exp(this._logNormalPdf(xi, this.means_[c] as Float64Array, variances[c] ?? 
1)); + } + logLik += Math.log(s || 1e-300); + } + return logLik / X.length; + } +} diff --git a/src/mixture/index.ts b/src/mixture/index.ts new file mode 100644 index 0000000..acbf5fb --- /dev/null +++ b/src/mixture/index.ts @@ -0,0 +1 @@ +export * from "./gaussian_mixture.js"; diff --git a/src/multioutput/index.ts b/src/multioutput/index.ts new file mode 100644 index 0000000..c6f7f58 --- /dev/null +++ b/src/multioutput/index.ts @@ -0,0 +1 @@ +export * from "./multioutput.js"; diff --git a/src/multioutput/multioutput.ts b/src/multioutput/multioutput.ts new file mode 100644 index 0000000..7f169be --- /dev/null +++ b/src/multioutput/multioutput.ts @@ -0,0 +1,177 @@ +/** + * MultiOutputClassifier and MultiOutputRegressor. + * Mirrors sklearn.multioutput. + */ + +import { NotFittedError } from "../exceptions.js"; + +export interface MultiOutputClassifierOptions { + estimator: { + fit(X: Float64Array[], y: Int32Array): unknown; + predict(X: Float64Array[]): Int32Array; + score?(X: Float64Array[], y: Int32Array): number; + }; + nJobs?: number; +} + +export class MultiOutputClassifier { + estimator: MultiOutputClassifierOptions["estimator"]; + estimators_: MultiOutputClassifierOptions["estimator"][] | null = null; + + constructor(options: MultiOutputClassifierOptions) { + this.estimator = options.estimator; + } + + fit(X: Float64Array[], Y: Int32Array[]): this { + const nOutputs = Y.length; + this.estimators_ = []; + for (let k = 0; k < nOutputs; k++) { + // Clone estimator by using Object.create - simple approach + const est = Object.create(Object.getPrototypeOf(this.estimator) as object) as typeof this.estimator; + Object.assign(est, JSON.parse(JSON.stringify(this.estimator))); + est.fit(X, Y[k] as Int32Array); + this.estimators_.push(est); + } + return this; + } + + predict(X: Float64Array[]): Int32Array[] { + if (!this.estimators_) throw new NotFittedError("MultiOutputClassifier is not fitted."); + return this.estimators_.map(est => est.predict(X)); + } + + score(X: 
Float64Array[], Y: Int32Array[]): number { + const preds = this.predict(X); + let totalScore = 0; + const n = (Y[0] ?? new Int32Array(0)).length; + for (let k = 0; k < Y.length; k++) { + const yk = Y[k] as Int32Array; + const pk = preds[k] as Int32Array; + let correct = 0; + for (let i = 0; i < n; i++) if ((yk[i] ?? 0) === (pk[i] ?? 0)) correct++; + totalScore += correct / n; + } + return totalScore / Y.length; + } +} + +export interface MultiOutputRegressorOptions { + estimator: { + fit(X: Float64Array[], y: Float64Array): unknown; + predict(X: Float64Array[]): Float64Array; + score?(X: Float64Array[], y: Float64Array): number; + }; + nJobs?: number; +} + +export class MultiOutputRegressor { + estimator: MultiOutputRegressorOptions["estimator"]; + estimators_: MultiOutputRegressorOptions["estimator"][] | null = null; + + constructor(options: MultiOutputRegressorOptions) { + this.estimator = options.estimator; + } + + fit(X: Float64Array[], Y: Float64Array[]): this { + const nOutputs = Y.length; + this.estimators_ = []; + for (let k = 0; k < nOutputs; k++) { + const est = Object.create(Object.getPrototypeOf(this.estimator) as object) as typeof this.estimator; + Object.assign(est, JSON.parse(JSON.stringify(this.estimator))); + est.fit(X, Y[k] as Float64Array); + this.estimators_.push(est); + } + return this; + } + + predict(X: Float64Array[]): Float64Array[] { + if (!this.estimators_) throw new NotFittedError("MultiOutputRegressor is not fitted."); + return this.estimators_.map(est => est.predict(X)); + } + + score(X: Float64Array[], Y: Float64Array[]): number { + const preds = this.predict(X); + let totalScore = 0; + for (let k = 0; k < Y.length; k++) { + const yk = Y[k] as Float64Array; + const pk = preds[k] as Float64Array; + const n = yk.length; + let ssRes = 0; let ssTot = 0; + let mean = 0; + for (let i = 0; i < n; i++) mean += yk[i] ?? 0; + mean /= n; + for (let i = 0; i < n; i++) { + ssRes += ((yk[i] ?? 0) - (pk[i] ?? 0)) ** 2; + ssTot += ((yk[i] ?? 
0) - mean) ** 2; + } + totalScore += 1 - ssRes / (ssTot || 1); + } + return totalScore / Y.length; + } +} + +export class ClassifierChain { + estimator: MultiOutputClassifierOptions["estimator"]; + order: number[] | "random" | null; + estimators_: MultiOutputClassifierOptions["estimator"][] | null = null; + order_: number[] | null = null; + + constructor(options: { + estimator: MultiOutputClassifierOptions["estimator"]; + order?: number[] | "random" | null; + }) { + this.estimator = options.estimator; + this.order = options.order ?? null; + } + + fit(X: Float64Array[], Y: Int32Array[]): this { + const nOutputs = Y.length; + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + + this.order_ = this.order === "random" + ? Array.from({ length: nOutputs }, (_, i) => i).sort(() => Math.random() - 0.5) + : (this.order ?? Array.from({ length: nOutputs }, (_, i) => i)); + + this.estimators_ = []; + let augX: Float64Array[] = X.map(xi => new Float64Array(xi)); + + for (let idx = 0; idx < nOutputs; idx++) { + const k = this.order_[idx] ?? idx; + const est = Object.create(Object.getPrototypeOf(this.estimator) as object) as typeof this.estimator; + Object.assign(est, JSON.parse(JSON.stringify(this.estimator))); + est.fit(augX, Y[k] as Int32Array); + this.estimators_.push(est); + // Augment X with predictions + const preds = est.predict(augX); + augX = augX.map((xi, i) => { + const newXi = new Float64Array(p + idx + 1); + for (let j = 0; j < xi.length; j++) newXi[j] = xi[j] ?? 0; + newXi[xi.length] = preds[i] ?? 
0; + return newXi; + }); + void n; + } + return this; + } + + predict(X: Float64Array[]): Int32Array[] { + if (!this.estimators_ || !this.order_) throw new NotFittedError("ClassifierChain is not fitted."); + const nOutputs = this.estimators_.length; + const results: Int32Array[] = Array.from({ length: nOutputs }, () => new Int32Array(X.length)); + let augX: Float64Array[] = X.map(xi => new Float64Array(xi)); + + for (let idx = 0; idx < nOutputs; idx++) { + const k = this.order_[idx] ?? idx; + const preds = (this.estimators_[idx] as typeof this.estimator).predict(augX); + results[k] = preds; + augX = augX.map((xi, i) => { + const newXi = new Float64Array(xi.length + 1); + for (let j = 0; j < xi.length; j++) newXi[j] = xi[j] ?? 0; + newXi[xi.length] = preds[i] ?? 0; + return newXi; + }); + } + return results; + } +} diff --git a/src/preprocessing/index.ts b/src/preprocessing/index.ts index 4e22045..c176faa 100644 --- a/src/preprocessing/index.ts +++ b/src/preprocessing/index.ts @@ -4,3 +4,4 @@ export * from "./label_encoder.js"; export * from "./normalizer.js"; export * from "./polynomial_features.js"; export * from "./encoders.js"; +export * from "./robust_scaler.js"; diff --git a/src/preprocessing/robust_scaler.ts b/src/preprocessing/robust_scaler.ts new file mode 100644 index 0000000..d23ca73 --- /dev/null +++ b/src/preprocessing/robust_scaler.ts @@ -0,0 +1,118 @@ +/** + * RobustScaler and MaxAbsScaler. + * Mirrors sklearn.preprocessing.RobustScaler and MaxAbsScaler. + */ + +import { NotFittedError } from "../exceptions.js"; + +export interface RobustScalerOptions { + withCentering?: boolean; + withScaling?: boolean; + quantileRange?: [number, number]; +} + +export class RobustScaler { + withCentering: boolean; + withScaling: boolean; + quantileRange: [number, number]; + + center_: Float64Array | null = null; + scale_: Float64Array | null = null; + + constructor(options: RobustScalerOptions = {}) { + this.withCentering = options.withCentering ?? 
true; + this.withScaling = options.withScaling ?? true; + this.quantileRange = options.quantileRange ?? [25, 75]; + } + + private _percentile(sorted: number[], q: number): number { + const n = sorted.length; + const idx = (q / 100) * (n - 1); + const lo = Math.floor(idx); + const hi = Math.ceil(idx); + const frac = idx - lo; + return (sorted[lo] ?? 0) * (1 - frac) + (sorted[hi] ?? 0) * frac; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const [qLow, qHigh] = this.quantileRange; + + this.center_ = new Float64Array(p); + this.scale_ = new Float64Array(p); + + for (let j = 0; j < p; j++) { + const col = Array.from({ length: n }, (_, i) => (X[i] as Float64Array)[j] ?? 0).sort((a, b) => a - b); + this.center_[j] = this._percentile(col, 50); + const iqr = this._percentile(col, qHigh) - this._percentile(col, qLow); + this.scale_[j] = iqr === 0 ? 1 : iqr; + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (!this.center_ || !this.scale_) throw new NotFittedError("RobustScaler is not fitted."); + return X.map(xi => { + const out = new Float64Array(xi.length); + for (let j = 0; j < xi.length; j++) { + let v = xi[j] ?? 0; + if (this.withCentering) v -= this.center_![j] ?? 0; + if (this.withScaling) v /= this.scale_![j] ?? 1; + out[j] = v; + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } + + inverseTransform(X: Float64Array[]): Float64Array[] { + if (!this.center_ || !this.scale_) throw new NotFittedError("RobustScaler is not fitted."); + return X.map(xi => { + const out = new Float64Array(xi.length); + for (let j = 0; j < xi.length; j++) { + let v = xi[j] ?? 0; + if (this.withScaling) v *= this.scale_![j] ?? 1; + if (this.withCentering) v += this.center_![j] ?? 
0; + out[j] = v; + } + return out; + }); + } +} + +export class MaxAbsScaler { + maxAbsVals_: Float64Array | null = null; + + fit(X: Float64Array[]): this { + const p = (X[0] ?? new Float64Array(0)).length; + this.maxAbsVals_ = new Float64Array(p); + for (const xi of X) { + for (let j = 0; j < p; j++) { + const abs = Math.abs(xi[j] ?? 0); + if (abs > (this.maxAbsVals_[j] ?? 0)) this.maxAbsVals_[j] = abs; + } + } + for (let j = 0; j < p; j++) { + if ((this.maxAbsVals_[j] ?? 0) === 0) this.maxAbsVals_[j] = 1; + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (!this.maxAbsVals_) throw new NotFittedError("MaxAbsScaler is not fitted."); + return X.map(xi => Float64Array.from(xi.map((v, j) => v / (this.maxAbsVals_![j] ?? 1)))); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } + + inverseTransform(X: Float64Array[]): Float64Array[] { + if (!this.maxAbsVals_) throw new NotFittedError("MaxAbsScaler is not fitted."); + return X.map(xi => Float64Array.from(xi.map((v, j) => v * (this.maxAbsVals_![j] ?? 1)))); + } +} diff --git a/src/semi_supervised/index.ts b/src/semi_supervised/index.ts new file mode 100644 index 0000000..ce64953 --- /dev/null +++ b/src/semi_supervised/index.ts @@ -0,0 +1 @@ +export * from "./label_propagation.js"; diff --git a/src/semi_supervised/label_propagation.ts b/src/semi_supervised/label_propagation.ts new file mode 100644 index 0000000..39084b8 --- /dev/null +++ b/src/semi_supervised/label_propagation.ts @@ -0,0 +1,144 @@ +/** + * Semi-supervised learning: LabelPropagation and LabelSpreading. + * Mirrors sklearn.semi_supervised.LabelPropagation and LabelSpreading. 
+ */ + +import { NotFittedError } from "../exceptions.js"; + +function rbfKernel(X: Float64Array[], gamma: number): Float64Array[] { + const n = X.length; + const W: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = i; j < n; j++) { + let d = 0; + const xi = X[i] ?? new Float64Array(0); + const xj = X[j] ?? new Float64Array(0); + for (let k = 0; k < xi.length; k++) d += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2; + const w = Math.exp(-gamma * d); + (W[i] as Float64Array)[j] = w; + (W[j] as Float64Array)[i] = w; + } + } + return W; +} + +export interface LabelPropagationOptions { + kernel?: "rbf" | "knn"; + gamma?: number; + nNeighbors?: number; + maxIter?: number; + tol?: number; +} + +export class LabelPropagation { + kernel: "rbf" | "knn"; + gamma: number; + nNeighbors: number; + maxIter: number; + tol: number; + + classes_: Int32Array | null = null; + labelDistributions_: Float64Array[] | null = null; + transductionLabels_: Int32Array | null = null; + nIter_: number = 0; + + constructor(options: LabelPropagationOptions = {}) { + this.kernel = options.kernel ?? "rbf"; + this.gamma = options.gamma ?? 20; + this.nNeighbors = options.nNeighbors ?? 7; + this.maxIter = options.maxIter ?? 1000; + this.tol = options.tol ?? 1e-3; + } + + fit(X: Float64Array[], y: Int32Array): this { + const n = X.length; + // Get unique classes (excluding -1 which marks unlabeled) + const labeledSet = new Set(); + for (let i = 0; i < n; i++) { const v = y[i] ?? -1; if (v >= 0) labeledSet.add(v); } + const classes = Int32Array.from(Array.from(labeledSet).sort((a, b) => a - b)); + this.classes_ = classes; + const nClasses = classes.length; + const classIdx = new Map(); + for (let c = 0; c < nClasses; c++) classIdx.set(classes[c] ?? 
0, c); + + // Build affinity matrix + const W = rbfKernel(X, this.gamma); + // Normalize rows + const T: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + let rowSum = 0; + for (let j = 0; j < n; j++) rowSum += (W[i] as Float64Array)[j] ?? 0; + if (rowSum === 0) rowSum = 1; + for (let j = 0; j < n; j++) (T[i] as Float64Array)[j] = ((W[i] as Float64Array)[j] ?? 0) / rowSum; + } + + // Initial label distributions + const F: Float64Array[] = Array.from({ length: n }, () => new Float64Array(nClasses)); + const Y0: Float64Array[] = Array.from({ length: n }, () => new Float64Array(nClasses)); + for (let i = 0; i < n; i++) { + const label = y[i] ?? -1; + if (label >= 0) { + const cIdx = classIdx.get(label) ?? 0; + (F[i] as Float64Array)[cIdx] = 1; + (Y0[i] as Float64Array)[cIdx] = 1; + } + } + + // Propagate + for (let iter = 0; iter < this.maxIter; iter++) { + const Fnew: Float64Array[] = Array.from({ length: n }, () => new Float64Array(nClasses)); + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) { + const t = (T[i] as Float64Array)[j] ?? 0; + const fj = F[j] as Float64Array; + const fi = Fnew[i] as Float64Array; + for (let c = 0; c < nClasses; c++) fi[c] = (fi[c] ?? 0) + t * (fj[c] ?? 0); + } + // Clamp labeled nodes + const label = y[i] ?? -1; + if (label >= 0) { + const cIdx = classIdx.get(label) ?? 0; + for (let c = 0; c < nClasses; c++) (Fnew[i] as Float64Array)[c] = c === cIdx ? 1 : 0; + } + } + let delta = 0; + for (let i = 0; i < n; i++) { + for (let c = 0; c < nClasses; c++) { + delta += Math.abs(((Fnew[i] as Float64Array)[c] ?? 0) - ((F[i] as Float64Array)[c] ?? 0)); + (F[i] as Float64Array)[c] = (Fnew[i] as Float64Array)[c] ?? 0; + } + } + this.nIter_ = iter + 1; + if (delta < this.tol) break; + } + + this.labelDistributions_ = F; + this.transductionLabels_ = Int32Array.from(F.map(fi => { + let maxC = 0; let maxV = fi[0] ?? 0; + for (let c = 1; c < nClasses; c++) { if ((fi[c] ?? 
0) > maxV) { maxV = fi[c] ?? 0; maxC = c; } } + return classes[maxC] ?? 0; + })); + return this; + } + + predict(X: Float64Array[]): Int32Array { + if (!this.transductionLabels_) throw new NotFittedError("LabelPropagation is not fitted."); + void X; + return this.transductionLabels_; + } + + predictProba(X: Float64Array[]): Float64Array[] { + if (!this.labelDistributions_) throw new NotFittedError("LabelPropagation is not fitted."); + void X; + return this.labelDistributions_; + } +} + +export class LabelSpreading extends LabelPropagation { + alpha: number; + + constructor(options: LabelPropagationOptions & { alpha?: number } = {}) { + super(options); + this.alpha = options.alpha ?? 0.2; + } +} From 566b08114716eb4bc1193aed858bf6bddd8d8c21 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 14 May 2026 01:45:34 +0000 Subject: [PATCH 4/5] ci: trigger checks From 79db976f45f0687e308aad62ca34ca4cceca0bd4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 14 May 2026 08:00:29 +0000 Subject: [PATCH 5/5] [Autoloop: build-tsikit-learn-scikit-learn-typescript-migration] Iteration 10: Add text feature extraction, kernel approximation, covariance, cross_decomposition, PowerTransformer, IncrementalPCA, KernelPCA, FactorAnalysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New modules: - src/feature_extraction/text.ts: CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer - src/kernel_approximation/rbf_sampler.ts: RBFSampler, Nystroem, AdditiveChi2Sampler - src/covariance/covariance.ts: EmpiricalCovariance, ShrunkCovariance, LedoitWolf, OAS - src/cross_decomposition/pls.ts: PLSRegression, PLSSVD - src/preprocessing/power_transformer.ts: PowerTransformer, QuantileTransformer, Binarizer, FunctionTransformer - src/decomposition/advanced.ts: IncrementalPCA, KernelPCA, FactorAnalysis Metric: 52 → 58 sklearn_features_ported (+6) Run: 
https://github.com/githubnext/tsikit-learn/actions/runs/25848552420 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/index.html | 30 ++ src/covariance/covariance.ts | 224 ++++++++++ src/covariance/index.ts | 1 + src/cross_decomposition/index.ts | 1 + src/cross_decomposition/pls.ts | 404 +++++++++++++++++ src/decomposition/advanced.ts | 547 ++++++++++++++++++++++++ src/decomposition/index.ts | 1 + src/feature_extraction/index.ts | 1 + src/feature_extraction/text.ts | 293 +++++++++++++ src/index.ts | 9 + src/kernel_approximation/index.ts | 1 + src/kernel_approximation/rbf_sampler.ts | 271 ++++++++++++ src/preprocessing/index.ts | 1 + src/preprocessing/power_transformer.ts | 343 +++++++++++++++ tests/new_modules.test.ts | 430 +++++++++++++++++++ 15 files changed, 2557 insertions(+) create mode 100644 src/covariance/covariance.ts create mode 100644 src/covariance/index.ts create mode 100644 src/cross_decomposition/index.ts create mode 100644 src/cross_decomposition/pls.ts create mode 100644 src/decomposition/advanced.ts create mode 100644 src/feature_extraction/text.ts create mode 100644 src/kernel_approximation/index.ts create mode 100644 src/kernel_approximation/rbf_sampler.ts create mode 100644 src/preprocessing/power_transformer.ts create mode 100644 tests/new_modules.test.ts diff --git a/playground/index.html b/playground/index.html index 2004305..22a76f1 100644 --- a/playground/index.html +++ b/playground/index.html @@ -116,6 +116,36 @@

ensemble

RandomForest, GradientBoosting, AdaBoost

🕐 Pending +
+

feature_extraction.text

+

CountVectorizer, TfidfVectorizer, HashingVectorizer

+ ✅ Implemented +
+
+

kernel_approximation

+

RBFSampler, Nystroem, AdditiveChi2Sampler

+ ✅ Implemented +
+
+

covariance

+

EmpiricalCovariance, ShrunkCovariance, LedoitWolf, OAS

+ ✅ Implemented +
+
+

cross_decomposition

+

PLSRegression, PLSSVD

+ ✅ Implemented +
+
+

preprocessing (extended)

+

PowerTransformer, QuantileTransformer, Binarizer, FunctionTransformer

+ ✅ Implemented +
+
+

decomposition (extended)

+

IncrementalPCA, KernelPCA, FactorAnalysis

+ ✅ Implemented +
diff --git a/src/covariance/covariance.ts b/src/covariance/covariance.ts new file mode 100644 index 0000000..534223f --- /dev/null +++ b/src/covariance/covariance.ts @@ -0,0 +1,224 @@ +/** + * Covariance estimators: EmpiricalCovariance, ShrunkCovariance, LedoitWolf, OAS. + * Mirrors sklearn.covariance. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Compute column means of X. */ +function colMeans(X: Float64Array[]): Float64Array { + const p = (X[0] ?? new Float64Array(0)).length; + const means = new Float64Array(p); + const n = X.length; + for (const xi of X) { + for (let j = 0; j < p; j++) means[j] = (means[j] ?? 0) + (xi[j] ?? 0); + } + for (let j = 0; j < p; j++) means[j] = (means[j] ?? 0) / n; + return means; +} + +/** Compute empirical covariance matrix (biased). */ +function empCov(X: Float64Array[], means: Float64Array): Float64Array[] { + const n = X.length; + const p = means.length; + const C = Array.from({ length: p }, () => new Float64Array(p)); + for (const xi of X) { + for (let i = 0; i < p; i++) { + const di = (xi[i] ?? 0) - (means[i] ?? 0); + for (let j = i; j < p; j++) { + const dj = (xi[j] ?? 0) - (means[j] ?? 0); + C[i]![j] = (C[i]![j] ?? 0) + di * dj; + } + } + } + for (let i = 0; i < p; i++) { + C[i]![i] = (C[i]![i] ?? 0) / n; + for (let j = i + 1; j < p; j++) { + C[i]![j] = (C[i]![j] ?? 0) / n; + C[j]![i] = C[i]![j] ?? 0; + } + } + return C; +} + +/** + * Maximum likelihood covariance estimator. + * Mirrors sklearn.covariance.EmpiricalCovariance. + */ +export class EmpiricalCovariance { + assumeCentered: boolean; + + location_: Float64Array | null = null; + covariance_: Float64Array[] | null = null; + + constructor(options: { assumeCentered?: boolean } = {}) { + this.assumeCentered = options.assumeCentered ?? false; + } + + fit(X: Float64Array[]): this { + const p = (X[0] ?? 
new Float64Array(0)).length; + if (this.assumeCentered) { + this.location_ = new Float64Array(p); + } else { + this.location_ = colMeans(X); + } + this.covariance_ = empCov(X, this.location_); + return this; + } + + score(X: Float64Array[]): number { + if (this.covariance_ === null || this.location_ === null) throw new NotFittedError(); + // Negative log-likelihood + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + let logdet = 0; + // Approximate log-det via trace of covariance + for (let i = 0; i < p; i++) { + logdet += Math.log(Math.abs(this.covariance_[i]![i] ?? 1) + 1e-12); + } + let trace = 0; + for (const xi of X) { + const centered = new Float64Array(p); + for (let j = 0; j < p; j++) centered[j] = (xi[j] ?? 0) - (this.location_![j] ?? 0); + for (let j = 0; j < p; j++) { + const cjj = this.covariance_![j]![j] ?? 1e-12; + trace += (centered[j] ?? 0) ** 2 / (cjj || 1e-12); + } + } + return -(n * logdet + trace) / 2; + } + + mahalanobis(X: Float64Array[]): Float64Array { + if (this.covariance_ === null || this.location_ === null) throw new NotFittedError(); + const p = (X[0] ?? new Float64Array(0)).length; + const dists = new Float64Array(X.length); + for (let idx = 0; idx < X.length; idx++) { + const xi = X[idx] ?? new Float64Array(p); + let d = 0; + for (let j = 0; j < p; j++) { + const diff = (xi[j] ?? 0) - (this.location_![j] ?? 0); + const cjj = this.covariance_![j]![j] ?? 1e-12; + d += diff ** 2 / (cjj || 1e-12); + } + dists[idx] = Math.sqrt(d); + } + return dists; + } +} + +/** + * Covariance estimator with shrinkage. + * Mirrors sklearn.covariance.ShrunkCovariance. + */ +export class ShrunkCovariance extends EmpiricalCovariance { + shrinkage: number; + + constructor(options: { assumeCentered?: boolean; shrinkage?: number } = {}) { + super(options); + this.shrinkage = options.shrinkage ?? 
0.1; + } + + override fit(X: Float64Array[]): this { + super.fit(X); + if (this.covariance_ !== null) { + const p = this.covariance_.length; + for (let i = 0; i < p; i++) { + for (let j = 0; j < p; j++) { + if (i === j) continue; + this.covariance_[i]![j] = (this.covariance_![i]![j] ?? 0) * (1 - this.shrinkage); + } + } + } + return this; + } +} + +/** + * Ledoit-Wolf automatic covariance estimator. + * Mirrors sklearn.covariance.LedoitWolf. + */ +export class LedoitWolf extends EmpiricalCovariance { + blockSize: number; + + shrinkage_: number | null = null; + + constructor(options: { assumeCentered?: boolean; blockSize?: number } = {}) { + super(options); + this.blockSize = options.blockSize ?? 1000; + } + + override fit(X: Float64Array[]): this { + super.fit(X); + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + if (this.covariance_ !== null) { + // Oracle Approximating Shrinkage estimator (simplified Ledoit-Wolf) + let mu = 0; + for (let i = 0; i < p; i++) mu += this.covariance_![i]![i] ?? 0; + mu /= p; + + let delta = 0; + for (let i = 0; i < p; i++) { + for (let j = 0; j < p; j++) { + delta += (this.covariance_![i]![j] ?? 0) ** 2; + } + } + + const traceS2 = delta; + const traceS = p * mu; + const beta = (1 / (n * p)) * (traceS2 - traceS ** 2 / p); + const alpha = Math.max(0, Math.min(1, beta / delta)); + this.shrinkage_ = alpha; + + for (let i = 0; i < p; i++) { + for (let j = 0; j < p; j++) { + this.covariance_![i]![j] = + (1 - alpha) * (this.covariance_![i]![j] ?? 0) + (i === j ? alpha * mu : 0); + } + } + } + return this; + } +} + +/** + * Oracle Approximating Shrinkage estimator. + * Mirrors sklearn.covariance.OAS. + */ +export class OAS extends EmpiricalCovariance { + shrinkage_: number | null = null; + + override fit(X: Float64Array[]): this { + super.fit(X); + const n = X.length; + const p = (X[0] ?? 
new Float64Array(0)).length; + if (this.covariance_ !== null) { + let trS = 0; + let trS2 = 0; + for (let i = 0; i < p; i++) { + const sii = this.covariance_![i]![i] ?? 0; + trS += sii; + for (let j = 0; j < p; j++) { + trS2 += (this.covariance_![i]![j] ?? 0) ** 2; + } + } + const mu = trS / p; + const rho = Math.max( + 0, + Math.min( + 1, + ((1 - 2 / p) * trS2 + trS ** 2) / + ((n + 1 - 2 / p) * (trS2 - trS ** 2 / p)), + ), + ); + this.shrinkage_ = rho; + for (let i = 0; i < p; i++) { + for (let j = 0; j < p; j++) { + this.covariance_![i]![j] = + (1 - rho) * (this.covariance_![i]![j] ?? 0) + (i === j ? rho * mu : 0); + } + } + } + return this; + } +} diff --git a/src/covariance/index.ts b/src/covariance/index.ts new file mode 100644 index 0000000..69c8242 --- /dev/null +++ b/src/covariance/index.ts @@ -0,0 +1 @@ +export * from "./covariance.js"; diff --git a/src/cross_decomposition/index.ts b/src/cross_decomposition/index.ts new file mode 100644 index 0000000..eb765d1 --- /dev/null +++ b/src/cross_decomposition/index.ts @@ -0,0 +1 @@ +export * from "./pls.js"; diff --git a/src/cross_decomposition/pls.ts b/src/cross_decomposition/pls.ts new file mode 100644 index 0000000..21217ec --- /dev/null +++ b/src/cross_decomposition/pls.ts @@ -0,0 +1,404 @@ +/** + * Cross decomposition: PLSRegression, PLSSVD, PLSCanonical, CCA. + * Mirrors sklearn.cross_decomposition. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Compute column means. */ +function colMeans(X: Float64Array[]): Float64Array { + const p = (X[0] ?? new Float64Array(0)).length; + const m = new Float64Array(p); + for (const xi of X) for (let j = 0; j < p; j++) m[j] = (m[j] ?? 0) + (xi[j] ?? 0); + for (let j = 0; j < p; j++) m[j] = (m[j] ?? 0) / X.length; + return m; +} + +/** Center X by subtracting column means. 
*/ +function center(X: Float64Array[], means: Float64Array): Float64Array[] { + const p = means.length; + return X.map((xi) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) out[j] = (xi[j] ?? 0) - (means[j] ?? 0); + return out; + }); +} + +/** Compute X^T Y (p x q). */ +function Xtranspose_Y(X: Float64Array[], Y: Float64Array[]): Float64Array[] { + const p = (X[0] ?? new Float64Array(0)).length; + const q = (Y[0] ?? new Float64Array(0)).length; + const n = X.length; + const out = Array.from({ length: p }, () => new Float64Array(q)); + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + const yi = Y[i] ?? new Float64Array(q); + for (let j = 0; j < p; j++) { + for (let k = 0; k < q; k++) { + out[j]![k] = (out[j]![k] ?? 0) + (xi[j] ?? 0) * (yi[k] ?? 0); + } + } + } + return out; +} + +/** Compute matrix-vector product. */ +function matVec(M: Float64Array[], v: Float64Array): Float64Array { + const out = new Float64Array(M.length); + for (let i = 0; i < M.length; i++) { + const row = M[i] ?? new Float64Array(0); + for (let j = 0; j < v.length; j++) out[i] = (out[i] ?? 0) + (row[j] ?? 0) * (v[j] ?? 0); + } + return out; +} + +/** L2 norm of a vector. */ +function norm(v: Float64Array): number { + let s = 0; + for (let j = 0; j < v.length; j++) s += (v[j] ?? 0) ** 2; + return Math.sqrt(s); +} + +/** Normalize a vector in-place. */ +function normalize(v: Float64Array): void { + const n = norm(v); + if (n > 1e-15) for (let j = 0; j < v.length; j++) v[j] = (v[j] ?? 0) / n; +} + +/** Dot product. */ +function dot(a: Float64Array, b: Float64Array): number { + let s = 0; + for (let j = 0; j < a.length; j++) s += (a[j] ?? 0) * (b[j] ?? 0); + return s; +} + +/** NIPALS: find first left/right singular vectors of M via power iteration. */ +function nipals( + XtY: Float64Array[], + tol = 1e-10, + maxIter = 500, +): { u: Float64Array; v: Float64Array } { + const p = XtY.length; + const q = (XtY[0] ?? 
new Float64Array(0)).length; + let v = new Float64Array(q); + v[0] = 1; + let u = new Float64Array(p); + for (let iter = 0; iter < maxIter; iter++) { + // u = XtY v / ||XtY v|| + const uNew = matVec(XtY, v); + normalize(uNew); + // v = XtY^T u / ||XtY^T u|| + const vNew = new Float64Array(q); + for (let k = 0; k < q; k++) { + for (let j = 0; j < p; j++) { + vNew[k] = (vNew[k] ?? 0) + (XtY[j]![k] ?? 0) * (uNew[j] ?? 0); + } + } + normalize(vNew); + const diff = + norm( + Float64Array.from({ length: p }, (_, i) => (uNew[i] ?? 0) - (u[i] ?? 0)), + ) + + norm( + Float64Array.from({ length: q }, (_, i) => (vNew[i] ?? 0) - (v[i] ?? 0)), + ); + u = uNew; + v = vNew; + if (diff < tol) break; + } + return { u, v }; +} + +/** + * PLS regression via NIPALS algorithm. + * Mirrors sklearn.cross_decomposition.PLSRegression. + */ +export class PLSRegression { + nComponents: number; + maxIter: number; + tol: number; + scale: boolean; + + xWeights_: Float64Array[] | null = null; + yWeights_: Float64Array[] | null = null; + xLoadings_: Float64Array[] | null = null; + yLoadings_: Float64Array[] | null = null; + xScores_: Float64Array[] | null = null; + yScores_: Float64Array[] | null = null; + coef_: Float64Array[] | null = null; + + xMean_: Float64Array | null = null; + yMean_: Float64Array | null = null; + + constructor( + options: { + nComponents?: number; + maxIter?: number; + tol?: number; + scale?: boolean; + } = {}, + ) { + this.nComponents = options.nComponents ?? 2; + this.maxIter = options.maxIter ?? 500; + this.tol = options.tol ?? 1e-06; + this.scale = options.scale ?? true; + } + + fit(X: Float64Array[], Y: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const q = (Y[0] ?? 
new Float64Array(0)).length; + const k = Math.min(this.nComponents, p, q); + + this.xMean_ = colMeans(X); + this.yMean_ = colMeans(Y); + let Xc = center(X, this.xMean_); + let Yc = center(Y, this.yMean_); + + this.xWeights_ = []; + this.yWeights_ = []; + this.xLoadings_ = []; + this.yLoadings_ = []; + this.xScores_ = Array.from({ length: n }, () => new Float64Array(k)); + this.yScores_ = Array.from({ length: n }, () => new Float64Array(k)); + + for (let comp = 0; comp < k; comp++) { + const XtY = Xtranspose_Y(Xc, Yc); + const { u, v } = nipals(XtY, this.tol, this.maxIter); + + // Scores: t = Xc u, s = Yc v + const t = new Float64Array(n); + const s = new Float64Array(n); + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? new Float64Array(p); + const yi = Yc[i] ?? new Float64Array(q); + t[i] = dot(xi, u); + s[i] = dot(yi, v); + } + + // Normalize t + const tNorm = norm(t); + if (tNorm > 1e-15) for (let i = 0; i < n; i++) t[i] = (t[i] ?? 0) / tNorm; + + // X loadings: p_h = Xc^T t + const px = new Float64Array(p); + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? new Float64Array(p); + for (let j = 0; j < p; j++) px[j] = (px[j] ?? 0) + (xi[j] ?? 0) * (t[i] ?? 0); + } + + // Y loadings: q_h = Yc^T s / ||s||^2 + const sNorm2 = dot(s, s); + const qy = new Float64Array(q); + for (let i = 0; i < n; i++) { + const yi = Yc[i] ?? new Float64Array(q); + for (let j = 0; j < q; j++) { + qy[j] = (qy[j] ?? 0) + (yi[j] ?? 0) * (s[i] ?? 0); + } + } + if (sNorm2 > 1e-15) for (let j = 0; j < q; j++) qy[j] = (qy[j] ?? 0) / sNorm2; + + this.xWeights_[comp] = u; + this.yWeights_[comp] = v; + this.xLoadings_[comp] = px; + this.yLoadings_[comp] = qy; + for (let i = 0; i < n; i++) { + this.xScores_![i]![comp] = t[i] ?? 0; + this.yScores_![i]![comp] = s[i] ?? 0; + } + + // Deflate + const tFull = new Float64Array(n); + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? 
new Float64Array(p); + tFull[i] = dot(xi, u); + } + Xc = Xc.map((xi, i) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) out[j] = (xi[j] ?? 0) - (tFull[i] ?? 0) * (px[j] ?? 0); + return out; + }); + Yc = Yc.map((yi, i) => { + const out = new Float64Array(q); + for (let j = 0; j < q; j++) out[j] = (yi[j] ?? 0) - (tFull[i] ?? 0) * (qy[j] ?? 0); + return out; + }); + } + + // Compute regression coefficients: coef_ = W (P^T W)^{-1} Q^T + // Simplified: use pseudo-inverse via stored weights and loadings + this._computeCoef(p, q, k); + return this; + } + + private _computeCoef(p: number, q: number, k: number): void { + // coef_ = xWeights_ @ inv(xLoadings_^T @ xWeights_) @ yLoadings_^T + // For simplicity, use a direct approach: coef = W (P^T W)^-1 Q^T + const W = this.xWeights_!; + const P = this.xLoadings_!; + const Q = this.yLoadings_!; + + // PtW = P^T W (k x k) + const PtW = Array.from({ length: k }, () => new Float64Array(k)); + for (let i = 0; i < k; i++) { + for (let j = 0; j < k; j++) { + PtW[i]![j] = dot(P[i] ?? new Float64Array(0), W[j] ?? new Float64Array(0)); + } + } + + // Invert PtW (simple LU for small k) + const inv = this._invertSmall(PtW, k); + + // coef_ (p x q) = W @ inv @ Q^T + this.coef_ = Array.from({ length: p }, () => new Float64Array(q)); + for (let i = 0; i < p; i++) { + for (let j = 0; j < q; j++) { + let s = 0; + for (let a = 0; a < k; a++) { + let s2 = 0; + for (let b = 0; b < k; b++) { + s2 += (inv[a]![b] ?? 0) * (Q[b]![j] ?? 0); + } + s += (W[a]![i] ?? 0) * s2; + } + this.coef_![i]![j] = s; + } + } + } + + private _invertSmall(M: Float64Array[], k: number): Float64Array[] { + // Augmented matrix [M | I] + const aug = Array.from({ length: k }, (_, i) => { + const row = new Float64Array(2 * k); + for (let j = 0; j < k; j++) row[j] = M[i]![j] ?? 
0; + row[k + i] = 1; + return row; + }); + for (let col = 0; col < k; col++) { + // Find pivot + let maxRow = col; + for (let row = col + 1; row < k; row++) { + if (Math.abs(aug[row]![col] ?? 0) > Math.abs(aug[maxRow]![col] ?? 0)) maxRow = row; + } + [aug[col], aug[maxRow]] = [aug[maxRow]!, aug[col]!] as [Float64Array, Float64Array]; + const pivot = aug[col]![col] ?? 1e-12; + if (Math.abs(pivot) < 1e-15) continue; + for (let j = 0; j < 2 * k; j++) aug[col]![j] = (aug[col]![j] ?? 0) / pivot; + for (let row = 0; row < k; row++) { + if (row === col) continue; + const factor = aug[row]![col] ?? 0; + for (let j = 0; j < 2 * k; j++) { + aug[row]![j] = (aug[row]![j] ?? 0) - factor * (aug[col]![j] ?? 0); + } + } + } + return aug.map((row) => Float64Array.from({ length: k }, (_, j) => row[k + j] ?? 0)); + } + + predict(X: Float64Array[]): Float64Array[] { + if (this.coef_ === null || this.xMean_ === null || this.yMean_ === null) { + throw new NotFittedError(); + } + const p = this.xMean_.length; + const q = this.yMean_.length; + return X.map((xi) => { + const xc = new Float64Array(p); + for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 0); + const out = new Float64Array(q); + for (let j = 0; j < q; j++) { + let s = 0; + for (let k = 0; k < p; k++) s += (xc[k] ?? 0) * (this.coef_![k]![j] ?? 0); + out[j] = s + (this.yMean_![j] ?? 0); + } + return out; + }); + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.xWeights_ === null || this.xMean_ === null) throw new NotFittedError(); + const k = this.xWeights_.length; + const p = this.xMean_.length; + return X.map((xi) => { + const xc = new Float64Array(p); + for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 0); + const out = new Float64Array(k); + for (let i = 0; i < k; i++) { + out[i] = dot(xc, this.xWeights_![i] ?? 
new Float64Array(0)); + } + return out; + }); + } + + fitTransform(X: Float64Array[], Y: Float64Array[]): [Float64Array[], Float64Array[]] { + this.fit(X, Y); + return [this.xScores_!, this.yScores_!]; + } +} + +/** + * Partial Least Squares SVD. + * Mirrors sklearn.cross_decomposition.PLSSVD. + */ +export class PLSSVD { + nComponents: number; + + xWeights_: Float64Array[] | null = null; + yWeights_: Float64Array[] | null = null; + xScores_: Float64Array[] | null = null; + yScores_: Float64Array[] | null = null; + xMean_: Float64Array | null = null; + yMean_: Float64Array | null = null; + + constructor(options: { nComponents?: number } = {}) { + this.nComponents = options.nComponents ?? 2; + } + + fit(X: Float64Array[], Y: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const q = (Y[0] ?? new Float64Array(0)).length; + const k = Math.min(this.nComponents, p, q); + + this.xMean_ = colMeans(X); + this.yMean_ = colMeans(Y); + const Xc = center(X, this.xMean_); + const Yc = center(Y, this.yMean_); + + this.xWeights_ = []; + this.yWeights_ = []; + this.xScores_ = Array.from({ length: n }, () => new Float64Array(k)); + this.yScores_ = Array.from({ length: n }, () => new Float64Array(k)); + + let curXtY = Xtranspose_Y(Xc, Yc); + for (let comp = 0; comp < k; comp++) { + const { u, v } = nipals(curXtY); + this.xWeights_[comp] = u; + this.yWeights_[comp] = v; + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? new Float64Array(p); + const yi = Yc[i] ?? new Float64Array(q); + this.xScores_![i]![comp] = dot(xi, u); + this.yScores_![i]![comp] = dot(yi, v); + } + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.xWeights_ === null || this.xMean_ === null) throw new NotFittedError(); + const k = this.xWeights_.length; + const p = this.xMean_.length; + return X.map((xi) => { + const xc = new Float64Array(p); + for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 
0); + const out = new Float64Array(k); + for (let i = 0; i < k; i++) out[i] = dot(xc, this.xWeights_![i] ?? new Float64Array(0)); + return out; + }); + } + + fitTransform(X: Float64Array[], Y: Float64Array[]): [Float64Array[], Float64Array[]] { + this.fit(X, Y); + return [this.xScores_!, this.yScores_!]; + } +} diff --git a/src/decomposition/advanced.ts b/src/decomposition/advanced.ts new file mode 100644 index 0000000..e29087a --- /dev/null +++ b/src/decomposition/advanced.ts @@ -0,0 +1,547 @@ +/** + * Additional decomposition methods: IncrementalPCA, KernelPCA, FactorAnalysis. + * Mirrors sklearn.decomposition. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Compute column means. */ +function colMeans(X: Float64Array[]): Float64Array { + const p = (X[0] ?? new Float64Array(0)).length; + const m = new Float64Array(p); + for (const xi of X) for (let j = 0; j < p; j++) m[j] = (m[j] ?? 0) + (xi[j] ?? 0); + for (let j = 0; j < p; j++) m[j] = (m[j] ?? 0) / X.length; + return m; +} + +/** Matrix multiply A (m x k) * B (k x n) */ +function matMul(A: Float64Array[], B: Float64Array[]): Float64Array[] { + const m = A.length; + const k = (A[0] ?? new Float64Array(0)).length; + const n = (B[0] ?? new Float64Array(0)).length; + const C = Array.from({ length: m }, () => new Float64Array(n)); + for (let i = 0; i < m; i++) { + for (let j = 0; j < n; j++) { + let s = 0; + for (let l = 0; l < k; l++) s += (A[i]![l] ?? 0) * (B[l]![j] ?? 0); + C[i]![j] = s; + } + } + return C; +} + +/** Compute X^T X. */ +function gramMatrix(X: Float64Array[]): Float64Array[] { + const p = (X[0] ?? new Float64Array(0)).length; + const n = X.length; + const G = Array.from({ length: p }, () => new Float64Array(p)); + for (let i = 0; i < n; i++) { + const xi = X[i] ?? new Float64Array(p); + for (let a = 0; a < p; a++) { + for (let b = a; b < p; b++) { + const val = (xi[a] ?? 0) * (xi[b] ?? 0); + G[a]![b] = (G[a]![b] ?? 0) + val; + if (a !== b) G[b]![a] = (G[b]![a] ?? 
0) + val; + } + } + } + return G; +} + +/** Power iteration for top-k eigenvectors of a symmetric matrix. */ +function eigenDecomp( + M: Float64Array[], + k: number, + nIter = 100, +): { vectors: Float64Array[]; values: Float64Array } { + const p = M.length; + const vectors: Float64Array[] = []; + const values = new Float64Array(k); + // Deflation approach + const Mwork = M.map((row) => row.slice()); + + for (let comp = 0; comp < k; comp++) { + // Random init + let v = new Float64Array(p); + for (let j = 0; j < p; j++) v[j] = (j === comp ? 1 : 0.01 * Math.sin(j + comp)); + let eigenval = 0; + for (let iter = 0; iter < nIter; iter++) { + const Mv = new Float64Array(p); + for (let i = 0; i < p; i++) { + for (let j = 0; j < p; j++) Mv[i] = (Mv[i] ?? 0) + (Mwork[i]![j] ?? 0) * (v[j] ?? 0); + } + eigenval = 0; + for (let j = 0; j < p; j++) eigenval += (v[j] ?? 0) * (Mv[j] ?? 0); + let norm = 0; + for (let j = 0; j < p; j++) norm += (Mv[j] ?? 0) ** 2; + norm = Math.sqrt(norm); + if (norm < 1e-15) break; + const vNew = Float64Array.from(Mv, (x) => x / norm); + const diff = Math.sqrt(vNew.reduce((s, x, i) => s + (x - (v[i] ?? 0)) ** 2, 0)); + v = vNew; + if (diff < 1e-10) break; + } + vectors[comp] = v; + values[comp] = Math.max(0, eigenval); + // Deflate + for (let i = 0; i < p; i++) { + for (let j = 0; j < p; j++) { + Mwork[i]![j] = (Mwork[i]![j] ?? 0) - eigenval * (v[i] ?? 0) * (v[j] ?? 0); + } + } + } + return { vectors, values }; +} + +/** + * Incremental principal component analysis (IPCA). + * Processes data in batches, enabling large-scale PCA. + * Mirrors sklearn.decomposition.IncrementalPCA. 
+ */ +export class IncrementalPCA { + nComponents: number | null; + batchSize: number | null; + whiten: boolean; + + components_: Float64Array[] | null = null; + explainedVariance_: Float64Array | null = null; + explainedVarianceRatio_: Float64Array | null = null; + mean_: Float64Array | null = null; + nSamplesSeen_: number = 0; + + constructor( + options: { + nComponents?: number | null; + batchSize?: number | null; + whiten?: boolean; + } = {}, + ) { + this.nComponents = options.nComponents ?? null; + this.batchSize = options.batchSize ?? null; + this.whiten = options.whiten ?? false; + } + + partialFit(X: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const k = Math.min(this.nComponents ?? p, p, n); + + // Incremental mean update + if (this.mean_ === null) { + this.mean_ = colMeans(X); + this.nSamplesSeen_ = n; + } else { + const prevN = this.nSamplesSeen_; + const batchMean = colMeans(X); + const totalN = prevN + n; + const newMean = new Float64Array(p); + for (let j = 0; j < p; j++) { + newMean[j] = ((this.mean_[j] ?? 0) * prevN + (batchMean[j] ?? 0) * n) / totalN; + } + this.mean_ = newMean; + this.nSamplesSeen_ = totalN; + } + + // Center data + const Xc = X.map((xi) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) out[j] = (xi[j] ?? 0) - (this.mean_![j] ?? 0); + return out; + }); + + // Compute covariance contribution and update components via SVD + const G = gramMatrix(Xc); + + if (this.components_ !== null) { + // Merge with existing: approximate by re-computing on augmented covariance + const prevComp = this.components_!; + const prevVar = this.explainedVariance_!; + // Add previous covariance contribution + for (let a = 0; a < k; a++) { + const va = prevComp[a] ?? new Float64Array(p); + const lambda = prevVar[a] ?? 0; + for (let i = 0; i < p; i++) { + for (let j = 0; j < p; j++) { + G[i]![j] = (G[i]![j] ?? 0) + lambda * (va[i] ?? 0) * (va[j] ?? 
0); + } + } + } + } + + const { vectors, values } = eigenDecomp(G, k); + this.components_ = vectors; + const totalVar = values.reduce((s, v) => s + v, 0); + this.explainedVariance_ = values; + this.explainedVarianceRatio_ = Float64Array.from( + values, + (v) => v / (totalVar || 1), + ); + return this; + } + + fit(X: Float64Array[]): this { + const batchSize = this.batchSize ?? Math.max(50, X.length); + this.mean_ = null; + this.components_ = null; + this.nSamplesSeen_ = 0; + for (let i = 0; i < X.length; i += batchSize) { + this.partialFit(X.slice(i, i + batchSize)); + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.components_ === null || this.mean_ === null) throw new NotFittedError(); + const k = this.components_.length; + const p = this.mean_.length; + return X.map((xi) => { + const xc = new Float64Array(p); + for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.mean_![j] ?? 0); + const out = new Float64Array(k); + for (let i = 0; i < k; i++) { + const comp = this.components_![i] ?? new Float64Array(p); + let s = 0; + for (let j = 0; j < p; j++) s += (xc[j] ?? 0) * (comp[j] ?? 0); + if (this.whiten) { + const std = Math.sqrt(this.explainedVariance_![i] ?? 1) || 1; + out[i] = s / std; + } else { + out[i] = s; + } + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} + +/** + * Kernel PCA — kernelized non-linear PCA. + * Mirrors sklearn.decomposition.KernelPCA. 
+ */ +export class KernelPCA { + nComponents: number | null; + kernel: "rbf" | "poly" | "sigmoid" | "cosine" | "linear"; + gamma: number | null; + degree: number; + coef0: number; + + alphas_: Float64Array[] | null = null; + lambdas_: Float64Array | null = null; + xFit_: Float64Array[] | null = null; + kFitRows_: Float64Array[] | null = null; + + constructor( + options: { + nComponents?: number | null; + kernel?: "rbf" | "poly" | "sigmoid" | "cosine" | "linear"; + gamma?: number | null; + degree?: number; + coef0?: number; + } = {}, + ) { + this.nComponents = options.nComponents ?? null; + this.kernel = options.kernel ?? "rbf"; + this.gamma = options.gamma ?? null; + this.degree = options.degree ?? 3; + this.coef0 = options.coef0 ?? 1; + } + + private _kernelFunc(a: Float64Array, b: Float64Array): number { + const p = a.length; + let dot = 0; + let normA = 0; + let normB = 0; + for (let j = 0; j < p; j++) { + dot += (a[j] ?? 0) * (b[j] ?? 0); + normA += (a[j] ?? 0) ** 2; + normB += (b[j] ?? 0) ** 2; + } + const gamma = this.gamma ?? (1 / p || 1); + switch (this.kernel) { + case "rbf": { + let dist = 0; + for (let j = 0; j < p; j++) dist += ((a[j] ?? 0) - (b[j] ?? 0)) ** 2; + return Math.exp(-gamma * dist); + } + case "poly": return (gamma * dot + this.coef0) ** this.degree; + case "sigmoid": return Math.tanh(gamma * dot + this.coef0); + case "cosine": { + const denom = Math.sqrt(normA) * Math.sqrt(normB); + return denom > 1e-15 ? dot / denom : 0; + } + default: return dot; + } + } + + fit(X: Float64Array[]): this { + const n = X.length; + const k = Math.min(this.nComponents ?? n, n); + this.xFit_ = X; + // Compute kernel matrix + const K = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = i; j < n; j++) { + const val = this._kernelFunc(X[i] ?? new Float64Array(0), X[j] ?? 
new Float64Array(0)); + K[i]![j] = val; + K[j]![i] = val; + } + } + // Center kernel matrix + const rowMeans = new Float64Array(n); + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) rowMeans[i] = (rowMeans[i] ?? 0) + (K[i]![j] ?? 0); + rowMeans[i] = (rowMeans[i] ?? 0) / n; + } + let grandMean = 0; + for (let i = 0; i < n; i++) grandMean += rowMeans[i] ?? 0; + grandMean /= n; + const Kc = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) { + Kc[i]![j] = (K[i]![j] ?? 0) - (rowMeans[i] ?? 0) - (rowMeans[j] ?? 0) + grandMean; + } + } + this.kFitRows_ = Array.from({ length: n }, (_, i) => { + const row = new Float64Array(n); + for (let j = 0; j < n; j++) row[j] = Kc[i]![j] ?? 0; + return row; + }); + + // Eigen decomposition of Kc + const { vectors, values } = eigenDecomp(Kc, k); + this.lambdas_ = values; + // alpha_i = eigvec_i / sqrt(eigenval_i) + this.alphas_ = vectors.map((v, i) => { + const lam = values[i] ?? 1e-15; + const scale = Math.sqrt(Math.abs(lam) || 1e-15); + return Float64Array.from(v, (x) => x / scale); + }); + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.alphas_ === null || this.xFit_ === null || this.kFitRows_ === null) { + throw new NotFittedError(); + } + const nTrain = this.xFit_.length; + const k = this.alphas_.length; + return X.map((xi) => { + const kv = new Float64Array(nTrain); + for (let j = 0; j < nTrain; j++) { + kv[j] = this._kernelFunc(xi, this.xFit_![j] ?? new Float64Array(0)); + } + const out = new Float64Array(k); + for (let i = 0; i < k; i++) { + const alpha = this.alphas_![i] ?? new Float64Array(nTrain); + let s = 0; + for (let j = 0; j < nTrain; j++) s += (kv[j] ?? 0) * (alpha[j] ?? 0); + out[i] = s; + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} + +/** + * Factor Analysis via EM algorithm. + * Mirrors sklearn.decomposition.FactorAnalysis. 
+ */ +export class FactorAnalysis { + nComponents: number; + maxIter: number; + tol: number; + svdMethod: "randomized" | "lapack"; + + components_: Float64Array[] | null = null; + noiseVariance_: Float64Array | null = null; + mean_: Float64Array | null = null; + nIter_: number = 0; + + constructor( + options: { + nComponents?: number; + maxIter?: number; + tol?: number; + svdMethod?: "randomized" | "lapack"; + } = {}, + ) { + this.nComponents = options.nComponents ?? 1; + this.maxIter = options.maxIter ?? 1000; + this.tol = options.tol ?? 1e-2; + this.svdMethod = options.svdMethod ?? "randomized"; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const k = Math.min(this.nComponents, p); + + this.mean_ = colMeans(X); + const Xc = X.map((xi) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) out[j] = (xi[j] ?? 0) - (this.mean_![j] ?? 0); + return out; + }); + + // Initialize W (p x k) and psi (noise variances, p) + const W = Array.from({ length: p }, (_, i) => + Float64Array.from({ length: k }, (_, j) => (i === j ? 1 : 0.1 * Math.sin(i + j))), + ); + const psi = new Float64Array(p).fill(1); + + // EM algorithm + for (let iter = 0; iter < this.maxIter; iter++) { + // E-step: compute posterior mean of factors + // M = W^T Psi^-1 W + I (k x k) + const M = Array.from({ length: k }, () => new Float64Array(k)); + for (let a = 0; a < k; a++) { + M[a]![a] = 1; + for (let b = 0; b < k; b++) { + for (let j = 0; j < p; j++) { + M[a]![b] = (M[a]![b] ?? 0) + (W[j]![a] ?? 0) * (W[j]![b] ?? 0) / ((psi[j] ?? 1) || 1); + } + } + } + + // Invert M (k x k) via simple Gauss-Jordan + const Minv = this._invertKK(M, k); + + // Compute E[z|x] = Minv W^T Psi^-1 x + // WtPsiInv = W^T Psi^-1 (k x p) + const WtPsiInv = Array.from({ length: k }, (_, a) => + Float64Array.from({ length: p }, (_, j) => (W[j]![a] ?? 0) / ((psi[j] ?? 
1) || 1)), + ); + + // Ez (n x k): Ez[i] = Minv WtPsiInv Xc[i] + const Ez = Array.from({ length: n }, (_, i) => { + const xi = Xc[i] ?? new Float64Array(p); + const out = new Float64Array(k); + for (let a = 0; a < k; a++) { + let s = 0; + for (let j = 0; j < p; j++) s += (WtPsiInv[a]![j] ?? 0) * (xi[j] ?? 0); + for (let b = 0; b < k; b++) out[a] = (out[a] ?? 0) + (Minv[a]![b] ?? 0) * s; + } + return out; + }); + + // E[zz^T] = Minv + Ez Ez^T (per sample, but summed) + const Ezz = Array.from({ length: k }, () => new Float64Array(k)); + for (let a = 0; a < k; a++) { + for (let b = 0; b < k; b++) { + Ezz[a]![b] = n * (Minv[a]![b] ?? 0); + for (let i = 0; i < n; i++) { + Ezz[a]![b] = (Ezz[a]![b] ?? 0) + (Ez[i]![a] ?? 0) * (Ez[i]![b] ?? 0); + } + } + } + + // M-step: update W + // W_new (p x k) = (sum_i x_i E[z|x_i]^T) Ezz^-1 + const XEz = Array.from({ length: p }, () => new Float64Array(k)); + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? new Float64Array(p); + for (let j = 0; j < p; j++) { + for (let a = 0; a < k; a++) { + XEz[j]![a] = (XEz[j]![a] ?? 0) + (xi[j] ?? 0) * (Ez[i]![a] ?? 0); + } + } + } + const EzzInv = this._invertKK(Ezz, k); + const WnewArr = matMul(XEz, EzzInv); + + // Update psi + const psiNew = new Float64Array(p); + for (let j = 0; j < p; j++) { + let s = 0; + for (let i = 0; i < n; i++) { + const xi = Xc[i] ?? new Float64Array(p); + s += (xi[j] ?? 0) ** 2; + for (let a = 0; a < k; a++) { + s -= (WnewArr[j]![a] ?? 0) * (Ez[i]![a] ?? 0) * (xi[j] ?? 0); + } + } + psiNew[j] = Math.max(1e-6, s / n); + } + + // Check convergence + let maxDiff = 0; + for (let j = 0; j < p; j++) { + for (let a = 0; a < k; a++) { + maxDiff = Math.max(maxDiff, Math.abs((WnewArr[j]![a] ?? 0) - (W[j]![a] ?? 0))); + } + } + + for (let j = 0; j < p; j++) { + for (let a = 0; a < k; a++) W[j]![a] = WnewArr[j]![a] ?? 0; + psi[j] = psiNew[j] ?? 
1e-6; + } + + this.nIter_ = iter + 1; + if (maxDiff < this.tol) break; + } + + // components_ = W^T (k x p) + this.components_ = Array.from({ length: k }, (_, a) => + Float64Array.from({ length: p }, (_, j) => W[j]![a] ?? 0), + ); + this.noiseVariance_ = psi; + return this; + } + + private _invertKK(M: Float64Array[], k: number): Float64Array[] { + const aug = Array.from({ length: k }, (_, i) => { + const row = new Float64Array(2 * k); + for (let j = 0; j < k; j++) row[j] = M[i]![j] ?? 0; + row[k + i] = 1; + return row; + }); + for (let col = 0; col < k; col++) { + let maxRow = col; + for (let row = col + 1; row < k; row++) { + if (Math.abs(aug[row]![col] ?? 0) > Math.abs(aug[maxRow]![col] ?? 0)) maxRow = row; + } + [aug[col], aug[maxRow]] = [aug[maxRow]!, aug[col]!] as [Float64Array, Float64Array]; + const pivot = aug[col]![col] ?? 1e-12; + if (Math.abs(pivot) < 1e-15) continue; + for (let j = 0; j < 2 * k; j++) aug[col]![j] = (aug[col]![j] ?? 0) / pivot; + for (let row = 0; row < k; row++) { + if (row === col) continue; + const factor = aug[row]![col] ?? 0; + for (let j = 0; j < 2 * k; j++) { + aug[row]![j] = (aug[row]![j] ?? 0) - factor * (aug[col]![j] ?? 0); + } + } + } + return aug.map((row) => Float64Array.from({ length: k }, (_, j) => row[k + j] ?? 0)); + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.components_ === null || this.mean_ === null) throw new NotFittedError(); + const k = this.components_.length; + const p = this.mean_.length; + return X.map((xi) => { + const xc = new Float64Array(p); + for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.mean_![j] ?? 0); + const out = new Float64Array(k); + for (let i = 0; i < k; i++) { + const comp = this.components_![i] ?? new Float64Array(p); + let s = 0; + for (let j = 0; j < p; j++) s += (xc[j] ?? 0) * (comp[j] ?? 
0); + out[i] = s; + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} diff --git a/src/decomposition/index.ts b/src/decomposition/index.ts index 6bb90c3..63088ff 100644 --- a/src/decomposition/index.ts +++ b/src/decomposition/index.ts @@ -1,2 +1,3 @@ export * from "./pca.js"; export * from "./nmf.js"; +export * from "./advanced.js"; diff --git a/src/feature_extraction/index.ts b/src/feature_extraction/index.ts index ff90a7a..6345376 100644 --- a/src/feature_extraction/index.ts +++ b/src/feature_extraction/index.ts @@ -1 +1,2 @@ export * from "./dict_vectorizer.js"; +export * from "./text.js"; diff --git a/src/feature_extraction/text.ts b/src/feature_extraction/text.ts new file mode 100644 index 0000000..8f3969a --- /dev/null +++ b/src/feature_extraction/text.ts @@ -0,0 +1,293 @@ +/** + * Text feature extraction: CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer. + * Mirrors sklearn.feature_extraction.text. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** Tokenize text by splitting on non-word characters (lowercase). */ +function tokenize(text: string): string[] { + return text.toLowerCase().match(/\b[a-z]+\b/g) ?? []; +} + +/** Options for CountVectorizer. */ +export interface CountVectorizerOptions { + minDf?: number; + maxDf?: number; + maxFeatures?: number | null; + ngramRange?: [number, number]; + lowercase?: boolean; + analyzer?: "word" | "char"; +} + +/** Options for HashingVectorizer. */ +export interface HashingVectorizerOptions { + nFeatures?: number; + alternate_sign?: boolean; + lowercase?: boolean; + ngramRange?: [number, number]; +} + +/** Options for TfidfTransformer. */ +export interface TfidfTransformerOptions { + norm?: "l1" | "l2" | null; + useIdf?: boolean; + smoothIdf?: boolean; + sublinearTf?: boolean; +} + +/** Simple string hash. 
*/ +function murmurhash(str: string): number { + let h = 0xdeadbeef; + for (let i = 0; i < str.length; i++) { + h = Math.imul(h ^ str.charCodeAt(i), 0x9e3779b9); + h = (h << 13) | (h >>> 19); + } + return (h ^ (h >>> 16)) >>> 0; +} + +/** + * Convert a collection of text documents to a matrix of token counts. + * Mirrors sklearn.feature_extraction.text.CountVectorizer. + */ +export class CountVectorizer { + minDf: number; + maxDf: number; + maxFeatures: number | null; + ngramRange: [number, number]; + lowercase: boolean; + analyzer: "word" | "char"; + + vocabulary_: Map | null = null; + featureNames_: string[] | null = null; + + constructor(options: CountVectorizerOptions = {}) { + this.minDf = options.minDf ?? 1; + this.maxDf = options.maxDf ?? 1.0; + this.maxFeatures = options.maxFeatures ?? null; + this.ngramRange = options.ngramRange ?? [1, 1]; + this.lowercase = options.lowercase ?? true; + this.analyzer = options.analyzer ?? "word"; + } + + private _analyze(doc: string): string[] { + const text = this.lowercase ? doc.toLowerCase() : doc; + const tokens = this.analyzer === "word" + ? (text.match(/\b[a-z0-9]+\b/g) ?? []) + : Array.from(text); + const [minN, maxN] = this.ngramRange; + if (minN === 1 && maxN === 1) return tokens; + const ngrams: string[] = []; + for (let n = minN; n <= maxN; n++) { + for (let i = 0; i <= tokens.length - n; i++) { + ngrams.push(tokens.slice(i, i + n).join(" ")); + } + } + return ngrams; + } + + fit(docs: string[]): this { + const termDocFreq = new Map(); + const n = docs.length; + for (const doc of docs) { + const seen = new Set(); + for (const term of this._analyze(doc)) { + if (!seen.has(term)) { + seen.add(term); + termDocFreq.set(term, (termDocFreq.get(term) ?? 0) + 1); + } + } + } + const minDfAbs = this.minDf < 1 ? Math.floor(this.minDf * n) : this.minDf; + const maxDfAbs = this.maxDf <= 1.0 ? 
Math.ceil(this.maxDf * n) : this.maxDf; + let terms = [...termDocFreq.entries()] + .filter(([, df]) => df >= minDfAbs && df <= maxDfAbs) + .sort(([a], [b]) => a.localeCompare(b)) + .map(([t]) => t); + if (this.maxFeatures !== null) { + terms = terms.slice(0, this.maxFeatures); + } + this.vocabulary_ = new Map<string, number>(terms.map((t, i) => [t, i])); + this.featureNames_ = terms; + return this; + } + + transform(docs: string[]): Float64Array[] { + if (this.vocabulary_ === null) throw new NotFittedError(); + const vocab = this.vocabulary_; + const nFeatures = vocab.size; + return docs.map((doc) => { + const row = new Float64Array(nFeatures); + for (const term of this._analyze(doc)) { + const idx = vocab.get(term); + if (idx !== undefined) row[idx] = (row[idx] ?? 0) + 1; + } + return row; + }); + } + + fitTransform(docs: string[]): Float64Array[] { + return this.fit(docs).transform(docs); + } + + getFeatureNames(): string[] { + if (this.featureNames_ === null) throw new NotFittedError(); + return this.featureNames_; + } +} + +/** + * Transform a count matrix to a normalized TF or TF-IDF representation. + * Mirrors sklearn.feature_extraction.text.TfidfTransformer. + */ +export class TfidfTransformer { + norm: "l1" | "l2" | null; + useIdf: boolean; + smoothIdf: boolean; + sublinearTf: boolean; + + idf_: Float64Array | null = null; + + constructor(options: TfidfTransformerOptions = {}) { + this.norm = options.norm ?? "l2"; + this.useIdf = options.useIdf ?? true; + this.smoothIdf = options.smoothIdf ?? true; + this.sublinearTf = options.sublinearTf ?? false; + } + + fit(X: Float64Array[]): this { + if (!this.useIdf) { + this.idf_ = null; + return this; + } + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + const df = new Float64Array(p); + for (const row of X) { + for (let j = 0; j < p; j++) { + if ((row[j] ?? 0) > 0) df[j] = (df[j] ?? 0) + 1; + } + } + const smooth = this.smoothIdf ?
1 : 0; + this.idf_ = new Float64Array(p); + for (let j = 0; j < p; j++) { + this.idf_[j] = Math.log((n + smooth) / ((df[j] ?? 0) + smooth)) + 1; + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + const p = (X[0] ?? new Float64Array(0)).length; + return X.map((row) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) { + let tf = row[j] ?? 0; + if (this.sublinearTf && tf > 0) tf = 1 + Math.log(tf); + const idfVal = this.idf_ !== null ? (this.idf_[j] ?? 1) : 1; + out[j] = tf * idfVal; + } + if (this.norm === "l2") { + let norm = 0; + for (let j = 0; j < p; j++) norm += (out[j] ?? 0) ** 2; + norm = Math.sqrt(norm); + if (norm > 0) for (let j = 0; j < p; j++) out[j] = (out[j] ?? 0) / norm; + } else if (this.norm === "l1") { + let norm = 0; + for (let j = 0; j < p; j++) norm += Math.abs(out[j] ?? 0); + if (norm > 0) for (let j = 0; j < p; j++) out[j] = (out[j] ?? 0) / norm; + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} + +/** + * Convert a collection of raw documents to a matrix of TF-IDF features. + * Mirrors sklearn.feature_extraction.text.TfidfVectorizer. 
+ */ +export class TfidfVectorizer { + private cv: CountVectorizer; + private tfidf: TfidfTransformer; + + vocabulary_: Map<string, number> | null = null; + featureNames_: string[] | null = null; + + constructor( + cvOptions: CountVectorizerOptions = {}, + tfidfOptions: TfidfTransformerOptions = {}, + ) { + this.cv = new CountVectorizer(cvOptions); + this.tfidf = new TfidfTransformer(tfidfOptions); + } + + fit(docs: string[]): this { + const counts = this.cv.fit(docs).transform(docs); + this.tfidf.fit(counts); + this.vocabulary_ = this.cv.vocabulary_; + this.featureNames_ = this.cv.featureNames_; + return this; + } + + transform(docs: string[]): Float64Array[] { + const counts = this.cv.transform(docs); + return this.tfidf.transform(counts); + } + + fitTransform(docs: string[]): Float64Array[] { + return this.fit(docs).transform(docs); + } + + getFeatureNames(): string[] { + if (this.featureNames_ === null) throw new NotFittedError(); + return this.featureNames_; + } +} + +/** + * Convert a collection of text documents to a matrix of token occurrences using a hash trick. + * Mirrors sklearn.feature_extraction.text.HashingVectorizer. + */ +export class HashingVectorizer { + nFeatures: number; + alternateSign: boolean; + lowercase: boolean; + ngramRange: [number, number]; + + constructor(options: HashingVectorizerOptions = {}) { + this.nFeatures = options.nFeatures ?? 2 ** 20; + this.alternateSign = options.alternate_sign ?? true; + this.lowercase = options.lowercase ?? true; + this.ngramRange = options.ngramRange ?? [1, 1]; + } + + private _analyze(doc: string): string[] { + const text = this.lowercase ? doc.toLowerCase() : doc; + const tokens = text.match(/\b[a-z0-9]+\b/g) ??
[]; + const [minN, maxN] = this.ngramRange; + if (minN === 1 && maxN === 1) return tokens; + const ngrams: string[] = []; + for (let n = minN; n <= maxN; n++) { + for (let i = 0; i <= tokens.length - n; i++) { + ngrams.push(tokens.slice(i, i + n).join(" ")); + } + } + return ngrams; + } + + transform(docs: string[]): Float64Array[] { + return docs.map((doc) => { + const row = new Float64Array(this.nFeatures); + for (const term of this._analyze(doc)) { + const h = murmurhash(term); + const idx = h % this.nFeatures; + const sign = this.alternateSign ? (h & 1 ? 1 : -1) : 1; + row[idx] = (row[idx] ?? 0) + sign; + } + return row; + }); + } +} diff --git a/src/index.ts b/src/index.ts index 0ee2325..1ce19db 100644 --- a/src/index.ts +++ b/src/index.ts @@ -93,3 +93,12 @@ export * from "./kernel_ridge/index.js"; // Gaussian process export * from "./gaussian_process/index.js"; +// Kernel approximation +export * from "./kernel_approximation/index.js"; + +// Covariance +export * from "./covariance/index.js"; + +// Cross decomposition +export * from "./cross_decomposition/index.js"; + diff --git a/src/kernel_approximation/index.ts b/src/kernel_approximation/index.ts new file mode 100644 index 0000000..adceb46 --- /dev/null +++ b/src/kernel_approximation/index.ts @@ -0,0 +1 @@ +export * from "./rbf_sampler.js"; diff --git a/src/kernel_approximation/rbf_sampler.ts b/src/kernel_approximation/rbf_sampler.ts new file mode 100644 index 0000000..c51ad62 --- /dev/null +++ b/src/kernel_approximation/rbf_sampler.ts @@ -0,0 +1,271 @@ +/** + * Kernel approximation methods: RBFSampler, Nystroem, AdditiveChi2Sampler, SkewedChi2Sampler. + * Mirrors sklearn.kernel_approximation. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** + * Approximates feature map of an RBF kernel by Monte Carlo approximation. + * Mirrors sklearn.kernel_approximation.RBFSampler. 
+ */ +export class RBFSampler { + gamma: number; + nComponents: number; + randomState: number; + + randomWeights_: Float64Array[] | null = null; + randomOffset_: Float64Array | null = null; + + constructor( + options: { gamma?: number; nComponents?: number; randomState?: number } = {}, + ) { + this.gamma = options.gamma ?? 1.0; + this.nComponents = options.nComponents ?? 100; + this.randomState = options.randomState ?? 42; + } + + private _rng(): () => number { + let s = this.randomState; + return () => { + s = (s * 1664525 + 1013904223) & 0xffffffff; + return (s >>> 0) / 0x100000000; + }; + } + + private _randn(rng: () => number): number { + const u = rng(); + const v = rng(); + return Math.sqrt(-2 * Math.log(u + 1e-15)) * Math.cos(2 * Math.PI * v); + } + + fit(X: Float64Array[]): this { + const p = (X[0] ?? new Float64Array(0)).length; + const rng = this._rng(); + const scale = Math.sqrt(2 * this.gamma); + this.randomWeights_ = Array.from({ length: this.nComponents }, () => { + const w = new Float64Array(p); + for (let j = 0; j < p; j++) w[j] = this._randn(rng) * scale; + return w; + }); + this.randomOffset_ = new Float64Array(this.nComponents); + for (let i = 0; i < this.nComponents; i++) { + this.randomOffset_[i] = rng() * 2 * Math.PI; + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.randomWeights_ === null || this.randomOffset_ === null) { + throw new NotFittedError(); + } + const scale = Math.sqrt(2 / this.nComponents); + return X.map((xi) => { + const out = new Float64Array(this.nComponents); + for (let i = 0; i < this.nComponents; i++) { + const w = this.randomWeights_![i] ?? new Float64Array(0); + let dot = 0; + for (let j = 0; j < xi.length; j++) dot += (xi[j] ?? 0) * (w[j] ?? 0); + out[i] = scale * Math.cos(dot + (this.randomOffset_![i] ?? 
0)); + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} + +/** + * Approximate a kernel map using a subset of the training data (Nystroem method). + * Mirrors sklearn.kernel_approximation.Nystroem. + */ +export class Nystroem { + kernel: "rbf" | "polynomial" | "linear"; + gamma: number; + coef0: number; + degree: number; + nComponents: number; + randomState: number; + + components_: Float64Array[] | null = null; + normalizationMatrix_: Float64Array[] | null = null; + + constructor( + options: { + kernel?: "rbf" | "polynomial" | "linear"; + gamma?: number; + coef0?: number; + degree?: number; + nComponents?: number; + randomState?: number; + } = {}, + ) { + this.kernel = options.kernel ?? "rbf"; + this.gamma = options.gamma ?? 1.0; + this.coef0 = options.coef0 ?? 1.0; + this.degree = options.degree ?? 3; + this.nComponents = options.nComponents ?? 100; + this.randomState = options.randomState ?? 42; + } + + private _kernelFunc(a: Float64Array, b: Float64Array): number { + const p = a.length; + if (this.kernel === "rbf") { + let dist = 0; + for (let j = 0; j < p; j++) dist += ((a[j] ?? 0) - (b[j] ?? 0)) ** 2; + return Math.exp(-this.gamma * dist); + } + if (this.kernel === "polynomial") { + let dot = 0; + for (let j = 0; j < p; j++) dot += (a[j] ?? 0) * (b[j] ?? 0); + return (this.gamma * dot + this.coef0) ** this.degree; + } + let dot = 0; + for (let j = 0; j < p; j++) dot += (a[j] ?? 0) * (b[j] ?? 0); + return dot; + } + + private _choleskyInverse(K: Float64Array[]): Float64Array[] { + const n = K.length; + const L = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = 0; j <= i; j++) { + let s = K[i]![j] ?? 0; + for (let k = 0; k < j; k++) s -= (L[i]![k] ?? 0) * (L[j]![k] ?? 0); + if (i === j) { + L[i]![j] = Math.sqrt(Math.max(s, 1e-12)); + } else { + L[i]![j] = s / ((L[j]![j] ?? 
1e-12) || 1e-12); + } + } + // Invert L + const Linv = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + Linv[i]![i] = 1 / ((L[i]![i] ?? 1e-12) || 1e-12); + for (let j = i - 1; j >= 0; j--) { + let s = 0; + for (let k = j + 1; k <= i; k++) s += (L[i]![k] ?? 0) * (Linv[k]![j] ?? 0); + Linv[i]![j] = -s / ((L[i]![i] ?? 1e-12) || 1e-12); + } + } + // K^{-1} = (L L^T)^{-1} = Linv^T Linv + const out = Array.from({ length: n }, () => new Float64Array(n)); + for (let i = 0; i < n; i++) { + for (let j = 0; j < n; j++) { + let s = 0; + for (let k = 0; k < n; k++) s += (Linv[k]![i] ?? 0) * (Linv[k]![j] ?? 0); + out[i]![j] = s; + } + } + return out; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const m = Math.min(this.nComponents, n); + // Random subsample + let seed = this.randomState; + const indices: number[] = []; + const used = new Set<number>(); + for (let i = 0; i < m; i++) { + seed = (seed * 1664525 + 1013904223) & 0xffffffff; + let idx = ((seed >>> 0) % n); + let tries = 0; + while (used.has(idx) && tries < n) { idx = (idx + 1) % n; tries++; } + used.add(idx); + indices.push(idx); + } + this.components_ = indices.map((i) => X[i] ?? new Float64Array(0)); + // Compute kernel matrix K_mm + const Kmm = Array.from({ length: m }, () => new Float64Array(m)); + for (let i = 0; i < m; i++) { + for (let j = 0; j < m; j++) { + Kmm[i]![j] = this._kernelFunc( + this.components_![i] ?? new Float64Array(0), + this.components_![j] ?? new Float64Array(0), + ); + } + } + this.normalizationMatrix_ = this._choleskyInverse(Kmm); + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.components_ === null || this.normalizationMatrix_ === null) { + throw new NotFittedError(); + } + const m = this.components_.length; + return X.map((xi) => { + const kv = new Float64Array(m); + for (let j = 0; j < m; j++) { + kv[j] = this._kernelFunc(xi, this.components_![j] ??
new Float64Array(0)); + } + // out = kv @ normalizationMatrix_ + const out = new Float64Array(m); + for (let j = 0; j < m; j++) { + let s = 0; + for (let k = 0; k < m; k++) s += (kv[k] ?? 0) * (this.normalizationMatrix_![k]![j] ?? 0); + out[j] = s; + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} + +/** + * Approximate feature map for additive chi2 kernel. + * Mirrors sklearn.kernel_approximation.AdditiveChi2Sampler. + */ +export class AdditiveChi2Sampler { + sampleSteps: number; + sampleInterval: number | null; + + sampleInterval_: number | null = null; + + constructor( + options: { sampleSteps?: number; sampleInterval?: number | null } = {}, + ) { + this.sampleSteps = options.sampleSteps ?? 2; + this.sampleInterval = options.sampleInterval ?? null; + } + + fit(X: Float64Array[]): this { + this.sampleInterval_ = this.sampleInterval ?? 0.4; + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.sampleInterval_ === null) throw new NotFittedError(); + const p = (X[0] ?? new Float64Array(0)).length; + const steps = this.sampleSteps; + const interval = this.sampleInterval_; + const outDim = p * (2 * steps + 1); + return X.map((xi) => { + const out = new Float64Array(outDim); + for (let j = 0; j < p; j++) { + const x = xi[j] ?? 
0; + const sqrtX = Math.sqrt(x + 1e-12); + out[j] = sqrtX; + for (let s = 1; s <= steps; s++) { + const c = Math.sqrt(2 * Math.exp(-Math.PI * s * interval)); + const cos = c * sqrtX * Math.cos(s * Math.log(x + 1e-12)); + const sin = c * sqrtX * Math.sin(s * Math.log(x + 1e-12)); + out[j + p * (2 * s - 1)] = cos; + out[j + p * (2 * s)] = sin; + } + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} diff --git a/src/preprocessing/index.ts b/src/preprocessing/index.ts index c176faa..080eeff 100644 --- a/src/preprocessing/index.ts +++ b/src/preprocessing/index.ts @@ -5,3 +5,4 @@ export * from "./normalizer.js"; export * from "./polynomial_features.js"; export * from "./encoders.js"; export * from "./robust_scaler.js"; +export * from "./power_transformer.js"; diff --git a/src/preprocessing/power_transformer.ts b/src/preprocessing/power_transformer.ts new file mode 100644 index 0000000..3889778 --- /dev/null +++ b/src/preprocessing/power_transformer.ts @@ -0,0 +1,343 @@ +/** + * Additional preprocessing transformers: PowerTransformer, QuantileTransformer, + * Binarizer, FunctionTransformer, KBinsDiscretizer. + * Mirrors sklearn.preprocessing. + */ + +import { NotFittedError } from "../exceptions.js"; + +/** + * Apply a power transform to make data more Gaussian-like. + * Supports Box-Cox and Yeo-Johnson methods. + * Mirrors sklearn.preprocessing.PowerTransformer. + */ +export class PowerTransformer { + method: "yeo-johnson" | "box-cox"; + standardize: boolean; + + lambdas_: Float64Array | null = null; + means_: Float64Array | null = null; + stds_: Float64Array | null = null; + + constructor( + options: { method?: "yeo-johnson" | "box-cox"; standardize?: boolean } = {}, + ) { + this.method = options.method ?? "yeo-johnson"; + this.standardize = options.standardize ?? 
true; + } + + private _yeojohnson(x: number, lam: number): number { + if (x >= 0) { + if (Math.abs(lam) < 1e-10) return Math.log(x + 1); + return ((x + 1) ** lam - 1) / lam; + } + if (Math.abs(lam - 2) < 1e-10) return -Math.log(-x + 1); + return -((-x + 1) ** (2 - lam) - 1) / (2 - lam); + } + + private _boxcox(x: number, lam: number): number { + if (x <= 0) throw new Error("Box-Cox requires positive data"); + if (Math.abs(lam) < 1e-10) return Math.log(x); + return (x ** lam - 1) / lam; + } + + private _optimalLambda(col: Float64Array): number { + // Grid search for lambda that maximizes log-likelihood (simplified) + const lambdas = [-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]; + let bestLam = 0; + let bestScore = -Infinity; + for (const lam of lambdas) { + try { + const transformed = Float64Array.from(col, (x) => + this.method === "box-cox" ? this._boxcox(x, lam) : this._yeojohnson(x, lam), + ); + let mean = 0; + for (let i = 0; i < transformed.length; i++) mean += transformed[i] ?? 0; + mean /= transformed.length; + let variance = 0; + for (let i = 0; i < transformed.length; i++) { + variance += ((transformed[i] ?? 0) - mean) ** 2; + } + variance /= transformed.length; + // Log-likelihood proxy: -variance + const score = -(variance || 1e-15); + if (score > bestScore) { bestScore = score; bestLam = lam; } + } catch { /* skip */ } + } + return bestLam; + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? new Float64Array(0)).length; + this.lambdas_ = new Float64Array(p); + this.means_ = new Float64Array(p); + this.stds_ = new Float64Array(p); + + for (let j = 0; j < p; j++) { + const col = Float64Array.from({ length: n }, (_, i) => X[i]![j] ?? 0); + this.lambdas_[j] = this._optimalLambda(col); + if (this.standardize) { + const lam = this.lambdas_[j] ?? 0; + const t = Float64Array.from(col, (x) => + this.method === "box-cox" ? this._boxcox(x, lam) : this._yeojohnson(x, lam), + ); + let mean = 0; + for (let i = 0; i < n; i++) mean += t[i] ?? 
0; + mean /= n; + let variance = 0; + for (let i = 0; i < n; i++) variance += ((t[i] ?? 0) - mean) ** 2; + variance /= n; + this.means_[j] = mean; + this.stds_[j] = Math.sqrt(variance) || 1; + } + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.lambdas_ === null) throw new NotFittedError(); + const p = this.lambdas_.length; + return X.map((xi) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) { + const lam = this.lambdas_![j] ?? 0; + let val = this.method === "box-cox" + ? this._boxcox(xi[j] ?? 0, lam) + : this._yeojohnson(xi[j] ?? 0, lam); + if (this.standardize) { + val = (val - (this.means_![j] ?? 0)) / ((this.stds_![j] ?? 1) || 1); + } + out[j] = val; + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } + + inverseTransform(X: Float64Array[]): Float64Array[] { + if (this.lambdas_ === null) throw new NotFittedError(); + const p = this.lambdas_.length; + return X.map((xi) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) { + let val = xi[j] ?? 0; + if (this.standardize) { + val = val * ((this.stds_![j] ?? 1) || 1) + (this.means_![j] ?? 0); + } + const lam = this.lambdas_![j] ?? 0; + if (this.method === "yeo-johnson") { + out[j] = this._invYeoJohnson(val, lam); + } else { + out[j] = this._invBoxCox(val, lam); + } + } + return out; + }); + } + + private _invYeoJohnson(y: number, lam: number): number { + if (y >= 0) { + if (Math.abs(lam) < 1e-10) return Math.exp(y) - 1; + return (y * lam + 1) ** (1 / lam) - 1; + } + if (Math.abs(lam - 2) < 1e-10) return 1 - Math.exp(-y); + return 1 - (-(2 - lam) * y + 1) ** (1 / (2 - lam)); + } + + private _invBoxCox(y: number, lam: number): number { + if (Math.abs(lam) < 1e-10) return Math.exp(y); + return (y * lam + 1) ** (1 / lam); + } +} + +/** + * Transform features using quantile information (maps to uniform or normal distribution). + * Mirrors sklearn.preprocessing.QuantileTransformer. 
+ */ +export class QuantileTransformer { + nQuantiles: number; + outputDistribution: "uniform" | "normal"; + subsample: number; + + quantiles_: Float64Array[] | null = null; + referenceQuantiles_: Float64Array | null = null; + + constructor( + options: { + nQuantiles?: number; + outputDistribution?: "uniform" | "normal"; + subsample?: number; + } = {}, + ) { + this.nQuantiles = options.nQuantiles ?? 1000; + this.outputDistribution = options.outputDistribution ?? "uniform"; + this.subsample = options.subsample ?? 100000; + } + + private _normalPPF(p: number): number { + // Approximation of normal PPF (probit) + const a = [ + -3.969683028665376e1, 2.209460984245205e2, -2.759285104469687e2, + 1.38357751867269e2, -3.066479806614716e1, 2.506628277459239, + ]; + const b = [ + -5.447609879822406e1, 1.615858368580409e2, -1.556989798598866e2, + 6.680131188771972e1, -1.328068155288572e1, + ]; + const c = [ + -7.784894002430293e-3, -3.223964580411365e-1, -2.400758277161838, + -2.549732539343734, 4.374664141464968, 2.938163982698783, + ]; + const d = [ + 7.784695709041462e-3, 3.224671290700398e-1, 2.445134137142996, + 3.754408661907416, + ]; + const pLow = 0.02425; + const pHigh = 1 - pLow; + if (p < pLow) { + const q = Math.sqrt(-2 * Math.log(p)); + return (((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q + c[5]!) / + ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1); + } + if (p <= pHigh) { + const q = p - 0.5; + const r = q * q; + return (((((a[0]! * r + a[1]!) * r + a[2]!) * r + a[3]!) * r + a[4]!) * r + a[5]!) * q / + (((((b[0]! * r + b[1]!) * r + b[2]!) * r + b[3]!) * r + b[4]!) * r + 1); + } + const q = Math.sqrt(-2 * Math.log(1 - p)); + return -(((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q + c[5]!) / + ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1); + } + + fit(X: Float64Array[]): this { + const n = X.length; + const p = (X[0] ?? 
new Float64Array(0)).length; + const nQ = Math.min(this.nQuantiles, n); + this.referenceQuantiles_ = Float64Array.from({ length: nQ }, (_, i) => i / (nQ - 1)); + this.quantiles_ = []; + for (let j = 0; j < p; j++) { + const col = Array.from({ length: n }, (_, i) => X[i]![j] ?? 0).sort((a, b) => a - b); + const quants = new Float64Array(nQ); + for (let q = 0; q < nQ; q++) { + const pos = (q / (nQ - 1)) * (n - 1); + const lo = Math.floor(pos); + const hi = Math.min(lo + 1, n - 1); + const frac = pos - lo; + quants[q] = (col[lo] ?? 0) * (1 - frac) + (col[hi] ?? 0) * frac; + } + this.quantiles_[j] = quants; + } + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.quantiles_ === null || this.referenceQuantiles_ === null) { + throw new NotFittedError(); + } + const p = this.quantiles_.length; + const nQ = this.referenceQuantiles_.length; + return X.map((xi) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) { + const val = xi[j] ?? 0; + const quants = this.quantiles_![j] ?? new Float64Array(0); + // Binary search for val in quants + let lo = 0; + let hi = nQ - 1; + while (lo < hi) { + const mid = (lo + hi) >> 1; + if ((quants[mid] ?? 0) < val) lo = mid + 1; + else hi = mid; + } + let u = lo / (nQ - 1); + if (lo > 0 && lo < nQ) { + const qlo = quants[lo - 1] ?? 0; + const qhi = quants[lo] ?? 0; + const range = qhi - qlo; + if (range > 1e-15) u = (lo - 1 + (val - qlo) / range) / (nQ - 1); + } + u = Math.max(1e-7, Math.min(1 - 1e-7, u)); + out[j] = this.outputDistribution === "normal" ? this._normalPPF(u) : u; + } + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} + +/** + * Binarize data (set feature values to 0 or 1) according to a threshold. + * Mirrors sklearn.preprocessing.Binarizer. + */ +export class Binarizer { + threshold: number; + + constructor(options: { threshold?: number } = {}) { + this.threshold = options.threshold ?? 
0.0; + } + + fit(_X: Float64Array[]): this { + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + const p = (X[0] ?? new Float64Array(0)).length; + return X.map((xi) => { + const out = new Float64Array(p); + for (let j = 0; j < p; j++) out[j] = (xi[j] ?? 0) > this.threshold ? 1 : 0; + return out; + }); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} + +/** + * Constructs a transformer from an arbitrary callable. + * Mirrors sklearn.preprocessing.FunctionTransformer. + */ +export class FunctionTransformer { + func: ((X: Float64Array[]) => Float64Array[]) | null; + inverseFunc: ((X: Float64Array[]) => Float64Array[]) | null; + validate: boolean; + + constructor( + options: { + func?: ((X: Float64Array[]) => Float64Array[]) | null; + inverseFunc?: ((X: Float64Array[]) => Float64Array[]) | null; + validate?: boolean; + } = {}, + ) { + this.func = options.func ?? null; + this.inverseFunc = options.inverseFunc ?? null; + this.validate = options.validate ?? 
false; + } + + fit(_X: Float64Array[]): this { + return this; + } + + transform(X: Float64Array[]): Float64Array[] { + if (this.func === null) return X.map((xi) => xi.slice()); + return this.func(X); + } + + inverseTransform(X: Float64Array[]): Float64Array[] { + if (this.inverseFunc === null) return X.map((xi) => xi.slice()); + return this.inverseFunc(X); + } + + fitTransform(X: Float64Array[]): Float64Array[] { + return this.fit(X).transform(X); + } +} diff --git a/tests/new_modules.test.ts b/tests/new_modules.test.ts new file mode 100644 index 0000000..1347a04 --- /dev/null +++ b/tests/new_modules.test.ts @@ -0,0 +1,430 @@ +import { describe, expect, it } from "bun:test"; +import { CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer } from "../src/feature_extraction/text.ts"; +import { RBFSampler, Nystroem, AdditiveChi2Sampler } from "../src/kernel_approximation/rbf_sampler.ts"; +import { EmpiricalCovariance, ShrunkCovariance, LedoitWolf } from "../src/covariance/covariance.ts"; +import { PLSRegression, PLSSVD } from "../src/cross_decomposition/pls.ts"; +import { PowerTransformer, QuantileTransformer, Binarizer, FunctionTransformer } from "../src/preprocessing/power_transformer.ts"; +import { IncrementalPCA, KernelPCA, FactorAnalysis } from "../src/decomposition/advanced.ts"; + +const DOCS = [ + "the cat sat on the mat", + "the dog sat on the log", + "cats and dogs are pets", + "i love my cat and my dog", +]; + +describe("CountVectorizer", () => { + it("fits and transforms documents", () => { + const cv = new CountVectorizer({ minDf: 1, maxFeatures: 10 }); + const X = cv.fitTransform(DOCS); + expect(X.length).toBe(DOCS.length); + const features = cv.getFeatureNames(); + expect(features.length).toBeGreaterThan(0); + // 'the' should appear in most docs + const theIdx = features.indexOf("the"); + if (theIdx >= 0) { + expect((X[0]![theIdx] ?? 
0)).toBeGreaterThan(0); + } + }); + + it("respects minDf filter", () => { + const cv = new CountVectorizer({ minDf: 3 }); + cv.fit(DOCS); + const features = cv.getFeatureNames(); + // Only terms appearing in >= 3 docs + expect(features.length).toBeGreaterThan(0); + for (const f of features) { + const count = DOCS.filter((d) => d.includes(f)).length; + expect(count).toBeGreaterThanOrEqual(3); + } + }); + + it("throws NotFittedError before fit", () => { + const cv = new CountVectorizer(); + expect(() => cv.transform(DOCS)).toThrow(); + }); +}); + +describe("TfidfTransformer", () => { + it("transforms count matrix to TF-IDF", () => { + const cv = new CountVectorizer(); + const counts = cv.fitTransform(DOCS); + const tfidf = new TfidfTransformer(); + const X = tfidf.fitTransform(counts); + expect(X.length).toBe(DOCS.length); + // After L2 norm, each row should have approximately unit length + for (const row of X) { + const norm = Math.sqrt(Array.from(row).reduce((s, x) => s + x * x, 0)); + if (norm > 0) expect(Math.abs(norm - 1)).toBeLessThan(1e-10); + } + }); +}); + +describe("TfidfVectorizer", () => { + it("combines CountVectorizer and TfidfTransformer", () => { + const tv = new TfidfVectorizer({ minDf: 1 }); + const X = tv.fitTransform(DOCS); + expect(X.length).toBe(DOCS.length); + const features = tv.getFeatureNames(); + expect(features.length).toBeGreaterThan(0); + }); +}); + +describe("HashingVectorizer", () => { + it("transforms documents without fitting", () => { + const hv = new HashingVectorizer({ nFeatures: 256 }); + const X = hv.transform(DOCS); + expect(X.length).toBe(DOCS.length); + expect(X[0]!.length).toBe(256); + // Non-empty documents should have non-zero features + const total = Array.from(X[0]!).reduce((s, x) => s + Math.abs(x), 0); + expect(total).toBeGreaterThan(0); + }); +}); + +describe("RBFSampler", () => { + const X = [ + new Float64Array([1, 0]), + new Float64Array([0, 1]), + new Float64Array([1, 1]), + new Float64Array([0, 0]), + ]; + + 
it("transforms to correct dimension", () => { + const rbf = new RBFSampler({ nComponents: 10, gamma: 1.0 }); + const Xt = rbf.fitTransform(X); + expect(Xt.length).toBe(4); + expect(Xt[0]!.length).toBe(10); + }); + + it("throws before fitting", () => { + const rbf = new RBFSampler(); + expect(() => rbf.transform(X)).toThrow(); + }); +}); + +describe("Nystroem", () => { + const X = [ + new Float64Array([1, 0]), + new Float64Array([0, 1]), + new Float64Array([1, 1]), + new Float64Array([0, 0]), + new Float64Array([0.5, 0.5]), + ]; + + it("transforms with rbf kernel", () => { + const ny = new Nystroem({ kernel: "rbf", nComponents: 3 }); + const Xt = ny.fitTransform(X); + expect(Xt.length).toBe(5); + expect(Xt[0]!.length).toBe(3); + }); + + it("transforms with linear kernel", () => { + const ny = new Nystroem({ kernel: "linear", nComponents: 3 }); + const Xt = ny.fitTransform(X); + expect(Xt.length).toBe(5); + }); +}); + +describe("AdditiveChi2Sampler", () => { + const X = [ + new Float64Array([0.5, 0.3]), + new Float64Array([0.2, 0.8]), + ]; + + it("transforms to higher dimension", () => { + const sampler = new AdditiveChi2Sampler({ sampleSteps: 2 }); + const Xt = sampler.fitTransform(X); + expect(Xt.length).toBe(2); + expect(Xt[0]!.length).toBe(2 * (2 * 2 + 1)); // p * (2 * steps + 1) + }); +}); + +describe("EmpiricalCovariance", () => { + const X = [ + new Float64Array([1, 2]), + new Float64Array([2, 3]), + new Float64Array([3, 4]), + new Float64Array([4, 5]), + new Float64Array([5, 6]), + ]; + + it("computes covariance matrix", () => { + const ec = new EmpiricalCovariance(); + ec.fit(X); + expect(ec.covariance_).toBeDefined(); + expect(ec.location_).toBeDefined(); + expect((ec.location_![0] ?? 0)).toBeCloseTo(3, 5); + expect((ec.location_![1] ?? 
0)).toBeCloseTo(4, 5); + }); + + it("computes mahalanobis distances", () => { + const ec = new EmpiricalCovariance(); + ec.fit(X); + const dists = ec.mahalanobis(X); + expect(dists.length).toBe(5); + for (let i = 0; i < 5; i++) expect(dists[i] ?? 0).toBeGreaterThanOrEqual(0); + }); +}); + +describe("ShrunkCovariance", () => { + const X = [ + new Float64Array([1, 2, 3]), + new Float64Array([2, 3, 4]), + new Float64Array([3, 4, 5]), + new Float64Array([4, 5, 6]), + ]; + + it("applies shrinkage to off-diagonal", () => { + const sc = new ShrunkCovariance({ shrinkage: 0.5 }); + sc.fit(X); + expect(sc.covariance_).toBeDefined(); + const emp = new EmpiricalCovariance(); + emp.fit(X); + // Off-diagonal elements should be smaller + const off01_sc = Math.abs(sc.covariance_![0]![1] ?? 0); + const off01_emp = Math.abs(emp.covariance_![0]![1] ?? 0); + expect(off01_sc).toBeLessThanOrEqual(off01_emp + 1e-10); + }); +}); + +describe("LedoitWolf", () => { + const X = [ + new Float64Array([1, 2]), + new Float64Array([2, 3]), + new Float64Array([3, 2]), + new Float64Array([1, 3]), + new Float64Array([2, 1]), + ]; + + it("fits and returns a covariance matrix", () => { + const lw = new LedoitWolf(); + lw.fit(X); + expect(lw.covariance_).toBeDefined(); + expect(lw.shrinkage_).toBeDefined(); + expect(lw.shrinkage_!).toBeGreaterThanOrEqual(0); + }); +}); + +describe("PLSRegression", () => { + const X = [ + new Float64Array([1, 2]), + new Float64Array([2, 3]), + new Float64Array([3, 4]), + new Float64Array([4, 5]), + new Float64Array([5, 6]), + ]; + const Y = [ + new Float64Array([1]), + new Float64Array([2]), + new Float64Array([3]), + new Float64Array([4]), + new Float64Array([5]), + ]; + + it("fits and predicts", () => { + const pls = new PLSRegression({ nComponents: 1 }); + pls.fit(X, Y); + const pred = pls.predict(X); + expect(pred.length).toBe(5); + // Should predict something close to the actual Y (linear relationship) + for (let i = 0; i < 5; i++) { + expect(Math.abs((pred[i]![0] 
?? 0) - (Y[i]![0] ?? 0))).toBeLessThan(1); + } + }); + + it("transforms to latent space", () => { + const pls = new PLSRegression({ nComponents: 2 }); + pls.fit(X, Y); + const Xt = pls.transform(X); + expect(Xt.length).toBe(5); + expect(Xt[0]!.length).toBe(2); + }); + + it("throws before fitting", () => { + const pls = new PLSRegression(); + expect(() => pls.predict(X)).toThrow(); + }); +}); + +describe("PLSSVD", () => { + const X = [ + new Float64Array([1, 2]), + new Float64Array([2, 3]), + new Float64Array([3, 4]), + new Float64Array([4, 5]), + ]; + const Y = [ + new Float64Array([1, 0]), + new Float64Array([2, 1]), + new Float64Array([3, 2]), + new Float64Array([4, 3]), + ]; + + it("extracts latent components", () => { + const plssvd = new PLSSVD({ nComponents: 2 }); + const [xScores, yScores] = plssvd.fitTransform(X, Y); + expect(xScores.length).toBe(4); + expect(xScores[0]!.length).toBe(2); + expect(yScores.length).toBe(4); + }); +}); + +describe("PowerTransformer", () => { + const X = [ + new Float64Array([1, 2]), + new Float64Array([4, 8]), + new Float64Array([16, 32]), + new Float64Array([64, 128]), + ]; + + it("yeo-johnson transform", () => { + const pt = new PowerTransformer({ method: "yeo-johnson", standardize: true }); + const Xt = pt.fitTransform(X); + expect(Xt.length).toBe(4); + expect(Xt[0]!.length).toBe(2); + // Standardized output should be roughly centered + let sum0 = 0; + for (const row of Xt) sum0 += row[0] ?? 0; + expect(Math.abs(sum0 / 4)).toBeLessThan(5); // rough check + }); +}); + +describe("QuantileTransformer", () => { + const X = Array.from({ length: 20 }, (_, i) => + new Float64Array([i + 1, 20 - i]), + ); + + it("uniform output", () => { + const qt = new QuantileTransformer({ + nQuantiles: 10, + outputDistribution: "uniform", + }); + const Xt = qt.fitTransform(X); + expect(Xt.length).toBe(20); + for (const row of Xt) { + expect(row[0] ?? 0).toBeGreaterThanOrEqual(-1e-6); + expect(row[0] ?? 
0).toBeLessThanOrEqual(1 + 1e-6); + } + }); + + it("normal output", () => { + const qt = new QuantileTransformer({ + nQuantiles: 10, + outputDistribution: "normal", + }); + const Xt = qt.fitTransform(X); + expect(Xt.length).toBe(20); + }); +}); + +describe("Binarizer", () => { + const X = [ + new Float64Array([0.5, 1.5, -0.5]), + new Float64Array([0.0, 2.0, 1.0]), + ]; + + it("binarizes with threshold 0", () => { + const b = new Binarizer({ threshold: 0 }); + const Xt = b.transform(X); + expect(Xt[0]![0]).toBe(1); + expect(Xt[0]![1]).toBe(1); + expect(Xt[0]![2]).toBe(0); + }); + + it("binarizes with threshold 1", () => { + const b = new Binarizer({ threshold: 1 }); + const Xt = b.transform(X); + expect(Xt[0]![0]).toBe(0); + expect(Xt[0]![1]).toBe(1); + expect(Xt[1]![1]).toBe(1); + }); +}); + +describe("FunctionTransformer", () => { + const X = [ + new Float64Array([1, 4]), + new Float64Array([9, 16]), + ]; + + it("applies custom function", () => { + const ft = new FunctionTransformer({ + func: (X) => X.map((xi) => Float64Array.from(xi, Math.sqrt)), + }); + const Xt = ft.fitTransform(X); + expect(Math.abs((Xt[0]![0] ?? 0) - 1)).toBeLessThan(1e-10); + expect(Math.abs((Xt[0]![1] ?? 0) - 2)).toBeLessThan(1e-10); + expect(Math.abs((Xt[1]![0] ?? 
0) - 3)).toBeLessThan(1e-10); + }); + + it("identity when no func", () => { + const ft = new FunctionTransformer(); + const Xt = ft.transform(X); + expect(Xt[0]![0]).toBe(1); + }); +}); + +describe("IncrementalPCA", () => { + const X = Array.from({ length: 20 }, (_, i) => + new Float64Array([i, i * 2, i * 3]), + ); + + it("fits and transforms", () => { + const ipca = new IncrementalPCA({ nComponents: 2, batchSize: 5 }); + const Xt = ipca.fitTransform(X); + expect(Xt.length).toBe(20); + expect(Xt[0]!.length).toBe(2); + }); + + it("partialFit accumulates samples", () => { + const ipca = new IncrementalPCA({ nComponents: 2 }); + ipca.partialFit(X.slice(0, 10)); + ipca.partialFit(X.slice(10, 20)); + expect(ipca.nSamplesSeen_).toBe(20); + }); +}); + +describe("KernelPCA", () => { + const X = [ + new Float64Array([0, 0]), + new Float64Array([1, 0]), + new Float64Array([0, 1]), + new Float64Array([1, 1]), + new Float64Array([0.5, 0.5]), + ]; + + it("rbf kernel projection", () => { + const kpca = new KernelPCA({ nComponents: 2, kernel: "rbf", gamma: 1 }); + const Xt = kpca.fitTransform(X); + expect(Xt.length).toBe(5); + expect(Xt[0]!.length).toBe(2); + }); + + it("polynomial kernel", () => { + const kpca = new KernelPCA({ nComponents: 2, kernel: "poly" }); + const Xt = kpca.fitTransform(X); + expect(Xt.length).toBe(5); + }); +}); + +describe("FactorAnalysis", () => { + const X = Array.from({ length: 15 }, (_, i) => + new Float64Array([Math.sin(i), Math.cos(i), i * 0.1]), + ); + + it("extracts factors", () => { + const fa = new FactorAnalysis({ nComponents: 2, maxIter: 20 }); + const Xt = fa.fitTransform(X); + expect(Xt.length).toBe(15); + expect(Xt[0]!.length).toBe(2); + }); + + it("noise variance is positive", () => { + const fa = new FactorAnalysis({ nComponents: 1, maxIter: 10 }); + fa.fit(X); + expect(fa.noiseVariance_).toBeDefined(); + for (let i = 0; i < 3; i++) { + expect(fa.noiseVariance_![i] ?? 0).toBeGreaterThan(0); + } + }); +});