36 changes: 18 additions & 18 deletions doc/PCA.rst
@@ -1,56 +1,56 @@
:digest: Dimensionality Reduction with Principal Component Analysis
:digest: Principal Component Analysis
:species: data
:sc-categories: Dimensionality Reduction, Data Processing
:sc-related: Classes/FluidMDS, Classes/FluidDataSet
:see-also:
:description:
Principal Components Analysis of a :fluid-obj:`DataSet`
Principal Component Analysis (PCA) of a :fluid-obj:`DataSet`.

:discussion:

https://scikit-learn.org/stable/modules/decomposition.html#principal-component-analysis-pca
PCA is fitted to a DataSet to determine its principal components, each of which is a new axis through the data that maximises the variance, or “differences”, within the data. PCA can then transform the original DataSet, or individual points, to position them in relation to the principal components (i.e., the “new axes”), making it easier to compare how points differ from others in the DataSet. PCA is often used for dimensionality reduction and is also useful for removing redundancy (i.e., correlation) and/or noise (i.e., dimensions that are uniformly distributed) from a DataSet.
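
For example, a minimal sketch of the typical workflow (assuming a booted server ``s`` and an existing 13-dimensional :fluid-obj:`DataSet` ``~ds``)::

  ~pca = FluidPCA(s,2);        // keep 2 principal components
  ~reduced = FluidDataSet(s);  // destination for the reduced data
  ~pca.fitTransform(~ds,~reduced,action:{ |variance|
      ("fraction of explained variance: " ++ variance).postln;
  });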

:control numDimensions:

The number of dimensions to reduce to

The number of dimensions (principal components) to keep after a ``transform``, using PCA for dimensionality reduction.

:message fit:

:arg dataSet: A :fluid-obj:`DataSet` to analyse

:arg action: Run when done

Train this model on a :fluid-obj:`DataSet` but don't transform the data
Train this model on a :fluid-obj:`DataSet` to determine the principal components, but don't transform any data.

:message transform:

:arg sourceDataSet: Source data, or the DataSet name
:arg sourceDataSet: source DataSet

:arg destDataSet: Destination data, or the DataSet name
:arg destDataSet: destination DataSet

:arg action: Run when done

Given a trained model, apply the reduction to a source :fluid-obj:`DataSet` and write to a destination. Can be the same for both input and output (in-place). Returns the fraction of accounted variance, aka the fidelity of the new representation: a value near 1.0 means a higher fidelity to the original.
Given a trained model, transform a source :fluid-obj:`DataSet` into PCA space and write the result to a destination DataSet. The DataSet can be the same for both input and output (the operation is performed in-place). This process returns the fraction (between 0 and 1) of explained variance.
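
A minimal sketch of reading that fraction from the ``action`` callback (``~source`` and ``~dest`` are hypothetical DataSets; ``~pca`` is assumed to be already ``fit``)::

  ~pca.transform(~source,~dest,action:{ |variance|
      ("fraction of explained variance: " ++ variance).postln;
  });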


:message inverseTransform:

:arg sourceDataSet: Source data, or the DataSet name
:arg sourceDataSet: source DataSet

:arg destDataSet: Destination data, or the DataSet name
:arg destDataSet: destination DataSet

:arg action: Run when done

Given a trained model, invert a source :fluid-obj:`DataSet` containing dimensions that are principal components to a destination :fluid-obj:`DataSet` with the dimensionality of the data that was used to ``fit``. :fluid-obj:`DataSet` can be the same for both input and output (the operation will be performed in-place).
Given a trained model, invert a source :fluid-obj:`DataSet` whose ``numDimensions`` dimensions are in PCA space back to a destination :fluid-obj:`DataSet` with the dimensionality of the data that was used to ``fit``. The :fluid-obj:`DataSet` can be the same for both input and output (the operation will be performed in-place).
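
For example, a round trip back from PCA space (a sketch, reusing ``~pca`` and ``~reduced`` from the sketch above)::

  ~restored = FluidDataSet(s);
  ~pca.inverseTransform(~reduced,~restored,action:{
      "done: ~restored now approximates the data ~pca was fit on".postln;
  });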

:message fitTransform:

:arg sourceDataSet: Source data, or the DataSet name
:arg sourceDataSet: source DataSet

:arg destDataSet: Destination data, or the DataSet name
:arg destDataSet: destination DataSet

:arg action: Run when done

:fluid-obj:`PCA#fit` and :fluid-obj:`PCA#transform` in a single pass. Returns the fraction of accounted variance, aka the fidelity of the new representation: a value near 1.0 means a higher fidelity to the original.
:fluid-obj:`PCA#fit` and :fluid-obj:`PCA#transform` in a single pass. Returns the fraction (between 0 and 1) of explained variance.

:message transformPoint:

@@ -60,7 +60,7 @@

:arg action: Run when done

Given a trained model, transform the data point in ``sourceBuffer`` from the original dimensional space to ``numDimensions`` principal components and write into ``destBuffer``.
Given a trained model, transform the data point in ``sourceBuffer`` from the original dimensional space to ``numDimensions`` dimensions in PCA space and write it into ``destBuffer``.
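
A minimal sketch (assuming ``~pca`` was ``fit`` on 13-dimensional data with a ``numDimensions`` of 2)::

  ~inbuf = Buffer.loadCollection(s,0.5.dup(13)); // one arbitrary 13-dimensional point
  ~outbuf = Buffer(s);
  ~pca.transformPoint(~inbuf,~outbuf,action:{ |outbuf|
      outbuf.getn(0,2,{ |vals| vals.postln; }); // the point in PCA space
  });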

:message inverseTransformPoint:

@@ -70,4 +70,4 @@

:arg action: Run when done

Given a trained model, transform the data point in ``sourceBuffer`` from being ``numDimensions`` principal components into the original dimensional space and write into ```destBuffer``.
Given a trained model, transform the data point in ``sourceBuffer`` from ``numDimensions`` dimensions in PCA space back into the original dimensional space and write it into ``destBuffer``.
194 changes: 103 additions & 91 deletions example-code/sc/PCA.scd
@@ -1,110 +1,122 @@

strong::Dimensionality Reduction::
code::
s.reboot;
//Preliminaries: we want some audio, a couple of FluidDataSets, some Buffers, a FluidStandardize and a FluidPCA
(
~audiofile = FluidFilesPath("Tremblay-ASWINE-ScratchySynth-M.wav");
~raw = FluidDataSet(s);
~standardized = FluidDataSet(s);
~reduced = FluidDataSet(s);
~audio = Buffer.read(s,~audiofile);
~mfcc_feature = Buffer.new(s);
~stats = Buffer.alloc(s, 7, 12);
~datapoint = Buffer.alloc(s, 12);
~standardizer = FluidStandardize(s);
~pca = FluidPCA(s,2);
)

s.boot;

// Load audio and run an MFCC analysis, which gives us 13 coefficients per frame (we'll throw the 0th away)
(
~audio = Buffer.read(s,~audiofile);
FluidBufMFCC.process(s,~audio, features: ~mfcc_feature,action:{"Done MFCCs".postln});
)
~src = Buffer.read(s,FluidFilesPath("Tremblay-ASWINE-ScratchySynth-M.wav"));

// Divide the time series into 100 segments, take the mean of each segment, and add it as a point to
// the 'raw' FluidDataSet
// load MFCC analyses into a dataset
(
{
var trig = LocalIn.kr(1, 1);
var buf = LocalBuf(12, 1);
var count = PulseCount.kr(trig) - 1;
var chunkLen = (~mfcc_feature.numFrames / 100).asInteger;
var stats = FluidBufStats.kr(
source: ~mfcc_feature, startFrame: count * chunkLen,
startChan:1, numFrames: chunkLen, stats: ~stats,
trig: trig * (count < 100), blocking: 1
);
var rd = BufRd.kr(12, ~stats, DC.kr(0), 0, 1);
var bufWr, dsWr;
12.do{|i|
bufWr = BufWr.kr(rd[i], buf, DC.kr(i));
};
dsWr = FluidDataSetWr.kr(~raw, buf: buf, idNumber: count, trig: Done.kr(stats));
LocalOut.kr( Done.kr(dsWr));
FreeSelf.kr(count - 99);
Poll.kr(trig,(100 - count));
}.play;
~mfcc_feature = Buffer(s);
FluidBufMFCC.processBlocking(s,~src,startCoeff:1,features:~mfcc_feature);
~ds = FluidDataSet(s).fromBuffer(~mfcc_feature);
~ds.print;
)
// wait for the count to reach 0 in the post window.

//First standardize our DataSet, so that the MFCC dimensions are on commensurate scales
//Then apply the PCA in-place on the standardized data
//Download the DataSet contents into an array for plotting
// first standardize our DataSet, so that the MFCC dimensions are in similar ranges
// then apply the PCA in-place on the standardized data,
// reducing the number of dimensions to the default of 2
// lastly normalize it so it can be plotted in a normalized space
(
~reducedarray = Array.new(100);
~standardizer.fitTransform(~raw, ~standardized);
~pca.fitTransform(~standardized, ~reduced, action:{|x|
x.postln; //pass on the variance
~reduced.dump{|x| 100.do{|i|
~reducedarray.add(x["data"][i.asString])
}};
~stand = FluidStandardize(s).fitTransform(~ds,~ds);
~pca = FluidPCA(s).fitTransform(~ds,~ds);
~norm = FluidNormalize(s).fitTransform(~ds,~ds);
~ds.dump({
arg dict;
defer{FluidPlotter(dict:dict)};
});
)
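
// a hedged aside: the action passed to fitTransform receives the fraction of
// explained variance, useful for checking how faithful the 2D projection is
// (this sketch re-runs standardization and PCA on a fresh DataSet built from ~mfcc_feature)
(
~ds2 = FluidDataSet(s).fromBuffer(~mfcc_feature);
FluidStandardize(s).fitTransform(~ds2,~ds2);
FluidPCA(s).fitTransform(~ds2,~ds2,action:{ |variance|
	("fraction of explained variance: " ++ variance).postln;
});
)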

//Visualise the 2D projection of our original 12D data
::
strong::Server-side queries::
code::
(
d = ~reducedarray.flop.deepCollect(1, { |x| x.normalize});
w = Window("scatter", Rect(128, 64, 200, 200));
w.drawFunc = {
Pen.use {
d[0].size.do{|i|
var x = (d[0][i]*200);
var y = (d[1][i]*200);
var r = Rect(x,y,5,5);
Pen.fillColor = Color.blue;
Pen.fillOval(r);
}
}
};
w.refresh;
w.front;
)
{
var src = PlayBuf.ar(1,~src,BufRateScale.ir(~src),loop:1);
var mfccs = FluidMFCC.kr(src,startCoeff:1);
var trig = Impulse.kr(30);
var inputPoint = LocalBuf(13);
var standPoint = LocalBuf(13);
var outputPoint = LocalBuf(2);
var normPoint = LocalBuf(2);
var sig, pca1, pca2;

// transform a single point with arbitrary value
~inbuf = Buffer.loadCollection(s,0.5.dup(12));
~outbuf = Buffer.new(s);
~pca.transformPoint(~inbuf,~outbuf,{|x|x.postln;x.getn(0,1,{|y|y.postln;};)});
::
FluidKrToBuf.kr(mfccs,inputPoint);
~stand.kr(trig,inputPoint,standPoint);
~pca.kr(trig, standPoint, outputPoint,2);
~norm.kr(trig,outputPoint,normPoint);

subsection:: Server Side Queries
# pca1, pca2 = FluidBufToKr.kr(normPoint).lag(0.01).poll;

Let's map our learned PCA dimensions to the controls of a processor
sig = CombC.ar(src,0.05,[1-pca1,pca2].clip * 0.05,(1-pca1).clip * 3,-16.dbamp) + src;

sig;
}.play;
)
::
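strong::Single-point round trip::
code::
// a minimal sketch: send one arbitrary 13-dimensional point through the trained
// models and back; assumes ~stand and ~pca from "Dimensionality Reduction" above
(
~inpoint = Buffer.loadCollection(s,0.5.dup(13));
~stdpoint = Buffer(s);
~pcapoint = Buffer(s);
~backpoint = Buffer(s);
~stand.transformPoint(~inpoint,~stdpoint,action:{
	~pca.transformPoint(~stdpoint,~pcapoint,action:{
		~pca.inverseTransformPoint(~pcapoint,~backpoint,action:{
			~backpoint.getn(0,13,{ |vals| vals.postln; }); // approximates the standardized input
		});
	});
});
)
::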
strong::Whitening::
code::

// without whitening (left plot), principal component 1 (x axis) clearly has a larger variance
// than principal component 2 (y axis), as it should. with whitening, both PCs have unit variance
// (both plots have the same ranges for their axes). because of this change in relative *scale*, the
// distances used to compute the clusters will be different, and will very likely end up producing
// different clusters! (run it a few times to see the variety)

~src = Buffer.readChannel(s,FluidFilesPath("Tremblay-ASWINE-ScratchySynth-M.wav"),channels:[0]);

// load analyses into a dataset
(
{
var mapped;
var audio = BufRd.ar(1,~audio,LFSaw.ar(BufDur.ir(~audio).reciprocal).range(0, BufFrames.ir(~audio)));
var mfcc = FluidMFCC.kr(audio)[1..12];
var smoothed = LagUD.kr(mfcc,1*ControlDur.ir,500*ControlDur.ir);
var trig = Impulse.kr(ControlRate.ir / 2);
var inputPoint = LocalBuf(12);
var outputPoint = LocalBuf(2);
smoothed.collect{|coeff,i| BufWr.kr([coeff],inputPoint,i)};
~pca.kr(trig, inputPoint, outputPoint, 2);
mapped = BufRd.kr(1,outputPoint, phase:[0,1]).linlin(-3,3,0,3);
CombC.ar(audio,3,mapped[0],mapped[1]*3)
}.play;
~analysis = Buffer(s);
FluidBufSpectralShape.processBlocking(s,~src,features:~analysis);
// FluidBufMFCC.processBlocking(s,~src,startCoeff:1,features:~analysis);
~ds = FluidDataSet(s).fromBuffer(~analysis);
~ds.print;

~stand = FluidStandardize(s).fitTransform(~ds,~ds);

~ds_pca = FluidDataSet(s);
~pca = FluidPCA(s).fitTransform(~ds,~ds_pca);
~ls = FluidLabelSet(s);
FluidKMeans(s,4).fitPredict(~ds_pca,~ls);

~ds_pca_white = FluidDataSet(s);
~pca_white = FluidPCA(s,whiten:1).fitTransform(~ds,~ds_pca_white);
~ls_white = FluidLabelSet(s);
FluidKMeans(s,4).fitPredict(~ds_pca_white,~ls_white);

~norm = FluidNormalize(s).fit(~ds_pca);
~norm_white = FluidNormalize(s).fit(~ds_pca_white);

~ds_pca.dump({
arg dict;
~ds_pca_white.dump({
arg dict_white;
~ls.dump({
arg labels;
~ls_white.dump({
arg labels_white;
~norm.dump({
arg norm;
~norm_white.dump({
arg norm_white;
var min = min(norm["data_min"].minItem,norm_white["data_min"].minItem);
var max = max(norm["data_max"].maxItem,norm_white["data_max"].maxItem);

defer{
var win = Window(bounds:Rect(0,0,1000,500));
win.layout_(
HLayout(
FluidPlotter(dict:dict,xmin:min,xmax:max,ymin:min,ymax:max,standalone:false).categories_(labels),
FluidPlotter(dict:dict_white,xmin:min,xmax:max,ymin:min,ymax:max,standalone:false).categories_(labels_white)
)
);
win.front;
};
});
});
});
});
});
});
)
::