@@ -443,10 +443,24 @@ var reader = mlContext.Data.TextReader(ctx => (
443443// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
444444var trainData = reader .Read (trainDataPath );
445445
446+ // Sometimes, caching data in memory after its first access can save some loading time when the data is going to be used
447+ // several times somewhere. The caching mechanism is also lazy; it only caches data after it has been used.
448+ // Users can replace all subsequent uses of "trainData" with "cachedTrainData". We still use "trainData" because
449+ // a caching step, which provides the same caching function, will be inserted in the considered "learningPipeline."
450+ var cachedTrainData = trainData .Cache ();
451+
446452// Step two: define the learning pipeline.
447453
448454// We 'start' the pipeline with the output of the reader.
449455var learningPipeline = reader .MakeNewEstimator ()
456+ // We add a step for caching data in memory so that the downstream iterative training
457+ // algorithm can efficiently scan through the data multiple times. Otherwise, the following
458+ // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy.
459+ // The data accessed in any downstream step will be cached from its first use. In general, you only
460+ // need to add a caching step before a trainable step, because caching is not helpful if the data is
461+ // only scanned once. This step can be removed if the user doesn't have enough memory to store the whole
462+ // data set.
463+ .AppendCacheCheckpoint ()
450464 // Now we can add any 'training steps' to it. In our case we want to 'normalize' the data (rescale to be
451465 // between -1 and 1 for all examples)
452466 .Append (r => (
@@ -486,13 +500,28 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
486500// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
487501var trainData = reader .Read (trainDataPath );
488502
503+ // Sometimes, caching data in memory after its first access can save some loading time when the data is going to be used
504+ // several times somewhere. The caching mechanism is also lazy; it only caches data after it has been used.
505+ // Users can replace all subsequent uses of "trainData" with "cachedTrainData". We still use "trainData" because
506+ // a caching step, which provides the same caching function, will be inserted in the considered "dynamicPipeline."
507+ var cachedTrainData = mlContext .Data .Cache (trainData );
508+
489509// Step two: define the learning pipeline.
490510
491511// We 'start' the pipeline with the output of the reader.
492512var dynamicPipeline =
493513 // First 'normalize' the data (rescale to be
494514 // between -1 and 1 for all examples)
495515 mlContext .Transforms .Normalize (" FeatureVector" )
516+ // We add a step for caching data in memory so that the downstream iterative training
517+ // algorithm can efficiently scan through the data multiple times. Otherwise, the following
518+ // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy.
519+ // The data accessed in any downstream step will be cached from its first use. In general, you only
520+ // need to add a caching step before a trainable step, because caching is not helpful if the data is
521+ // only scanned once. This step can be removed if the user doesn't have enough memory to store the whole
522+ // data set. Notice that in the upstream Transforms.Normalize step, we only scan through the data
523+ // once, so adding a caching step before it is not helpful.
524+ .AppendCacheCheckpoint (mlContext )
496525 // Add the SDCA regression trainer.
497526 .Append (mlContext .Regression .Trainers .StochasticDualCoordinateAscent (label : " Target" , features : " FeatureVector" ));
498527
@@ -595,6 +624,13 @@ var learningPipeline = reader.MakeNewEstimator()
595624 r .Label ,
596625 // Concatenate all the features together into one column 'Features'.
597626 Features : r .SepalLength .ConcatWith (r .SepalWidth , r .PetalLength , r .PetalWidth )))
627+ // We add a step for caching data in memory so that the downstream iterative training
628+ // algorithm can efficiently scan through the data multiple times. Otherwise, the following
629+ // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy.
630+ // The data accessed in any downstream step will be cached from its first use. In general, you only
631+ // need to add a caching step before a trainable step, because caching is not helpful if the data is
632+ // only scanned once.
633+ .AppendCacheCheckpoint ()
598634 .Append (r => (
599635 r .Label ,
600636 // Train the multi-class SDCA model to predict the label using features.
@@ -640,6 +676,8 @@ var dynamicPipeline =
640676 mlContext .Transforms .Concatenate (" Features" , " SepalLength" , " SepalWidth" , " PetalLength" , " PetalWidth" )
641677 // Note that the label is text, so it needs to be converted to key.
642678 .Append (mlContext .Transforms .Categorical .MapValueToKey (" Label" ), TransformerScope .TrainTest )
679+ // Cache data in memory for steps after the cache checkpoint stage.
680+ .AppendCacheCheckpoint (mlContext )
643681 // Use the multi-class SDCA model to predict the label using features.
644682 .Append (mlContext .MulticlassClassification .Trainers .StochasticDualCoordinateAscent ())
645683 // Apply the inverse conversion from 'PredictedLabel' column back to string value.
@@ -741,6 +779,7 @@ var trainData = mlContext.CreateStreamingDataView(churnData);
741779
742780var dynamicLearningPipeline = mlContext .Transforms .Categorical .OneHotEncoding (" DemographicCategory" )
743781 .Append (mlContext .Transforms .Concatenate (" Features" , " DemographicCategory" , " LastVisits" ))
782+ .AppendCacheCheckpoint (mlContext ) // FastTree will benefit from caching data in memory.
744783 .Append (mlContext .BinaryClassification .Trainers .FastTree (" HasChurned" , " Features" , numTrees : 20 ));
745784
746785var dynamicModel = dynamicLearningPipeline .Fit (trainData );
@@ -757,6 +796,7 @@ var staticLearningPipeline = staticData.MakeNewEstimator()
757796 .Append (r => (
758797 r .HasChurned ,
759798 Features : r .DemographicCategory .OneHotEncoding ().ConcatWith (r .LastVisits )))
799+ .AppendCacheCheckpoint () // FastTree will benefit from caching data in memory.
760800 .Append (r => mlContext .BinaryClassification .Trainers .FastTree (r .HasChurned , r .Features , numTrees : 20 ));
761801
762802var staticModel = staticLearningPipeline .Fit (staticData );
@@ -813,6 +853,8 @@ var learningPipeline = reader.MakeNewEstimator()
813853 // When the normalizer is trained, the below delegate is going to be called.
814854 // We use it to memorize the scales.
815855 onFit : (scales , offsets ) => normScales = scales )))
856+ // Cache data in memory because the subsequent trainer needs to access the data multiple times.
857+ .AppendCacheCheckpoint ()
816858 .Append (r => (
817859 r .Label ,
818860 // Train the multi-class SDCA model to predict the label using features.
@@ -987,6 +1029,10 @@ var catColumns = data.GetColumn(r => r.CategoricalFeatures).Take(10).ToArray();
9871029
9881030// Build several alternative featurization pipelines.
9891031var learningPipeline = reader .MakeNewEstimator ()
1032+ // Cache data in memory in an on-demand manner. Columns used in any downstream step will be
1033+ // cached in memory on their first use. This step can be removed if the user's machine doesn't
1034+ // have enough memory.
1035+ .AppendCacheCheckpoint ()
9901036 .Append (r => (
9911037 r .Label ,
9921038 r .NumericalFeatures ,
@@ -1070,6 +1116,9 @@ var workclasses = transformedData.GetColumn<float[]>(mlContext, "WorkclassOneHot
10701116var fullLearningPipeline = dynamicPipeline
10711117 // Concatenate two of the 3 categorical pipelines, and the numeric features.
10721118 .Append (mlContext .Transforms .Concatenate (" Features" , " NumericalFeatures" , " CategoricalBag" , " WorkclassOneHotTrimmed" ))
1119+ // Cache data in memory so that the following trainer will be able to access training examples without
1120+ // reading them from disk multiple times.
1121+ .AppendCacheCheckpoint (mlContext )
10731122 // Now we're ready to train. We chose our FastTree trainer for this classification task.
10741123 .Append (mlContext .BinaryClassification .Trainers .FastTree (numTrees : 50 ));
10751124
@@ -1121,6 +1170,10 @@ var messageTexts = data.GetColumn(x => x.Message).Take(20).ToArray();
11211170
11221171// Apply various kinds of text operations supported by ML.NET.
11231172var learningPipeline = reader .MakeNewEstimator ()
1173+ // Cache data in memory in an on-demand manner. Columns used in any downstream step will be
1174+ // cached in memory on their first use. This step can be removed if the user's machine doesn't
1175+ // have enough memory.
1176+ .AppendCacheCheckpoint ()
11241177 .Append (r => (
11251178 // One-stop shop to run the full text featurization.
11261179 TextFeatures : r .Message .FeaturizeText (),
@@ -1243,6 +1296,9 @@ var learningPipeline = reader.MakeNewEstimator()
12431296 Label : r .Label .ToKey (),
12441297 // Concatenate all the features together into one column 'Features'.
12451298 Features : r .SepalLength .ConcatWith (r .SepalWidth , r .PetalLength , r .PetalWidth )))
1299+ // Add a step for caching data in memory so that the downstream iterative training
1300+ // algorithm can efficiently scan through the data multiple times.
1301+ .AppendCacheCheckpoint ()
12461302 .Append (r => (
12471303 r .Label ,
12481304 // Train the multi-class SDCA model to predict the label using features.
@@ -1298,6 +1354,10 @@ var dynamicPipeline =
12981354 mlContext .Transforms .Concatenate (" Features" , " SepalLength" , " SepalWidth" , " PetalLength" , " PetalWidth" )
12991355 // Note that the label is text, so it needs to be converted to key.
13001356 .Append (mlContext .Transforms .Conversions .MapValueToKey (" Label" ), TransformerScope .TrainTest )
1357+ // Cache data in memory so that SDCA trainer will be able to randomly access training examples without
1358+ // reading data from disk multiple times. Data will be cached at its first use in any downstream step.
1359+ // Notice that unused parts of the data may not be cached.
1360+ .AppendCacheCheckpoint (mlContext )
13011361 // Use the multi-class SDCA model to predict the label using features.
13021362 .Append (mlContext .MulticlassClassification .Trainers .StochasticDualCoordinateAscent ());
13031363
@@ -1439,6 +1499,7 @@ public static ITransformer TrainModel(MLContext mlContext, IDataView trainData)
14391499 Action < InputRow , OutputRow > mapping = (input , output ) => output .Label = input .Income > 50000 ;
14401500 // Construct the learning pipeline.
14411501 var estimator = mlContext .Transforms .CustomMapping (mapping , null )
1502+ .AppendCacheCheckpoint (mlContext )
14421503 .Append (mlContext .BinaryClassification .Trainers .FastTree (label : " Label" ));
14431504
14441505 return estimator .Fit (trainData );
@@ -1480,8 +1541,12 @@ public class CustomMappings
14801541var estimator = mlContext .Transforms .CustomMapping <InputRow , OutputRow >(CustomMappings .IncomeMapping , nameof (CustomMappings .IncomeMapping ))
14811542 .Append (mlContext .BinaryClassification .Trainers .FastTree (label : " Label" ));
14821543
1544+ // If there is enough memory, we can cache the data in memory to avoid reading it from the file
1545+ // when it is accessed multiple times.
1546+ var cachedTrainData = mlContext .Data .Cache (trainData );
1547+
14831548// Train the model.
1484- var model = estimator .Fit (trainData );
1549+ var model = estimator .Fit (cachedTrainData );
14851550
14861551// Save the model.
14871552using (var fs = File .Create (modelPath ))
0 commit comments