Skip to content

Commit

Permalink
Iris Clustering sample updated to v0.7 and common-code approach
Browse files Browse the repository at this point in the history
  • Loading branch information
CESARDELATORRE committed Nov 5, 2018
1 parent 4487efc commit 8caa168
Show file tree
Hide file tree
Showing 18 changed files with 234 additions and 179 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

| ML.NET version | API type | Status | App Type | Data type | Scenario | ML Task | Algorithms |
|----------------|-------------------|-------------------------------|-------------|-----------|---------------------|---------------------------|-----------------------------|
| v0.6 | Dynamic API | Up-to-date | Two console apps | .csv file | Fraud Detection | Two-class classification | FastTree Binary Classification |
| v0.6 | Dynamic API | Needs update to 0.7 and README.md | Two console apps | .csv file | Fraud Detection | Two-class classification | FastTree Binary Classification |

In this introductory sample, you'll see how to use ML.NET to predict credit card fraud. In the world of machine learning, this type of prediction is known as binary classification.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@

In this introductory sample, you'll see how to use [ML.NET](https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet) to predict a sentiment (positive or negative) for customer reviews. In the world of machine learning, this type of prediction is known as **binary classification**.

## API version: Static and Estimators-based API
It is important to note that this sample uses the **static API with Estimators**, available since ML.NET v0.6.

## Problem
This problem is centered around predicting whether a customer's review has a positive or negative sentiment. We will use small wikipedia-detox datasets (one dataset for training and a second dataset for evaluating the model's accuracy) in which every comment was processed by humans and assigned a sentiment label:
* 0 - nice/positive
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ static void Main(string[] args)

// STEP4: Evaluate accuracy of the model
var metrics = modelBuilder.EvaluateClusteringModel(pivotDataView);
Common.ConsoleHelper.PrintClusteringMetrics("KMeansPlusPlus", metrics);
Common.ConsoleHelper.PrintClusteringMetrics("KMeans", metrics);

// STEP5: Save/persist the model as a .ZIP file
modelBuilder.SaveModelAsFile(modelZip);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

| ML.NET version | API type | Status | App Type | Data type | Scenario | ML Task | Algorithms |
|----------------|-------------------|-------------------------------|-------------|-----------|---------------------|---------------------------|-----------------------------|
| v0.6 | Dynamic API | Evolving | Console app | .csv files | Customer segmentation | Clustering | K-means++ |
| v0.6 | Dynamic API | README.md needs update | Console app | .csv files | Customer segmentation | Clustering | K-means++ |

## Problem

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28010.2046
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Clustering_Iris", "Clustering_Iris.csproj", "{E730C84B-0F03-4C0C-9B22-68130091C900}"
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Clustering_Iris", "IrisClustering/IrisClusteringConsoleApp/Clustering_Iris.csproj", "{E730C84B-0F03-4C0C-9B22-68130091C900}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<!-- Console application project for the Iris clustering sample. -->
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.1</TargetFramework>
</PropertyGroup>

<!-- Shared helper sources are compiled from the common folder four levels up
     and linked into this project under a virtual "Common" folder. -->
<ItemGroup>
<Compile Include="..\..\..\..\common\ConsoleHelper.cs" Link="Common\ConsoleHelper.cs" />
<Compile Include="..\..\..\..\common\ModelBuilder.cs" Link="Common\ModelBuilder.cs" />
<Compile Include="..\..\..\..\common\ModelScorer.cs" Link="Common\ModelScorer.cs" />
</ItemGroup>

<!-- ML.NET preview package used by the sample. -->
<ItemGroup>
<PackageReference Include="Microsoft.ML" Version="0.7.0-preview-27031-8" />
</ItemGroup>

<!-- Copies datasets\iris-full.txt to the output directory when it is newer than the existing copy.
     NOTE(review): Program.cs reads iris-full.txt from a "Data" folder via a relative path, not from
     "datasets"; confirm whether this item is still needed. -->
<ItemGroup>
<None Update="datasets\iris-full.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>

<!-- Folder entry matching the "Common" link target used by the Compile items above. -->
<ItemGroup>
<Folder Include="Common\" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using Microsoft.ML;
using Microsoft.ML.Core.Data;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Transforms;
using System;
using System.Collections.Generic;
using System.Text;

namespace Clustering_Iris
{
/// <summary>
/// Creates the <see cref="TextLoader"/> that describes the tab-separated Iris data file
/// and reads such files into an <see cref="IDataView"/>.
/// </summary>
internal class DataLoader
{
    // Both fields are assigned once in the constructor and never reassigned.
    private readonly MLContext _mlContext;
    private readonly TextLoader _loader;

    /// <summary>
    /// Configures the text loader for the Iris file layout.
    /// </summary>
    /// <param name="mlContext">The ML.NET context used to create the loader.</param>
    /// <exception cref="ArgumentNullException"><paramref name="mlContext"/> is null.</exception>
    public DataLoader(MLContext mlContext)
    {
        // Fail with a clear argument error instead of a NullReferenceException further down.
        _mlContext = mlContext ?? throw new ArgumentNullException(nameof(mlContext));

        // Create the TextLoader by defining the data columns and where to find them (column position) in the text file.
        _loader = _mlContext.Data.TextReader(new TextLoader.Arguments()
        {
            Separator = "\t",
            HasHeader = true,
            Column = new[]
            {
                new TextLoader.Column("Label", DataKind.R4, 0),
                new TextLoader.Column("SepalLength", DataKind.R4, 1),
                new TextLoader.Column("SepalWidth", DataKind.R4, 2),
                new TextLoader.Column("PetalLength", DataKind.R4, 3),
                new TextLoader.Column("PetalWidth", DataKind.R4, 4),
            }
        });
    }

    /// <summary>
    /// Reads the file at <paramref name="filePath"/> with the configured loader.
    /// </summary>
    /// <param name="filePath">Path of the tab-separated data file to read.</param>
    /// <returns>The data view produced by the loader.</returns>
    /// <exception cref="ArgumentException"><paramref name="filePath"/> is null, empty or whitespace.</exception>
    public IDataView GetDataView(string filePath)
    {
        if (string.IsNullOrWhiteSpace(filePath))
        {
            throw new ArgumentException("A data file path must be provided.", nameof(filePath));
        }

        return _loader.Read(filePath);
    }
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using Microsoft.ML;
using Microsoft.ML.Core.Data;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Transforms;
using Microsoft.ML.Transforms.Categorical;
using Microsoft.ML.Transforms.PCA;
using System;
using System.Collections.Generic;
using System.Text;

namespace Clustering_Iris
{
/// <summary>
/// Holds the data-transformation pipeline applied before the trainer.
/// </summary>
public class DataProcessor
{
    /// <summary>
    /// The estimator that performs the data transformations.
    /// </summary>
    public IEstimator<ITransformer> DataProcessPipeline { get; private set; }

    /// <summary>
    /// Builds the pipeline that concatenates the four measurement columns into "Features".
    /// </summary>
    /// <param name="mlContext">The ML.NET context whose transform catalog is used.</param>
    public DataProcessor(MLContext mlContext)
    {
        const string outputColumn = "Features";
        string[] measurementColumns =
        {
            "SepalLength",
            "SepalWidth",
            "PetalLength",
            "PetalWidth",
        };

        // Single transformation: merge the individual measurement columns into one output column.
        DataProcessPipeline = mlContext.Transforms.Concatenate(outputColumn, measurementColumns);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
using Microsoft.ML.Runtime.Api;
using System;
using System.Collections.Generic;
using System.Text;

namespace Clustering_Iris.DataStructures
{
// IrisData is one input row for the model: a label value followed by four flower measurements.
// The Column("N") values match the column indices 0-4 that DataLoader assigns to the
// same names in its TextLoader definition.
// NOTE(review): presumably the two must be kept in sync - confirm.
public class IrisData
{
// Label value of the row (column 0).
[Column("0")]
public float Label;

// Sepal length measurement (column 1).
[Column("1")]
public float SepalLength;

// Sepal width measurement (column 2).
[Column("2")]
public float SepalWidth;

// Petal length measurement (column 3).
[Column("3")]
public float PetalLength;

// Petal width measurement (column 4).
[Column("4")]
public float PetalWidth;

}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
using Microsoft.ML.Runtime.Api;
using System;
using System.Collections.Generic;
using System.Text;

namespace Clustering_Iris.DataStructures
{
// IrisPrediction is the result returned from prediction operations.
// Each field is bound by name to an output column of the scored data.
public class IrisPrediction
{
// Bound to the "PredictedLabel" column; Program prints it as the cluster assigned to the sample.
[ColumnName("PredictedLabel")]
public uint SelectedClusterId;

// Bound to the "Score" column.
// NOTE(review): presumably one distance per cluster centroid - confirm against the KMeans trainer output.
[ColumnName("Score")]
public float[] Distance;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
using System;
using System.IO;

using Microsoft.ML;
using Common;
using Clustering_Iris.DataStructures;

namespace Clustering_Iris
{
/// <summary>
/// Console entry point of the Iris clustering sample: loads the data, trains and evaluates
/// a KMeans model, saves it as a .ZIP file, then reloads it to predict the cluster of one sample.
/// </summary>
internal static class Program
{
    // Directory of the running application, derived from the first command-line argument.
    private static string AppPath => Path.GetDirectoryName(Environment.GetCommandLineArgs()[0]);

    private static readonly string BaseDatasetsLocation = @"../../../../Data";
    private static readonly string DataPath = GetAbsolutePath($"{BaseDatasetsLocation}/iris-full.txt");

    private static readonly string BaseModelsPath = @"../../../../MLModels";
    private static readonly string ModelPath = GetAbsolutePath($"{BaseModelsPath}/IrisModel.zip");

    /// <summary>
    /// Runs the whole train / evaluate / save / predict workflow.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    private static void Main(string[] args)
    {
        //Create the MLContext to share across components for deterministic results
        MLContext mlContext = new MLContext(seed: 1);  //Seed set to any number so you have a deterministic environment

        // STEP 1: Common data loading
        var dataLoader = new DataLoader(mlContext);
        var trainingDataView = dataLoader.GetDataView(DataPath);

        // STEP 2: Process data transformations in pipeline
        var dataProcessor = new DataProcessor(mlContext);
        var dataProcessPipeline = dataProcessor.DataProcessPipeline;

        // (Optional) Peek data in training DataView after applying the ProcessPipeline's transformations
        Common.ConsoleHelper.PeekDataViewInConsole<IrisData>(mlContext, trainingDataView, dataProcessPipeline, 10);
        Common.ConsoleHelper.PeekVectorColumnDataInConsole(mlContext, "Features", trainingDataView, dataProcessPipeline, 10);

        // STEP 3: Create and train the model
        var modelBuilder = new ModelBuilder<IrisData, IrisPrediction>(mlContext, dataProcessPipeline);
        var trainer = mlContext.Clustering.Trainers.KMeans(features: "Features", clustersCount: 3);
        modelBuilder.AddTrainer(trainer);
        // The returned model is not needed here: the builder is used for evaluation and saving below.
        modelBuilder.Train(trainingDataView);

        // STEP 4: Evaluate accuracy of the model
        var metrics = modelBuilder.EvaluateClusteringModel(trainingDataView);
        Common.ConsoleHelper.PrintClusteringMetrics("KMeans", metrics);

        // STEP 5: Save/persist the model as a .ZIP file
        modelBuilder.SaveModelAsFile(ModelPath);

        Console.WriteLine("=============== End of training process ===============");

        Console.WriteLine("=============== Predict a cluster for a single case (Single Iris data sample) ===============");

        // Test with one sample flower. The values are typical setosa measurements;
        // the previous sample had the same four numbers shifted onto the wrong fields
        // (e.g. a petal width of 5.1), which is not a plausible iris at all.
        var sampleIrisData = new IrisData()
        {
            SepalLength = 5.1f,
            SepalWidth = 3.3f,
            PetalLength = 1.6f,
            PetalWidth = 0.2f,
        };

        // Load the persisted model from the .ZIP file and score the single sample
        var modelScorer = new ModelScorer<IrisData, IrisPrediction>(mlContext);
        modelScorer.LoadModelFromZipFile(ModelPath);

        var prediction = modelScorer.PredictSingle(sampleIrisData);

        Console.WriteLine($"Cluster assigned for setosa flowers:{prediction.SelectedClusterId}");

        Console.WriteLine("=============== End of process, hit any key to finish ===============");
        Console.ReadKey();
    }

    /// <summary>
    /// Resolves a path relative to the application directory into an absolute path, so the
    /// sample finds its files regardless of the current working directory.
    /// </summary>
    /// <param name="relativePath">Path relative to the application directory.</param>
    /// <returns>The absolute, normalized path.</returns>
    private static string GetAbsolutePath(string relativePath)
    {
        // If the application directory cannot be determined, fall back to resolving
        // against the current directory, which is what the plain relative path did before.
        string baseDirectory = AppPath ?? string.Empty;
        return Path.GetFullPath(Path.Combine(baseDirectory, relativePath));
    }
}







}
Loading

0 comments on commit 8caa168

Please sign in to comment.