#### Goals

 - load a dataset
 - look for distributions
 - explore the data along different dimensions

## Load the Wine Dataset - (no labels)



In [None]:
var fs = require("fs");
var parse = require('csv-parse/lib/sync');
var table = require('text-table');
var Plot = require('plotly-notebook-js');
var groupBy = require('lodash/groupBy');
var keys = require('lodash/keys');

In [None]:
// Read the unlabelled wine CSV from disk and parse it into rows.
// `var` is used so the names stay visible to the cells that follow.
var raw_csv_string = fs.readFileSync(
    "datasets/wine.data.unlabelled.csv",
    "utf8"
);

// `dataset` is indexed below as dataset[row][column].
var dataset = parse(raw_csv_string);

// Report the shape; the column count is taken from the first row.
console.log(
    "our dataset has",
    dataset.length,
    "rows and ",
    dataset[0].length,
    "columns"
);

In other words we have a dataset containing 178 feature vectors, each made up of 13 features. Each feature vector corresponds to a particular wine.

So we have a 13-dimensional data space that we could potentially do our analysis in. That's $R^{13}$!

In [None]:
// Human-readable name of each column, in the same order as the columns of
// `dataset` (features[i] labels dataset[row][i] in the plots further down).
var features = [
    'Alcohol',
    'Malic Acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline'];

// Centre every column, not just the first two: text-table falls back to
// left alignment for any column missing from `align`.
var format = { align: features.map(() => 'c'), hsep: ' | ' };

// Render the header and the first five rows as ONE table so that column
// widths are shared and each header sits directly above its values.
var preview = table([features, ...dataset.slice(0, 5)], format).split('\n');

// Keep the original T (header) / D (data rows) names.
var T = preview[0];
console.log(T);
console.log("---------------------------");
var D = preview.slice(1).join('\n');
console.log(D);

Each of those features is some measurement made on a wine. We have wine data from a number of different geographical regions.

The point of this dataset is: can we work out which wines are from the same region? Can we even work out how many different regions we have wines from?

Algorithms aside, how would you do that?

## Let's plot some data

### 1D

We could plot a simple histogram of any dimension we choose. Can we determine the number of regions or assign any wine to a region from a single dimension?

In [None]:
// 1D view: histogram of a single feature. Every wine's value for the chosen
// feature is floored to `decimals` decimal places and the wines falling into
// each resulting bin are counted.
var decimals = 1;
var factor = Math.pow(10,decimals);

// select a feature (index into `features` / column of `dataset`)
var DIM = 0;
// -------
// Group the column's values by their floored bin; `factor*v` coerces the
// cell to a number before flooring.
var groups = groupBy(dataset.map((row) => row[DIM]), v => Math.floor(factor*v)/factor);
// Bin labels come back as strings (object keys); they are parsed for the x axis.
var values = keys(groups);
// Height of each bar = number of wines in that bin.
var histogram = values.map(value => groups[value].length);

// The trace is named after the plotted feature, and the y axis is labelled
// "Count" because the bars show how many wines fall in each bin.
var trace = { x: values.map(s => parseFloat(s)), y: histogram, type: 'bar', name: features[DIM] };
var layout = { title: `Feature: ${features[DIM]}`, xaxis: { title: features[DIM] }, yaxis: { title: "Count" }};

var myPlot = Plot.createPlot([trace], layout);

// Assigning to $$html$$ hands the rendered markup to the notebook for display.
$$html$$ = myPlot.render();

### Move to 2D


In [None]:
// 2D view: scatter one feature against another.
// X and Y are column indices into `dataset` (and into `features` for labels).
var X = 0;
var Y = 11;

// Pull out the two columns as numbers. An explicit arrow is used rather than
// passing parseFloat directly so map's extra arguments are never forwarded.
var x = dataset.map(row => parseFloat(row[X]));
var y = dataset.map(row => parseFloat(row[Y]));

// One marker per wine.
var singleTrace = {
    x: x,
    y: y,
    mode: 'markers',
    marker: { size: 5 },
    type: 'scatter'
};

// Square canvas with each axis labelled by its feature name.
var layout = {
    xaxis: { title: features[X] },
    yaxis: { title: features[Y] },
    width: 500,
    height: 500
};

// Assigning to $$html$$ hands the rendered markup to the notebook for display.
var scatterPlot = Plot.createPlot([singleTrace], layout);
$$html$$ = scatterPlot.render();

#### What about in 3D or 4D?

In [None]:
// 3D / 4D view: keep the 2D scatter and encode further features visually.
// Marker colour always carries features[COLOR]; marker size can optionally
// carry features[SIZE].
var X = 0, Y = 11, COLOR = 6, SIZE = 1;

// false -> constant-size markers (3 dimensions shown)
// true  -> marker size taken from features[SIZE] (4 dimensions shown)
var SCALE_BY_SIZE = false;

var x = dataset.map(d => parseFloat(d[X]));
var y = dataset.map(d => parseFloat(d[Y]));
var color = dataset.map(d => parseFloat(d[COLOR]));
var size = SCALE_BY_SIZE ? dataset.map(d => parseFloat(d[SIZE])) : 10;

// NOTE(review): per the Plotly docs, `sizeref` only has an effect when
// `size` is an array, so it should be inert in constant-size mode - confirm.
var singleTrace = { x, y, mode: 'markers', marker: { color, size, sizeref: 0.2 }, type: 'scatter' };

// Only mention size in the title when size really encodes a feature, so the
// title never advertises a dimension that is not on the plot.
var title = SCALE_BY_SIZE
  ? `Size: ${features[SIZE]} | Color: ${features[COLOR]}`
  : `Color: ${features[COLOR]}`;
var layout = { title,
  xaxis: { title: features[X] }, yaxis: { title: features[Y] }, width: 700, height: 700 };

// Assigning to $$html$$ hands the rendered markup to the notebook for display.
$$html$$ = Plot.createPlot([singleTrace], layout).render();