In [7]:
//https://jtablesaw.wordpress.com/2015/05/22/getting-started-with-outlier-2/
import com.twosigma.beaker.fileloader.CsvPlotReader
import com.github.lwhite1.tablesaw.api.ml.clustering.*
import com.github.lwhite1.tablesaw.reducing.*;

// Tornado data walkthrough: load the CSV, inspect it, derive a column,
// sort, summarise, group and cross-tabulate.
//
// NOTE(review): only the value of a cell's last expression is rendered, and
// the recorded output of this cell shows nothing but the final crosstab, so
// print() appears to return the rendered text instead of writing it. Every
// intermediate result is therefore sent to stdout explicitly with println.
def reader = new CsvPlotReader()
def tornadoes = reader.read("Tablesaw/tornadoes_2014.csv")

// Dataset structure
println tornadoes.structure().print()
// Header names
println tornadoes.columnNames()
// Row and column counts
println tornadoes.shape()
// First n rows
println tornadoes.first(3).print()
// Summary of the data in each column
println tornadoes.summary()

// Mapping operations: derive a month column from "Date" and append it
def month = tornadoes.dateColumn("Date").month()
tornadoes.addColumn(month)

// Sorting by column: keep the result instead of discarding it.
// Presumably the leading "-" requests descending order -- confirm against the Tablesaw docs.
def byFatalities = tornadoes.sortOn("-Fatalities")
println byFatalities.first(3).print()

// Descriptive statistics for a single column
println tornadoes.column("Fatalities").summary().print()

// Totals and sub-totals: median of "Injuries" grouped by "Scale"
def injuriesByScale = tornadoes.median("Injuries").by("Scale")
injuriesByScale.setName("Median injuries by Tornado Scale")
println injuriesByScale.print()

// Cross tab of State x Scale; left as the last expression so it stays the cell's result
CrossTab.xCount(tornadoes, tornadoes.categoryColumn("State"), tornadoes.shortColumn("Scale")).print()


Crosstab Counts: State x Scale
      0   1   2  3  4 total 
AL    12  32  7  4  0 55    
AR    5   12  2  0  1 20    
AZ    3   0   0  0  0 3     
CA    6   3   0  0  0 9     
CO    41  7   1  0  0 49    
CT    1   0   0  0  0 1     
DE    0   1   0  0  0 1     
FL    23  4   1  0  0 28    
GA    13  16  3  0  0 32    
IA    31  22  3  0  0 56    
ID    3   0   0  0  0 3     
IL    19  27  3  0  0 49    
IN    6   22  0  0  0 28    
KS    30  7   3  1  0 41    
KY    6   21  1  0  0 28    
LA    4   8   3  0  0 15    
MA    1   1   1  0  0 3     
MD    2   0   0  0  0 2     
ME    2   2   0  0  0 4     
MI    6   7   0  0  0 13    
MN    17  4   3  0  0 24    
MO    23  21  3  0  0 47    
MS    12  17  7  5  1 42    
MT    7   0   0  1  0 8     
NC    14  11  6  1  0 32    
ND    11  2   1  0  0 14    
NE    33  13  9  3  4 62    
NH    2   0   0  0  0 2     
NM    12  3   0  0  0 15    
NV    6   0   0  0  0 6     
NY    1   8   1  1  0 11    
OH    16  3   0  1  0 20    
OK    12  3 

In [8]:
//https://jtablesaw.wordpress.com/2016/08/08/k-means-clustering-in-java/

import com.twosigma.beaker.fileloader.CsvPlotReader
import com.github.lwhite1.tablesaw.api.ml.clustering.*
    
    
// K-means clustering demo on the whiskey data set: fit one model with k = 5,
// print its clusters/centroids/distortion, then plot distortion against k.
//
// NOTE(review): intermediate results are written with println because only the
// cell's last expression is rendered; print() is assumed to return the rendered
// text (as the recorded output of the tornado cell suggests) -- confirm.
def reader = new CsvPlotReader()
def t = reader.read("Tablesaw/whiskey.csv")

println t.structure().print()

// Single place that fixes which columns are clustered on (indices 2..13),
// so the k = 5 model and the sweep below cannot drift apart.
def fitKmeans = { int clusterCount ->
    new Kmeans(
        clusterCount,
        t.nCol(2), t.nCol(3), t.nCol(4), t.nCol(5), t.nCol(6), t.nCol(7),
        t.nCol(8), t.nCol(9), t.nCol(10), t.nCol(11), t.nCol(12), t.nCol(13)
    )
}

def model = fitKmeans(5)

// Cluster formation, labelled by distillery
println model.clustered(t.column("Distillery")).print()
// Centroids for each cluster
println model.labeledCentroids().print()
// Distortion of the k = 5 model
println model.distortion()

// Sweep k over 2 .. n-1 and record each model's distortion.
def n = t.rowCount()
def kValues = new double[n - 2]
def distortions = new double[n - 2]

for (int k = 2; k < n; k++) {
  kValues[k - 2] = k
  distortions[k - 2] = fitKmeans(k).distortion()
}

// Last expression of the cell: the distortion-vs-k line plot
def linearYPlot = new Plot(title: "K-means clustering demo", xLabel:"K", yLabel: "distortion")
linearYPlot << new Line(x: kValues, y: distortions)

In [9]:
//https://jtablesaw.wordpress.com/2016/07/31/play-moneyball-data-science-in-tablesaw/
import com.twosigma.beaker.fileloader.CsvPlotReader
import com.github.lwhite1.tablesaw.api.ml.clustering.*
import com.github.lwhite1.tablesaw.api.ml.regression.*   
import static com.github.lwhite1.tablesaw.api.QueryHelper.*;
    
// Moneyball demo: relate run difference to wins, and runs scored to OBP/SLG,
// with least-squares models.
//
// The table is loaded through the CsvPlotReader, as in the other cells; the
// original created the reader but then called Table.createFromCsv, although
// Table is not among this cell's imports.
// NOTE(review): assumes reader.read yields the same table type here as in the
// other cells -- confirm.
def reader = new CsvPlotReader()
def baseball = reader.read("Tablesaw/baseball.csv")

// Filter to the data available at the start of the 2002 season
def moneyball = baseball.selectWhere(column("year").isLessThan(2002))
def wins = moneyball.nCol("W")

// Run difference = RS - RA; name the column before adding it so that the
// "RD" lookup below refers to a column that was registered under that name.
def runDifference = moneyball.shortColumn("RS").subtract(moneyball.shortColumn("RA"))
runDifference.setName("RD")
moneyball.addColumn(runDifference)

// Scatter plot of run difference against wins. The variable is deliberately
// not called "Plot", which would collide with the Plot class name.
def rdVsWinsPlot = new Plot(title: "RD x Wins", xLabel:"RD", yLabel: "W")
rdVsWinsPlot << new Points(x: moneyball.numericColumn("RD").toDoubleArray(), y: moneyball.numericColumn("W").toDoubleArray())

// Wins as a function of run difference
def winsModel = LeastSquares.train(wins, runDifference)

// Predicted wins for a run difference of 135; printed because only the
// cell's last expression is rendered.
def runDiff = new double[1]
runDiff[0] = 135
def expectedWins = winsModel.predict(runDiff)
println expectedWins

// Runs scored as a function of OBP and SLG
def runsScored2 =
    LeastSquares.train(moneyball.nCol("RS"), moneyball.nCol("OBP"), moneyball.nCol("SLG"))

// Histogram of the residuals. toList() yields one list element per residual;
// Arrays.asList on a primitive double[] would yield a single-element list
// holding the whole array.
// NOTE(review): assumes residuals() returns an array -- confirm.
new Histogram(data: runsScored2.residuals().toList(), binCount: 25)