In [1]:
import (
	"bufio"
	"encoding/csv"
	"fmt"
	"math"
	"math/rand"
	"os"
	"os/exec"
	"strconv"

	"github.com/e-XpertSolutions/go-iforest/iforest"
	"github.com/janpfeifer/gonb/cache"
	"github.com/janpfeifer/gonb/gonbui"
	"github.com/pa-m/sklearn/metrics"
	ms "github.com/pa-m/sklearn/model_selection"
	"gonum.org/v1/plot"
	"gonum.org/v1/plot/plotter"
	"gonum.org/v1/plot/plotutil"
	"gonum.org/v1/plot/vg"
	"gonum.org/v1/plot/vg/draw"
)

	

# UTILITIES

In [39]:


// loadData reads the CSV file at myfile and parses every field as a float64.
//
// It returns one row per CSV record. On any failure (the file cannot be
// opened, the CSV cannot be read, or a field is not a valid float) the error
// is printed and nil is returned, so callers never receive a partially parsed
// matrix containing nil rows.
func loadData(myfile string) [][]float64 {
	file, err := os.Open(myfile)
	if err != nil {
		fmt.Println("Error opening file:", err)
		return nil
	}
	defer file.Close()

	records, err := csv.NewReader(file).ReadAll()
	if err != nil {
		fmt.Println("Error reading CSV:", err)
		return nil
	}

	inputData := make([][]float64, len(records))
	for i, row := range records {
		inputData[i] = make([]float64, len(row))
		for j, value := range row {
			inputData[i][j], err = strconv.ParseFloat(value, 64)
			if err != nil {
				// i and j are zero-based record/field indexes.
				fmt.Printf("Error converting to float: line %d, column %d: %v\n", i, j, err)
				return nil
			}
		}
	}
	return inputData
}

// LoadLabels reads one integer label per line from the file at filePath.
//
// It returns the labels in file order. If the file cannot be opened, a line
// is not a valid integer, or the scanner stops because of a read error (for
// example a line longer than the scanner's token limit), the error is printed
// and nil is returned instead of a silently truncated slice.
func LoadLabels(filePath string) []int {
	file, err := os.Open(filePath)
	if err != nil {
		fmt.Println("Error opening file:", err)
		return nil
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)

	var labels []int
	for scanner.Scan() {
		label, err := strconv.Atoi(scanner.Text())
		if err != nil {
			fmt.Println("Error:", err)
			return nil
		}
		labels = append(labels, label)
	}
	// Scan returns false both at EOF and on failure; Err tells them apart.
	if err := scanner.Err(); err != nil {
		fmt.Println("Error reading file:", err)
		return nil
	}
	return labels
}

// ConvertToFloat widens the predicted and ground-truth integer labels to
// float64 so they can be fed to the F1 and accuracy metrics.
//
// Both results have len(Labels_Pred) elements; labels_Tru must be at least
// that long. The first result holds the predictions, the second the truth.
func ConvertToFloat(Labels_Pred []int, labels_Tru []int) ([]float64, []float64) {
	n := len(Labels_Pred)
	predOut := make([]float64, n)
	truOut := make([]float64, n)

	for idx, p := range Labels_Pred {
		predOut[idx] = float64(p)
		truOut[idx] = float64(labels_Tru[idx])
	}
	return predOut, truOut
}

// F1Score returns the F1 score of labelsPred against labelsTru, computed by
// metrics.F1Score with "macro" averaging and no sample weights.
//
// The labels are converted to float64 column matrices (one row per sample)
// before being handed to the metrics package.
func F1Score(labelsPred []int, labelsTru []int) float64 {
	predVals, truVals := ConvertToFloat(labelsPred, labelsTru)

	yPred := mat.NewDense(len(predVals), 1, predVals)
	yTrue := mat.NewDense(len(truVals), 1, truVals)

	// A nil slice means "no per-sample weights".
	var weights []float64
	return metrics.F1Score(yTrue, yPred, "macro", weights)
}

// AccuracyScore returns the accuracy of labelsPred against labelsTru as
// computed by metrics.AccuracyScore with normalize=true and no sample
// weights.
//
// The labels are converted to float64 column matrices (one row per sample)
// before being handed to the metrics package.
func AccuracyScore(labelsPred []int, labelsTru []int) float64 {
	predVals, truVals := ConvertToFloat(labelsPred, labelsTru)

	yPred := mat.NewDense(len(predVals), 1, predVals)
	yTrue := mat.NewDense(len(truVals), 1, truVals)

	// A nil *mat.Dense means "no per-sample weights".
	var noWeights *mat.Dense
	const normalize = true
	return metrics.AccuracyScore(yTrue, yPred, normalize, noWeights)
}

// prepareDataset runs the ./extract_splitDataset.sh script (resolved relative
// to the current working directory) with dataset as its only argument and
// waits for it to finish.
//
// A failure to start the script or a non-zero exit status is printed rather
// than silently ignored.
func prepareDataset(dataset string) {
	cmd := exec.Command("./extract_splitDataset.sh", dataset)
	if err := cmd.Run(); err != nil {
		fmt.Println("Error running ./extract_splitDataset.sh:", err)
	}
}

// Mean returns the arithmetic mean of array.
//
// An empty (or nil) slice yields 0 instead of the NaN that 0/0 would produce.
func Mean(array []float64) float64 {
	if len(array) == 0 {
		return 0
	}

	sum := 0.0
	for _, num := range array {
		sum += num
	}
	return sum / float64(len(array))
}

// outlierRatio returns the fraction of entries in lab that equal 1.
//
// An empty (or nil) slice yields 0 instead of the NaN that 0/0 would produce.
func outlierRatio(lab []int) float64 {
	if len(lab) == 0 {
		return 0
	}

	count := 0
	for _, v := range lab {
		if v == 1 {
			count++
		}
	}
	return float64(count) / float64(len(lab))
}

// findMaxIndex returns the index of the largest value in arr.
//
// Ties resolve to the earliest index, and an empty slice yields 0.
func findMaxIndex(arr []float64) int {
	best := 0
	for idx, val := range arr {
		// Strict comparison keeps the first occurrence of the maximum.
		if val > arr[best] {
			best = idx
		}
	}
	return best
}


// BuildLabelsArray returns a slice of lendata labels in which the first
// lenzerodata entries are 0 and every remaining entry is 1.
func BuildLabelsArray(lendata int, lenzerodata int) []int {
	// make zero-fills the slice, so only the tail of ones has to be written.
	out := make([]int, lendata)

	firstOne := lenzerodata
	if firstOne < 0 {
		firstOne = 0
	}
	for pos := firstOne; pos < lendata; pos++ {
		out[pos] = 1
	}
	return out
}




# TRAIN TEST FUNCTION

In [3]:
// isoForestTrain_Test creates an isolation forest with the given number of
// trees, subsample size and expected outlier ratio, trains it on
// training_data and then runs the forest's Test stage on testing_data.
//
// The forest is returned so callers can read its fields afterwards (other
// code in this file reads forest.Labels) or call Predict on it.
func isoForestTrain_Test(training_data [][]float64, testing_data [][]float64, treesNumber int, subsampleSize int, outliers float64) *iforest.Forest {
	model := iforest.NewForest(treesNumber, subsampleSize, outliers)

	// Training stage: build the trees.
	model.Train(training_data)

	// Testing stage: flag anomalies in the testing data.
	model.Test(testing_data)

	return model
}


# K-Fold Cross Validation Functions

In [40]:
// Split is one fold produced by StratifiedKFold.
type Split struct {
	// TrainData holds the rows of every fold except the held-out one.
	TrainData [][]float64
	// ValidateData holds the rows of the held-out fold.
	ValidateData [][]float64
	// LabelData holds the 0/1 label of each row in ValidateData.
	LabelData []int
}

// SplitDataset partitions dataset into a training and a testing part while
// keeping the two label classes separate.
//
// Rows whose label is 0 go to class 0, every other row to class 1. Each class
// is shuffled with ShuffleUnit, and the first round(splitRatio * classSize)
// rows of each class become training data while the rest become testing
// data. Both returned maps are keyed by class (0 and 1). The third result is
// the fraction of labels that are non-zero.
//
// labels must have at least len(dataset) entries.
func SplitDataset(dataset [][]float64, labels []int, splitRatio float64) (map[int][][]float64, map[int][][]float64, float64) {
	byClass := make(map[int][][]float64)
	anomalies := 0
	for idx, row := range dataset {
		if labels[idx] == 0 {
			byClass[0] = append(byClass[0], row)
			continue
		}
		byClass[1] = append(byClass[1], row)
		anomalies++
	}
	ShuffleUnit(byClass)

	trainSet := make(map[int][][]float64)
	testSet := make(map[int][][]float64)
	for _, class := range []int{0, 1} {
		rows := byClass[class]
		cut := int(math.Round(splitRatio * float64(len(rows))))
		trainSet[class] = rows[:cut]
		testSet[class] = rows[cut:]
	}

	return trainSet, testSet, float64(anomalies) / float64(len(labels))
}

// Shuffle randomly permutes training_set and testing_set in place using the
// package-level math/rand source.
//
// label_set is permuted with exactly the same swaps as testing_set, so
// label_set[i] keeps describing testing_set[i]. label_set must be at least as
// long as testing_set.
func Shuffle(testing_set [][]float64, training_set [][]float64, label_set []int) {
	swapTrain := func(a, b int) {
		training_set[a], training_set[b] = training_set[b], training_set[a]
	}
	swapTestWithLabel := func(a, b int) {
		testing_set[a], testing_set[b] = testing_set[b], testing_set[a]
		label_set[a], label_set[b] = label_set[b], label_set[a]
	}

	rand.Shuffle(len(training_set), swapTrain)
	rand.Shuffle(len(testing_set), swapTestWithLabel)
}

// ShuffleUnit randomly permutes, in place, the rows stored under keys 0 and 1
// of data_label. Each class is shuffled independently (class 0 first) using
// the package-level math/rand source; other keys are left untouched.
func ShuffleUnit(data_label map[int][][]float64) {
	for _, class := range []int{0, 1} {
		// rows shares its backing array with the map entry, so swapping
		// through it reorders the map's slice as well.
		rows := data_label[class]
		rand.Shuffle(len(rows), func(a, b int) {
			rows[a], rows[b] = rows[b], rows[a]
		})
	}
}

// StratifiedKFold splits the class-0 and class-1 rows of data_label into K
// folds each and streams one Split per fold on the returned channel.
//
// For fold i the validation set is the i-th slice of each class (fold size is
// len(class)/K, rounded down; the last fold also takes the remainder) and the
// training set is everything else. Validation labels are rebuilt as 0 for the
// class-0 rows and 1 for the class-1 rows, then training rows and validation
// rows (with their labels) are shuffled. The rows in data_label are not
// reordered.
//
// The channel is unbuffered and closed after the K-th fold, so the caller
// must drain it (e.g. with range) or the producing goroutine stays blocked.
// When K <= 0 no folds can be formed and an already closed channel is
// returned instead of panicking on a division by zero.
func StratifiedKFold(K int, data_label map[int][][]float64) (ch chan Split) {
	ch = make(chan Split)
	if K <= 0 {
		close(ch)
		return ch
	}

	zeros, ones := data_label[0], data_label[1]

	// Integer division floors, which guarantees (i+1)*foldSize never exceeds
	// the class length for any fold.
	foldSizeZeros := len(zeros) / K
	foldSizeOnes := len(ones) / K

	go func() {
		defer close(ch)

		for i := 0; i < K; i++ {
			start0, end0 := i*foldSizeZeros, (i+1)*foldSizeZeros
			start1, end1 := i*foldSizeOnes, (i+1)*foldSizeOnes

			// The last fold absorbs the rows left over by the floor division.
			if i == K-1 {
				end0 = len(zeros)
				end1 = len(ones)
			}

			// Training data is every row outside the held-out window. Fresh
			// slices are used so the appends never write into the caller's
			// backing arrays.
			trainData0 := make([][]float64, 0, len(zeros)-(end0-start0))
			trainData0 = append(trainData0, zeros[:start0]...)
			trainData0 = append(trainData0, zeros[end0:]...)

			trainData1 := make([][]float64, 0, len(ones)-(end1-start1))
			trainData1 = append(trainData1, ones[:start1]...)
			trainData1 = append(trainData1, ones[end1:]...)

			trainData := Merge(trainData0, trainData1)

			testData0 := zeros[start0:end0]
			testData1 := ones[start1:end1]
			testData := Merge(testData0, testData1)

			// Merge puts class-0 rows first, so the labels are a run of
			// zeros followed by a run of ones.
			labeldata := BuildLabelsArray(len(testData), len(testData0))

			Shuffle(testData, trainData, labeldata)

			ch <- Split{
				TrainData:    trainData,
				ValidateData: testData,
				LabelData:    labeldata,
			}
		}
	}()
	return ch
}

// Merge returns a new slice holding the rows of data1 followed by the rows of
// data2. The outer slice is freshly allocated; the row slices themselves are
// shared with the inputs.
func Merge(data1 [][]float64, data2 [][]float64) [][]float64 {
	joined := make([][]float64, 0, len(data1)+len(data2))
	joined = append(joined, data1...)
	joined = append(joined, data2...)
	return joined
}
 

In [18]:
// config is one isolation-forest hyper-parameter combination evaluated by
// CrossValidation.
type config struct {
	// treeNum is the number of trees in the forest.
	treeNum int
	// subsamplesize is the subsample size passed to the forest.
	subsamplesize int
}

// CrossValidation grid-searches the isolation-forest hyper-parameters with
// 5-fold stratified cross-validation and returns the combination with the
// highest mean accuracy.
//
// Tree counts 10, 20, ... up to treenummax and subsample sizes 100, 150, ...
// up to subsamplmax are tried. data maps class (0 and 1) to its rows, and
// anomaly is the expected outlier ratio handed to every forest. Progress is
// printed for each combination. Ties go to the combination evaluated first.
//
// If the grid is empty (treenummax < 10 or subsamplmax < 100) a message is
// printed and nil is returned instead of panicking on an empty result list.
func CrossValidation(data map[int][][]float64, treenummax int, subsamplmax int, anomaly float64) *config {
	var meanAccArr []float64
	var configArr []config

	for tr_n := 10; tr_n <= treenummax; tr_n += 10 {
		for sss := 100; sss <= subsamplmax; sss += 50 {
			fmt.Println("Tree number = ", tr_n)
			fmt.Println("Subsamplingsize = ", sss)

			var accArr []float64
			for s := range StratifiedKFold(5, data) {
				forest := isoForestTrain_Test(s.TrainData, s.ValidateData, tr_n, sss, anomaly)
				accArr = append(accArr, AccuracyScore(forest.Labels, s.LabelData))
			}

			mean_accuracy := Mean(accArr)
			fmt.Println("Mean accuracy is ", mean_accuracy)

			// meanAccArr and configArr are kept index-aligned.
			meanAccArr = append(meanAccArr, mean_accuracy)
			configArr = append(configArr, config{treeNum: tr_n, subsamplesize: sss})
		}
	}

	if len(configArr) == 0 {
		fmt.Println("Error: no configuration to evaluate (need treenummax >= 10 and subsamplmax >= 100)")
		return nil
	}

	bestConfig := configArr[findMaxIndex(meanAccArr)]
	return &bestConfig
}

In [47]:
%%
// Load the feature matrix and the per-row labels.
data := loadData("dataset.csv")
labels := LoadLabels("labels.csv")

// Per-class 80/20 split; anomaly is the fraction of non-zero labels.
training_data, testing_data, anomaly := SplitDataset(data, labels, 0.8)

// Ratio of class-0 to class-1 rows in the testing part. Each length is
// converted to float64 before dividing.
fmt.Println(float64(len(testing_data[0])) / float64(len(testing_data[1])))
fmt.Println("anomaly ratio is: ", anomaly)

configuration := CrossValidation(training_data, 20, 150, anomaly) //returns the Best configuration {treeNum}{SubsamplingSize}
fmt.Println(configuration)
if configuration == nil {
	// Nothing to train with; avoid dereferencing a nil configuration.
	return
}

// NOTE(review): the final forest is trained and tested on the full dataset,
// which includes the rows evaluated below — confirm this is intended rather
// than training on training_data only.
forestt := isoForestTrain_Test(data, data, configuration.treeNum, configuration.subsamplesize, anomaly)

// Class-0 rows come first in the merged slice, so the labels are a run of
// zeros followed by a run of ones.
testing := Merge(testing_data[0], testing_data[1])
labeldata := BuildLabelsArray(len(testing), len(testing_data[0]))

predictlabels, _, err := forestt.Predict(testing)
if err != nil {
	fmt.Println("Error predicting:", err)
	return
}
fmt.Println("Predict Accuracy is...", AccuracyScore(predictlabels, labeldata))



anomaly ratio is:  0.07193103596551799
Tree number =  10
Subsamplingsize =  100
8.364671030069179
8.364671030069179
8.364671030069179
8.364671030069179
8.364671030069179
Mean accuracy is  0.924115625
Tree number =  10
Subsamplingsize =  150
9.175657275024252
9.175657275024252
9.175657275024252
9.175657275024252
9.175657275024252
Mean accuracy is  0.9161131250000001
Tree number =  20
Subsamplingsize =  100
8.364671030069179
8.364671030069179
8.364671030069179
8.364671030069179
8.364671030069179
Mean accuracy is  0.9168731250000001
Tree number =  20
Subsamplingsize =  150
9.175657275024252
9.175657275024252
9.175657275024252
9.175657275024252
9.175657275024252
Mean accuracy is  0.9195656249999999
&{10 100}
8.364671030069179
Predict Accuracy is... 0.9101222753056882


# KFold Unit Testing

In [None]:

// dataLabelGenerator builds a synthetic two-class dataset with
// numberofinstances rows for exercising the split helpers.
//
// Each row gets a random label from rand.Intn(2); label 0 rows are
// {0.0, 0.0} and label 1 rows are {1.1, 1.1}. It returns the rows and their
// labels, index-aligned.
func dataLabelGenerator(numberofinstances int) ([][]float64, []int) {
	sizeHint := numberofinstances
	if sizeHint < 0 {
		sizeHint = 0
	}
	rows := make([][]float64, 0, sizeHint)
	lbls := make([]int, 0, sizeHint)

	for n := 0; n < numberofinstances; n++ {
		lbl := rand.Intn(2)
		lbls = append(lbls, lbl)

		point := []float64{0.0, 0.0}
		if lbl != 0 {
			point = []float64{1.1, 1.1}
		}
		rows = append(rows, point)
	}
	return rows, lbls
}

// TestSplitDataset is a manual sanity check for SplitDataset: it generates
// 532 synthetic rows, splits them with the given ratio and prints the
// resulting set sizes and anomaly ratios next to their expected values.
//
// The expected sizes are truncated from len(data)*ratio while SplitDataset
// rounds per class, so a difference of a row or two is not an error.
//
// It returns SplitDataset's results unchanged so other checks can reuse them.
func TestSplitDataset(ratio float64) (map[int][][]float64, map[int][][]float64, float64) {
	data, labels := dataLabelGenerator(532)

	data_label_train, data_label_test, anomaly := SplitDataset(data, labels, ratio)
	trainlen0 := len(data_label_train[0])
	trainlen1 := len(data_label_train[1])
	testlen0 := len(data_label_test[0])
	testlen1 := len(data_label_test[1])

	trainlen := trainlen0 + trainlen1
	testlen := testlen0 + testlen1
	right_train_len := int(float64(len(data)) * ratio)
	right_test_len := int(float64(len(data)) * (1 - ratio))

	fmt.Println("SPLITDATASET METRICSSSSSSS")
	fmt.Println("train length", trainlen)
	fmt.Println("what train length should be", right_train_len)
	fmt.Println("test length", testlen)
	// This line reports the expected *test* length.
	fmt.Println("what test length should be", right_test_len)

	traininganomaly := float64(trainlen1) / float64(trainlen)
	testinganomaly := float64(testlen1) / float64(testlen)

	fmt.Println("real anomaly ratio", anomaly)
	fmt.Println("training set anomaly ratio", traininganomaly)
	fmt.Println("testing set anomaly ratio", testinganomaly)

	return data_label_train, data_label_test, anomaly
}

// TestStratifiedKFold is a manual sanity check for StratifiedKFold: it takes
// the training part produced by TestSplitDataset(0.8), iterates over 5 folds
// and prints, for each fold, the set sizes and the share of label-1 rows in
// the validation set next to the overall anomaly ratio.
func TestStratifiedKFold() {
	data_label_train, _, anomaly := TestSplitDataset(0.8)
	fmt.Println("KFOLD METRICSSSSSSS")

	// Train and validation sizes of every fold should add up to this.
	expectedTotal := len(data_label_train[0]) + len(data_label_train[1])

	for fold := range StratifiedKFold(5, data_label_train) {
		nTrain := len(fold.TrainData)
		nValidate := len(fold.ValidateData)

		fmt.Println("validate data length", nValidate)
		fmt.Println("traindata length", nTrain)
		fmt.Println("total length length", nTrain+nValidate)
		fmt.Println("should be the same as", expectedTotal)

		ones := 0
		for _, lbl := range fold.LabelData {
			if lbl == 1 {
				ones++
			}
		}

		fmt.Println("real anomaly ratio", anomaly)
		fmt.Println("Kfold testing set anomaly ratio", float64(ones)/float64(nValidate))
	}
}

%%

// Run the manual k-fold sanity check; it prints per-fold sizes and ratios.
TestStratifiedKFold()

# Junk Code :( 

In [None]:

// KFold streams index-based train/test partitions of the range
// [0, datasetlen) on the returned channel, one per test fold, then closes it.
//
// Fold boundaries are spaced ceil(datasetlen/K) apart and stored in Kpoints,
// with datasetlen appended as the final boundary.
//
// NOTE(review): this is legacy code. It fills Split.TestIndex and
// Split.TrainIndex, which are not fields of the Split type declared elsewhere
// in this file — confirm which Split definition it was written against.
// NOTE(review): when the boundary loop breaks early, Kpoints holds fewer than
// K+1 entries, so Kpoints[K-1] and Kpoints[testsplit+1] below can index out
// of range (e.g. datasetlen=6, K=5) — verify before reuse.
func KFold(K int, datasetlen int)(ch chan Split){

	// Kpoint is the nominal fold size; Kpoints collects the fold start offsets.
	Kpoint :=  int(math.Ceil(float64(datasetlen)/float64(K)))
	Kpoints:=[]int{0}

	for j:=1; j< K; j++{
		// Stop once the next boundary would reach or pass the end of the data.
		if Kpoints[j-1] + Kpoint >= datasetlen {
			break
		} 
		Kpoints = append(Kpoints, Kpoints[j-1] + Kpoint)
		
	}
	Kpoints = append(Kpoints, datasetlen)

	// Indexes is the identity permutation 0..datasetlen-1 that gets sliced per fold.
    Indexes:=make([]int, datasetlen)
	for i:=0; i<datasetlen; i++{
		Indexes[i]=i
		
	} 

	ch = make(chan Split)
	
	// Producer goroutine: sends one Split per fold on the unbuffered channel.
	go func(){
		var sp *Split
		for testsplit:=0; testsplit<K; testsplit++{
			if testsplit == 0{
				// First fold: test is the leading segment, train is the rest.
				sp = &Split{
					TestIndex: Indexes[:Kpoints[1]],
					TrainIndex:  Indexes[Kpoints[1]:],				
				}

				}else if testsplit == K-1{
					// Last fold: test is the trailing segment, train is everything before it.
					sp = &Split{
					TestIndex: Indexes[Kpoints[K-1]:],
					TrainIndex:  Indexes[:Kpoints[K-1]],
					}
				}else{
					// Middle fold: train is the indexes before and after the
					// test window, copied into a fresh slice.
					before:= Indexes[:Kpoints[testsplit]]
					after:= Indexes[Kpoints[testsplit+1]:]
					concatenated:=make([]int, len(before)+len(after))
					copy(concatenated, before)
    				copy(concatenated[len(before):], after)

					sp = &Split{
					TestIndex: Indexes[Kpoints[testsplit]:Kpoints[testsplit+1]],
					TrainIndex:  concatenated,
				}
			}
			ch <-*sp
		}
		close(ch)
	}()
	return ch

}
// getElements materialises an index-based split.
//
// It returns the rows of dataArray selected by indexTrain, the rows selected
// by indexTest, and the entries of labelsArray selected by indexTest, each in
// index order. Rows are shared with dataArray, not copied.
func getElements(indexTrain []int, indexTest []int, dataArray [][]float64, labelsArray []int) ([][]float64, [][]float64, []int) {
	trainRows := make([][]float64, 0, len(indexTrain))
	for _, idx := range indexTrain {
		trainRows = append(trainRows, dataArray[idx])
	}

	testRows := make([][]float64, 0, len(indexTest))
	testLabels := make([]int, 0, len(indexTest))
	for _, idx := range indexTest {
		testRows = append(testRows, dataArray[idx])
		testLabels = append(testLabels, labelsArray[idx])
	}

	return trainRows, testRows, testLabels
}

 // CrossValidation (legacy variant) loads the dataset and the labels from
 // "training_labels.csv", iterates over 5 index-based folds from KFold, trains
 // and tests an isolation forest on each fold and computes its accuracy and
 // F1 score.
 //
 // NOTE(review): the AccuracyScore and F1Score results are discarded, so the
 // only visible output is the iteration counter.
 // NOTE(review): this shares its name with the CrossValidation defined
 // earlier in the file but has a different signature, and it relies on
 // Split.TrainIndex/TestIndex, which the Split type declared earlier does not
 // have — confirm before re-enabling this cell.
 func CrossValidation(dataset string){
	
	// i is the 1-based fold counter used only for the progress message.
	i:=1
	data:=loadData(dataset)
	labels:=LoadLabels("training_labels.csv")
	for s := range KFold(5,len(data)){
		fmt.Println("%Iteration N.",i)
		i++
		traindata,testdata,labeldata := getElements(s.TrainIndex,s.TestIndex,data,labels)
		// Fixed hyper-parameters: 100 trees, subsample size 1000, outlier ratio 0.8.
		forest:=isoForestTrain_Test(traindata, testdata,100,1000,0.8)
		AccuracyScore(forest.Labels,labeldata)
		F1Score(forest.Labels,labeldata)	
	}

}
%%
/* data := loadData("training.csv")
fmt.Println(data)
//datasetlen:=len(data)
for s := range KFold(5,data){
	fmt.Println(s.TrainData)
	
} */

// Experiment with the gonb cache package: reset the "my_forest" key, then
// obtain the forest through cache.Cache using the builder function below.
// NOTE(review): presumably this avoids retraining between cell runs — confirm
// against the gonb cache documentation.
// NOTE(review): data, configuration and outlier are not defined in this cell;
// the first two are local to an earlier cell and outlier does not appear
// anywhere else in the file — this cell likely does not compile as is.
cache.ResetKey("my_forest")
var forest = cache.Cache("my_forest", func() * iforest.Forest {return isoForestTrain_Test(data,data,configuration.treeNum,configuration.subsamplesize,outlier)})