In [1]:
:import "encoding/csv"

In [2]:
:import "fmt"

In [3]:
:import "github.com/gonum/floats"

In [4]:
:import "github.com/gonum/stat"

In [5]:
:import "github.com/pkg/errors"

In [6]:
:import "github.com/gonum/matrix/mat64"

# Definitions

In [7]:
// RepoData stores repository metrics column-wise, one slice per metric.
// prepareData appends one value to every slice for each CSV record, so
// index i of each slice refers to the same record.
type RepoData struct {
    Forks, Stars, Issues, Size []float64
}

// prepareData reads the CSV file at filename and extracts the fork, issue,
// star and size values of every record into a RepoData.
//
// The values are taken from fixed, zero-based column positions (forks: 3,
// issues: 4, stars: 5, size: 6) and parsed as float64. Every record in the
// file is parsed, so the file must not contain a header row.
//
// On any failure (file cannot be opened, CSV cannot be read, a record is too
// short, or a value is not a valid float) an empty RepoData and an error
// describing the failing step are returned.
func prepareData(filename string) (RepoData, error) {

    // Zero-based positions of the numeric fields within each CSV record.
    const (
        forksCol  = 3
        issuesCol = 4
        starsCol  = 5
        sizeCol   = 6
        minFields = sizeCol + 1
    )

    // Get our csv data
    csvfile, err := os.Open(filename)
    if err != nil {
        return RepoData{}, errors.Wrap(err, "Could not open CSV file")
    }
    defer csvfile.Close()

    reader := csv.NewReader(csvfile)
    // A negative FieldsPerRecord disables the reader's field-count check,
    // so record lengths are validated explicitly below.
    reader.FieldsPerRecord = -1
    rawCSVdata, err := reader.ReadAll()
    if err != nil {
        return RepoData{}, errors.Wrap(err, "Could not read in raw CSV data")
    }

    // The number of records is known up front, so size the slices once.
    data := RepoData{
        Forks:  make([]float64, 0, len(rawCSVdata)),
        Stars:  make([]float64, 0, len(rawCSVdata)),
        Issues: make([]float64, 0, len(rawCSVdata)),
        Size:   make([]float64, 0, len(rawCSVdata)),
    }
    for i, each := range rawCSVdata {
        // Guard the fixed-position lookups against short records, which
        // would otherwise panic with an index out of range.
        if len(each) < minFields {
            return RepoData{}, errors.Errorf(
                "Could not parse CSV record %d: expected at least %d fields, got %d",
                i, minFields, len(each))
        }

        fork, err := strconv.ParseFloat(each[forksCol], 64)
        if err != nil {
            return RepoData{}, errors.Wrap(err, "Could not convert fork value to float")
        }
        stars, err := strconv.ParseFloat(each[starsCol], 64)
        if err != nil {
            return RepoData{}, errors.Wrap(err, "Could not convert stars value to float")
        }
        issues, err := strconv.ParseFloat(each[issuesCol], 64)
        if err != nil {
            return RepoData{}, errors.Wrap(err, "Could not convert issues value to float")
        }
        size, err := strconv.ParseFloat(each[sizeCol], 64)
        if err != nil {
            return RepoData{}, errors.Wrap(err, "Could not convert size value to float")
        }

        data.Forks = append(data.Forks, fork)
        data.Stars = append(data.Stars, stars)
        data.Issues = append(data.Issues, issues)
        data.Size = append(data.Size, size)
    }

    return data, nil

}


# Import Data

In [8]:
// Load the repo metrics. Report a load failure and bail out instead of
// discarding the error and carrying on with empty data.
data, err := prepareData("repodata.csv")
if err != nil {
    fmt.Println(err)
    return
}

main.[32mRepoData[0m{
  [33mForks[0m:  [][32mfloat64[0m{...},
  [33mStars[0m:  [][32mfloat64[0m{...},
  [33mIssues[0m: [][32mfloat64[0m{...},
  [33mSize[0m:   [][32mfloat64[0m{...},
}


# Explore information about stars, issues, size

Now that we have parsed the data from the repos into slices of float64, we can utilize the `floats` and `stat` packages from `gonum` to answer some basic questions about the Go repos committed to Github.

(1) What is the aggregate size of Go repos in GB?

In [9]:
// Total of all repo sizes, scaled down by a factor of 1,000,000.
// NOTE(review): the "GB" label above assumes Size is recorded in KB — confirm against the CSV source.
aggregateSize := floats.Sum(data.Size)/1000000

[35m[1m164.963009[0m


(2) What are the mean and standard deviation of Github issues, respectively?

In [10]:
// Mean of the issue counts; nil is passed for the weights argument.
meanIssues := stat.Mean(data.Issues, nil)

[35m[1m0.759885[0m


In [11]:
// Standard deviation of the issue counts; nil is passed for the weights argument.
stdDevIssues := stat.StdDev(data.Issues, nil)

[35m[1m15.715922[0m


(3) What is the maximum number of stars on a Go repo?

In [12]:
// Largest star count across all parsed repos.
// NOTE(review): floats.Max is documented to panic on an empty slice — confirm data.Stars is non-empty here.
maxStars := floats.Max(data.Stars)

[35m[1m31788.000000[0m


# Something a little more interesting - Principal Components

First get the data into `gonum` matrix form, then calculate the principal components:

In [13]:
var allData []float64
allData = append(allData, data.Forks...)
allData = append(allData, data.Issues...)
allData = append(allData, data.Size...)
allData = append(allData, data.Stars...)

inputData := mat64.NewDense(4, len(data.Forks), allData)
transposedData := inputData.T()

// Calculate the principal component direction vectors
// and variances.
vecs, vars, ok := stat.PrincipalComponents(transposedData, nil)
if !ok {
    return
}

// Project the data onto the first 2 principal components.
k := 2
var proj mat64.Dense
proj.Mul(transposedData, vecs.View(0, 0, 4, k))

That is all it takes to perform PCA with `gonum`.

Here is the 4-dimensional input data (forks, issues, size, stars) projected onto a 2-dimensional space:

In [14]:
// Print the projected matrix with 4 decimal places. The Prefix option pads
// every row after the first so the matrix lines up under "proj = ".
fmt.Printf("proj = %.4f", mat64.Formatted(&proj, mat64.Prefix("       ")))

proj = ⎡    -578.3435       136.1622⎤
       ⎢    -125.1537        60.8076⎥
       ⎢    -111.1413        55.2106⎥
       ⎢     -63.1047        41.2043⎥
       ⎢    -644.1091        40.9536⎥
       ⎢    -124.0742        29.1249⎥
       ⎢     -20.0706        28.0229⎥
       ⎢      -5.0718        28.3896⎥
       ⎢     -34.0537        21.1444⎥
       ⎢    -149.0623        23.1292⎥
       ⎢    -132.0500        19.6943⎥
       ⎢      -9.0516        20.0591⎥
       ⎢    -112.0372        14.6556⎥
       ⎢    -142.0391        15.1245⎥
       ⎢    -356.0309        11.9023⎥
       ⎢   -1284.0271         9.0559⎥
       ⎢    -184.0285        10.9136⎥
       ⎢    -448.0238         8.8781⎥
       ⎢    -140.0259         9.9880⎥
       ⎢    -110.0199         7.7655⎥
       ⎢   -3476.0090        -0.7734⎥
       ⎢    -122.0199         7.7350⎥
       ⎢    -104.0175         6.7957⎥
       ⎢    -518.0216         7.3882⎥
       ⎢    -307.0199         7.1543⎥
       ⎢   -6445.9939       -10.4420⎥
       ⎢    