In [1]:
using CUDA

In [2]:
# Build a two-element CuArray from a host vector (the cell output below shows a
# CuArray{Int64,1}); `d` is not referenced again in the visible cells.
d = CuArray([5,10])

2-element CuArray{Int64,1}:
  5
 10

In [3]:
using DataFrames
# Abstract supertype for linear models; `LinearRegression` below subtypes it and
# keeps its per-feature sub-models in an `Array{LinearModel}` field.
abstract type LinearModel end
"""
    LinearRegression(x::Array, y::Array; cuda = false)
    LinearRegression(x::DataFrame, y::Array)

Single-feature least-squares linear regression of the form `A + B * x`.

The coefficients are obtained from the closed-form expressions

    A = (Σy * Σx² - Σx * Σxy) / (n * Σx² - (Σx)²)
    B = (n * Σxy - Σx * Σy)   / (n * Σx² - (Σx)²)

# Fields
- `A`: intercept.
- `B`: slope.
- `predict`: closure that maps new inputs to predictions.
- `regressors`: per-column sub-models (empty for the array constructor).

# Arguments
- `x`: feature values; either a single array or a `DataFrame` whose columns are
  each fitted separately against `y`.
- `y`: target values; must have the same length as each feature.

# Keywords
- `cuda`: when `true`, `x` and `y` are copied into `CuArray`s before the sums and
  element-wise products are evaluated. Only available on the array constructor.

# Throws
- `ArgumentError` if `x` and `y` differ in length, or if the `DataFrame` has no
  columns.
"""
mutable struct LinearRegression{P} <: LinearModel
    A::Float64
    B::Float64
    predict::P
    regressors::Array{LinearModel}
    function LinearRegression(x::Array, y::Array; cuda = false)
        # Validate first so that mismatched inputs are rejected before any
        # host-to-device copy is attempted.
        if length(x) != length(y)
            throw(ArgumentError("The array shape does not match!"))
        end
        # Separate names keep `x`/`y` from being rebound to a different type.
        xs, ys = cuda == true ? (CuArray(x), CuArray(y)) : (x, y)
        # n = sample size
        n = length(xs)
        # Summations used by both coefficients.
        Σx = sum(xs)
        Σy = sum(ys)
        Σxy = sum(xs .* ys)
        Σx2 = sum(xs .^ 2)
        # Shared denominator, computed once.
        denom = (n * Σx2) - (Σx^2)
        a = ((Σy * Σx2) - (Σx * Σxy)) / denom
        b = ((n * Σxy) - (Σx * Σy)) / denom
        # `a` and `b` are scalars, so prediction runs on a host array even
        # when the fit used the GPU.
        predict(xt::Array) = [a + (b * xi) for xi in xt]
        return new{typeof(predict)}(a, b, predict, LinearModel[])
    end
    function LinearRegression(x::DataFrame, y::Array)
        # Fit one single-feature model per column. `eachcol` is needed because
        # a DataFrame cannot be iterated directly.
        regressors = LinearModel[LinearRegression(feature, y) for feature in eachcol(x)]
        if isempty(regressors)
            throw(ArgumentError("The DataFrame has no feature columns!"))
        end
        # Summary coefficients: arithmetic mean of the per-column intercepts
        # and slopes (fields are `A`/`B`).
        a = sum(m.A for m in regressors) / length(regressors)
        b = sum(m.B for m in regressors) / length(regressors)
        # NOTE(review): `_compare_predCon` is neither defined nor imported in the
        # visible part of this file; it must be in scope before `predict` is
        # called. It now receives the fitted `regressors`.
        predict(xt::DataFrame) = _compare_predCon(regressors, xt)
        return new{typeof(predict)}(a, b, predict, regressors)
    end
end

In [4]:
using Lathe.preprocess: TrainTestSplit

In [5]:
# Synthetic data set: two columns, `A` and `B`, each holding 5,000,000 draws from `randn`.
df = DataFrame(:A => randn(5000000), :B => randn(5000000))

Unnamed: 0_level_0,A,B
Unnamed: 0_level_1,Float64,Float64
1,0.577005,-0.971065
2,-1.12369,-0.646088
3,1.45753,-1.08604
4,0.239653,1.09619
5,-0.692247,-0.282517
6,-0.702029,1.48493
7,-0.023858,0.701455
8,1.10218,-1.17157
9,0.567547,0.134177
10,-0.154495,-0.614274


In [6]:
# Split `df` into a train and a test DataFrame with Lathe's `TrainTestSplit`, using its
# default arguments (the output below shows 3,747,974 train and 1,252,026 test rows).
train, test = TrainTestSplit(df)

([1m3747974×2 DataFrame[0m
[1m     Row [0m│[1m A           [0m[1m B         [0m
[1m         [0m│[90m Float64     [0m[90m Float64   [0m
─────────┼────────────────────────
       1 │  0.577005    -0.971065
       2 │ -1.12369     -0.646088
       3 │  1.45753     -1.08604
       4 │  0.239653     1.09619
       5 │ -0.692247    -0.282517
       6 │ -0.702029     1.48493
       7 │ -0.023858     0.701455
       8 │  0.567547     0.134177
       9 │ -0.0206048   -1.47767
      10 │  0.280465     1.80941
      11 │  0.317778    -0.458131
    ⋮    │      ⋮           ⋮
 3747965 │  0.919352    -0.999579
 3747966 │ -0.438849    -0.791899
 3747967 │ -0.655646     1.32953
 3747968 │  1.57285     -0.182869
 3747969 │ -0.499785     0.483187
 3747970 │  0.110402     0.376069
 3747971 │  1.29261     -1.77421
 3747972 │ -1.14882     -0.419361
 3747973 │  2.11255     -0.997588
 3747974 │  0.659585     0.158184
[36m              3747953 rows omitted[0m, [1m1252026×2 DataFrame[0m
[1m  

In [7]:
# Time a CPU fit of column B on column A, followed by prediction on the test column A.
# NOTE(review): this looks like the first call in the session, so the figure presumably
# includes compilation time — confirm by running the cell a second time.
@time LinearRegression(train[!, :A], train[!, :B]).predict(test[!, :A])

  5.799333 seconds (20.39 M allocations: 1.069 GiB, 4.14% gc time)


1252026-element Array{Float64,1}:
 0.0007593812368251255
 0.0006031036239309942
 0.0007447593375925508
 0.000650084630560349
 0.0005760612869706101
 0.0005697558771464991
 0.000554673367806285
 0.0008102356529406358
 0.0006110969318424915
 0.0007474061098006959
 0.0006052051410636166
 0.0007227634485248923
 0.00047798855040247766
 ⋮
 0.0006798954610642182
 0.000297086561590541
 0.0004773663033831785
 0.0005580751920272239
 0.0005050876651470564
 0.0005989512879201476
 0.0004971244974083756
 0.0008261441700263702
 0.000630211015944771
 0.0008306262880419847
 0.0006089065807559879
 0.00041552772483634547

In [8]:
# Same fit with `cuda = true`: the constructor copies both inputs into CuArrays for the
# summations, while `predict` still iterates over a host `Array` with scalar coefficients.
# NOTE(review): the timing presumably includes the host-to-device copies and first-call
# GPU compilation — confirm before comparing against the CPU figure.
CUDA.@time LinearRegression(train[!, :A], train[!, :B], cuda = true).predict(test[!, :A])

  9.127757 seconds (13.78 M CPU allocations: 714.041 MiB, 1.30% gc time) (12 GPU allocations: 114.379 MiB, 2.06% gc time of which 0.65% spent allocating)


1252026-element Array{Float64,1}:
 0.0007593812368251252
 0.0006031036239309938
 0.0007447593375925504
 0.0006500846305603486
 0.0005760612869706097
 0.0005697558771464987
 0.0005546733678062846
 0.0008102356529406353
 0.0006110969318424911
 0.0007474061098006955
 0.0006052051410636161
 0.0007227634485248919
 0.0004779885504024772
 ⋮
 0.0006798954610642178
 0.0002970865615905405
 0.000477366303383178
 0.0005580751920272234
 0.000505087665147056
 0.0005989512879201472
 0.0004971244974083752
 0.0008261441700263697
 0.0006302110159447706
 0.0008306262880419844
 0.0006089065807559874
 0.000415527724836345

In [9]:
# Draw a fresh synthetic data set and split it again; this rebinds `df`, `train` and `test`.
df = DataFrame(:A => randn(5000000), :B => randn(5000000))
train, test = TrainTestSplit(df)

([1m3748152×2 DataFrame[0m
[1m     Row [0m│[1m A          [0m[1m B          [0m
[1m         [0m│[90m Float64    [0m[90m Float64    [0m
─────────┼────────────────────────
       1 │  0.607555   -1.67637
       2 │  0.169965   -0.677239
       3 │  1.68827    -0.0997401
       4 │ -0.110792    0.392875
       5 │  0.952066    0.140255
       6 │  0.809013    0.350373
       7 │  0.725505    0.582751
       8 │  0.702776    0.898695
       9 │ -1.15123     2.15362
      10 │  0.402691   -0.0669361
      11 │  0.357018    1.39073
    ⋮    │     ⋮           ⋮
 3748143 │  0.593798    0.393911
 3748144 │ -0.45311     0.667211
 3748145 │ -0.764961    0.411348
 3748146 │  0.0594568  -0.407051
 3748147 │  0.111333    0.818817
 3748148 │ -1.65268    -0.120496
 3748149 │ -1.15869     0.335531
 3748150 │ -2.16815     1.77178
 3748151 │  0.72471     0.425346
 3748152 │ -0.469236   -0.59999
[36m              3748131 rows omitted[0m, [1m1251848×2 DataFrame[0m
[1m     Row [0m│[1m A