In [1]:
# Install the koalas package through the Databricks library utilities; it is
# imported further down in this notebook as `databricks.koalas`.
# NOTE(review): no version is pinned here, so a rerun may pick up a different
# koalas release than the one the recorded results were produced with.
dbutils.library.installPyPI("koalas")
# Restart the Python process -- presumably so the freshly installed package
# becomes importable. Anything defined before this point would be lost, which
# is why this cell comes first.
dbutils.library.restartPython()

In [2]:
def run_linearreg(data, rows, cols):
  """Time a scikit-learn ``LinearRegression`` fit on ``data``.

  Args:
    data: DataFrame-like object whose columns are named '0' .. str(cols - 1).
      Column '0' is used as the target, columns '1' onward as the features.
    rows: Unused; kept so every ``run_*`` benchmark shares one signature.
    cols: Total number of columns; the features are '1' .. str(cols - 1).

  Returns:
    Seconds spent selecting the columns and fitting the model (float), or the
    sentinel value 666 if anything raised an ``Exception``.
  """
  import time
  try:
    # Import and build the column list before starting the clock, so that the
    # one-off import cost is not counted as model fitting time.
    from sklearn.linear_model import LinearRegression
    X_cols = [str(i) for i in range(1, cols)]

    start_time = time.perf_counter()
    LinearRegression().fit(data[X_cols], data['0'])
    return time.perf_counter() - start_time
  except Exception:
    # Best-effort benchmark: any ordinary failure is reported as the sentinel.
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit stop
    # the run. Note that an Exception raised from a signal handler while the
    # fit is running (for example a timeout alarm) also ends up here.
    return 666


In [3]:
def run_logregclass(data, rows, cols):
  """Time a scikit-learn ``LogisticRegression`` fit on random binary labels.

  Args:
    data: DataFrame-like object with feature columns named '1' .. str(cols - 1).
      It is only read; the random labels are not written back into it.
    rows: Number of random labels to generate (expected to equal the number
      of rows in ``data``).
    cols: Total number of columns; the features are '1' .. str(cols - 1).

  Returns:
    Seconds spent selecting the feature columns and fitting the model (float),
    or the sentinel value 666 if anything raised an ``Exception``.
  """
  import time
  try:
    from sklearn.linear_model import LogisticRegression
    X_cols = [str(i) for i in range(1, cols)]
    # Uniform randoms rounded to 0.0 / 1.0. Kept in a local array instead of
    # being assigned to data['0'], so the caller's frame is not clobbered.
    labels = np.random.rand(rows).round(0)

    start_time = time.perf_counter()
    LogisticRegression().fit(data[X_cols], labels)
    return time.perf_counter() - start_time
  except Exception:
    # Best-effort benchmark: any ordinary failure is reported as the sentinel.
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit stop
    # the run. Note that an Exception raised from a signal handler while the
    # fit is running (for example a timeout alarm) also ends up here.
    return 666

In [4]:
def run_dectreereg(data, rows, cols):
  """Time a scikit-learn ``DecisionTreeRegressor`` (max_depth=10) fit.

  Args:
    data: DataFrame-like object with feature columns named '1' .. str(cols - 1).
      It is only read; the random target is not written back into it.
    rows: Number of random target values to generate (expected to equal the
      number of rows in ``data``).
    cols: Total number of columns; the features are '1' .. str(cols - 1).

  Returns:
    Seconds spent selecting the feature columns and fitting the model (float),
    or the sentinel value 666 if anything raised an ``Exception``.
  """
  import time
  try:
    # Imported before the clock starts so the import is not part of the timing.
    from sklearn.tree import DecisionTreeRegressor
    X_cols = [str(i) for i in range(1, cols)]
    # Uniform randoms rounded to 0.0 / 1.0. Kept in a local array instead of
    # being assigned to data['0'], so the caller's frame is not clobbered.
    # NOTE(review): a rounded 0/1 target is unusual for a regressor; this
    # mirrors the existing behaviour -- confirm it is intended.
    target = np.random.rand(rows).round(0)

    start_time = time.perf_counter()
    DecisionTreeRegressor(max_depth=10).fit(data[X_cols], target)
    return time.perf_counter() - start_time
  except Exception:
    # Best-effort benchmark: any ordinary failure is reported as the sentinel.
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit stop
    # the run. Note that an Exception raised from a signal handler while the
    # fit is running (for example a timeout alarm) also ends up here.
    return 666

In [5]:
def run_dectreeclass(data, rows, cols):
  """Time a scikit-learn ``DecisionTreeClassifier`` (max_depth=10) fit.

  Args:
    data: DataFrame-like object with feature columns named '1' .. str(cols - 1).
      It is only read; the random labels are not written back into it.
    rows: Number of random labels to generate (expected to equal the number
      of rows in ``data``).
    cols: Total number of columns; the features are '1' .. str(cols - 1).

  Returns:
    Seconds spent selecting the feature columns and fitting the model (float),
    or the sentinel value 666 if anything raised an ``Exception``.
  """
  import time
  try:
    # Imported before the clock starts so the import is not part of the timing.
    from sklearn.tree import DecisionTreeClassifier
    X_cols = [str(i) for i in range(1, cols)]
    # Uniform randoms rounded to 0.0 / 1.0. Kept in a local array instead of
    # being assigned to data['0'], so the caller's frame is not clobbered.
    labels = np.random.rand(rows).round(0)

    start_time = time.perf_counter()
    DecisionTreeClassifier(max_depth=10).fit(data[X_cols], labels)
    return time.perf_counter() - start_time
  except Exception:
    # Best-effort benchmark: any ordinary failure is reported as the sentinel.
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit stop
    # the run. Note that an Exception raised from a signal handler while the
    # fit is running (for example a timeout alarm) also ends up here.
    return 666

In [6]:
def run_naivebayesclass(data, rows, cols):
  """Time a scikit-learn ``GaussianNB`` fit on random binary labels.

  Args:
    data: DataFrame-like object with feature columns named '1' .. str(cols - 1).
      It is only read; the random labels are not written back into it.
    rows: Number of random labels to generate (expected to equal the number
      of rows in ``data``).
    cols: Total number of columns; the features are '1' .. str(cols - 1).

  Returns:
    Seconds spent selecting the feature columns and fitting the model (float),
    or the sentinel value 666 if anything raised an ``Exception``.
  """
  import time
  try:
    # Imported before the clock starts so the import is not part of the timing.
    from sklearn.naive_bayes import GaussianNB
    X_cols = [str(i) for i in range(1, cols)]
    # Uniform randoms rounded to 0.0 / 1.0. Kept in a local array instead of
    # being assigned to data['0'], so the caller's frame is not clobbered.
    labels = np.random.rand(rows).round(0)

    start_time = time.perf_counter()
    GaussianNB().fit(data[X_cols], labels)
    return time.perf_counter() - start_time
  except Exception:
    # Best-effort benchmark: any ordinary failure is reported as the sentinel.
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit stop
    # the run. Note that an Exception raised from a signal handler while the
    # fit is running (for example a timeout alarm) also ends up here.
    return 666

In [7]:
import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a time-limited block runs too long."""

@contextmanager
def time_limit(seconds):
    """Context manager that raises ``TimeoutException`` after ``seconds``.

    A SIGALRM handler that raises ``TimeoutException("Timed out!")`` is
    installed and an alarm is scheduled with ``signal.alarm(seconds)``. On
    exit, whether normal or via an exception, the alarm is cancelled and the
    SIGALRM handler that was active beforehand is put back.

    Args:
        seconds: Whole number of seconds passed to ``signal.alarm``.

    Raises:
        TimeoutException: from the signal handler, if the alarm fires while
            the body of the ``with`` block is still running.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # signal.signal returns the handler it replaces; keep it so the
    # process-wide SIGALRM disposition can be restored afterwards.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    try:
        signal.alarm(seconds)
        yield
    finally:
        signal.alarm(0)  # cancel any alarm that is still pending
        # None means the earlier handler was not installed from Python and
        # cannot be re-installed through signal.signal.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

In [8]:
import pandas as pd
import numpy as np
import time
import databricks.koalas as ks

# Benchmark configuration.
TIME_LIMIT_SECONDS = 300  # budget handed to time_limit for each model
PAUSE_SECONDS = 1         # sleep between consecutive benchmarks
RESULTS_CSV = "/FileStore/tables/sklearn_1n_nodectree_20200311c.csv"
RESULT_COLUMNS = ['model', 'modeltype', 'nodes', 'lib', 'rows', 'cols', 'time']

# One entry per benchmark:
#   (text used in the log line, model id, model type, timing function)
# The two decision-tree entries are disabled; when they were last enabled they
# ran with a 60 second limit and a 10 second pause instead of the values above.
BENCHMARKS = [
  ("linear regresssion", "linearreg", "regression", run_linearreg),
  # ("dectree regresssion", "dectreereg", "regression", run_dectreereg),
  ("logreg classification", "logreg", "classification", run_logregclass),
  # ("dectree classification", "dectreeclass", "classification", run_dectreeclass),
  ("naivebayes classification", "naivebayes", "classification", run_naivebayesclass),
]


def _run_benchmark(label, model, modeltype, run, data, rows, cols):
  """Run one timing function under the time limit and return its result row.

  Args:
    label: Text inserted into the "running ..." log line.
    model: Value stored under the "model" key of the result.
    modeltype: Value stored under the "modeltype" key of the result.
    run: Callable taking (data, rows, cols) and returning elapsed seconds.
    data: Frame handed to ``run``.
    rows: Row count, logged and recorded in the result.
    cols: Column count, logged and recorded in the result.

  Returns:
    dict with the keys model, modeltype, rows, cols, time, nodes and lib.
    "time" is NaN when the time limit fired before ``run`` returned a value.
  """
  print("running {} on dataframe of {} rows x {} cols".format(label, rows, cols))
  # Start from NaN so a timeout never records an undefined value or the
  # elapsed time left over from the previous model.
  elapsed = float("nan")
  try:
    with time_limit(TIME_LIMIT_SECONDS):
      elapsed = run(data, rows, cols)
      print('executed in {} seconds'.format(round(elapsed, 3)))
  except TimeoutException:
    print("Timed out!")
  return {"model": model, "modeltype": modeltype, "rows": rows, "cols": cols,
          "time": elapsed, "nodes": 1, "lib": "sklearn"}


rows_list = []

for rows in range(1000000,1000001,100000):
  for cols in range(1000,1001,100):
    print("generating a dataframe of {} rows x {} cols".format(rows,cols))
    data = pd.DataFrame(np.random.rand(rows, cols))
    # Column labels become '0', '1', ... because the run_* functions select
    # columns by string name.
    data.columns = data.columns.astype(str)

    for label, model, modeltype, run in BENCHMARKS:
      rows_list.append(_run_benchmark(label, model, modeltype, run, data, rows, cols))
      time.sleep(PAUSE_SECONDS)

    # The CSV is rewritten after every grid point -- presumably so partial
    # results are kept if a later, larger configuration fails.
    df = ks.DataFrame(rows_list, columns=RESULT_COLUMNS)
    df.to_csv(RESULTS_CSV)

print(rows_list)
    