In [4]:
# INSPIRED BY: https://docs.microsoft.com/en-us/sql/advanced-analytics/tutorials/python-ski-rental-linear-regression?view=sql-server-ver15

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from revoscalepy import RxComputeContext, RxInSqlServer, RxSqlServerData
from revoscalepy import rx_import

conn_str = 'Driver=SQL Server;Server=DEVNOTEBOOK\MSSQLSERVER01;Database=ML;Trusted_Connection=True;'

column_info = {
         "ParcelId" : { "type" : "integer" },
         "Swis" : { "type" : "integer" },
         "TotalAV" : { "type" : "integer" },                  
         "Acres" : { "type" : "integer" },         
         "Zip" : { "type" : "integer" }
     }

# Get data in intermediate format (I think)
data_source = RxSqlServerData(
    table="dbo.AssessmentTrainingDataLinReg", 
    connection_string=conn_str, 
    column_info=column_info
)
# Set up compute context
RxInSqlServer(
    connection_string=conn_str, 
    num_tasks=1, 
    auto_cleanup=False
)
# convert data to pandas dataframe
df = pd.DataFrame(rx_import(input_data = data_source))

# Create list of all column names from the table
columns = df.columns.tolist()

# Name of the column containing data we want to predict
predictionTarget = "TotalAV"

training_set = df.sample(
    frac=0.8, 
    random_state=1
)

# "Use anything not present in the training set for the testing set."
testing_set = df.loc[~df.index.isin(training_set.index)]

print(
    "Training data shape:", 
    training_set.shape
)
print(
    "Testing data shape:", 
    testing_set.shape
)

linearReg_model = LinearRegression()

# Fit the model to the training data.
linearReg_model.fit(training_set[columns], training_set[predictionTarget])

# Generate our predictions for the test set.
linearReg_predictions = linearReg_model.predict(testing_set[columns])
print(
    "Model predictions:", 
    linearReg_predictions
)

error = mean_squared_error(
    linearReg_predictions, 
    testing_set[predictionTarget]
)
print(
    "Error:", 
    error
)


Rows Read: 59932, Total Rows Processed: 59932, Total Chunk Time: 0.200 seconds 
Training data shape: (47946, 5)
Testing data shape: (11986, 5)
Model predictions: [  69300.  322000.   90000. ...,   95000.  129000.  146100.]
Error: 2.55044250259e-18
