In [0]:
# Data processing
import pandas as pd
import numpy as np
from pyspark.sql.functions import log, col, exp, when, col, avg, count
from pyspark.sql.functions import weekofyear, month, quarter, year
from pyspark.sql.window import Window 
import pyspark.sql.functions as func
# Modeling
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
# MLflow
import mlflow
import mlflow.spark
from mlflow.tracking import MlflowClient



In [0]:
%run ./utils/Forecasting_Utils

In [0]:
kbm = KeyBrandModel()
testss = {'weekofyear': [3], 'month': [1], 'quarter':[1], 'year':[2021]}
kbm.predict(testss, 'WY', 'DR PEPPER')

Out[48]: [8.03576760554597]

In [0]:
kpm = KeyPackageModel()
testss = {'weekofyear': [3], 'month': [1], 'quarter':[1], 'year':[2021]}
kpm.predict(testss, 'WY', '2L SINGLE BOTTLE')

Out[49]: [2.5338889419199404]

In [0]:
# Enable autolog()
mlflow.sklearn.autolog()
# With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged. 


dep_var = 'avg_standard_physical_volume'
indep_vars = ['weekofyear', 'month', 'quarter', 'year']
cnt = 0        
for state in kbm.states_list:
    for brand in kbm.keys_list:
        cnt +=1
        kbm.get_model_name(state, brand)
        
        df = kbm.get_filtered_series(state, brand)
        df = kbm.append_time_features(df)
        sdf = df.select(indep_vars + [dep_var])
        
        print(f"#### working on model {model_name} ####")
        print(f"### model {cnt} out of {len(kbm.states_list)*len(kbm.keys_list)} models")
        
        # Train test split
        trainDF, testDF = sdf.randomSplit([.8, .2], seed=42)
        # Print the number of records
        print(f'There are {trainDF.cache().count()} records in the training dataset.')
        print(f'There are {testDF.cache().count()} records in the testing dataset.')

        if trainDF.count()==0 or testDF.count()==0:
            print('^^^ This Dataset has zero data points and will be excluded')
            continue
            
        with mlflow.start_run(run_name=model_name) as run:
            # Define pipeline
            elastic_net_param = 0.5
            vecAssembler = VectorAssembler(inputCols=indep_vars, outputCol="features")
            lr = LinearRegression(featuresCol="features", labelCol=dep_var, predictionCol="prediction",elasticNetParam=elastic_net_param)
            pipeline = Pipeline(stages=[vecAssembler, lr])
            pipelineModel = pipeline.fit(trainDF)
            # Log parameters
            mlflow.log_param("target_variable", dep_var)
            mlflow.log_param("elasticNetParam", elastic_net_param)
            # Log the model for this run
            mlflow.spark.log_model(pipelineModel, "SparkML-linear-regression")
            # Make predictions
            predDF = pipelineModel.transform(testDF)    

            # Evaluate predictions
            regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol=dep_var)
            rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
            r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
            mse = regressionEvaluator.setMetricName("mse").evaluate(predDF)
            mae = regressionEvaluator.setMetricName("mae").evaluate(predDF)

            # Log metrics
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2", r2)    
            mlflow.log_metric("mse", mse)    
            mlflow.log_metric("mae", mae)    
            print(f'r2 is {r2}')
            print(f'rmse is {rmse}')
            print(f'mse is {mse}')
            print(f'mae is {mae}')
            print(f'*************************************')

In [0]:
get_model_name = lambda state, keyVal: (state + '_' + keyVal).replace(" ", "_").replace(".", "_")

model_name('AZ', 'COKE')

Out[14]: 'AZ_COKE'

In [0]:
model_list = []
for state in kbm.states_list:
    for brand in kbm.keys_list:
        model_name = get_model_name(state,brand) 
        model_list.append(model_name)
        
print(len(model_list))
model_list

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-4394589272978365>:2[0m
[1;32m      1[0m model_list [38;5;241m=[39m []
[0;32m----> 2[0m [38;5;28;01mfor[39;00m state [38;5;129;01min[39;00m kbm[38;5;241m.[39mstates_list:
[1;32m      3[0m     [38;5;28;01mfor[39;00m brand [38;5;129;01min[39;00m kbm[38;5;241m.[39mkeys_list:
[1;32m      4[0m         model_name [38;5;241m=[39m get_model_name(state,brand) 

[0;31mNameError[0m: name 'kbm' is not defined

In [0]:
def make_prediction(model_params_dict: dict, state: str, keyCol: str):
    run_name_col = 'run_name'
    experiments = mlflow.MlflowClient().search_experiments()
    experiment_ids = [dict(x).get('experiment_id') for x in experiments if dict(x).get('name').find('keyBrand') >=0]
    
    runs_df = mlflow.search_runs(experiment_ids, order_by=["tags.mlflow.runName ASC"]).\
            rename({'tags.mlflow.runName':run_name_col}, axis=1)[['run_id',run_name_col]]
    
    
    model_name = kbm.get_model_name(state, keyCol)
    run_id = runs_df[runs_df[run_name_col] == model_name]['run_id'].values[0]
    
    model_uri = f'runs:/{run_id}/SparkML-linear-regression'
    
    # Load model as a PyFuncModel.
    model = mlflow.pyfunc.load_model(model_uri)
    
    # Predict on a Pandas DataFrame.
    results = model.predict(pd.DataFrame(model_params_dict))
    
    return results
    

make_prediction(testss, 'WY', 'COKE')

Out[94]: [16.60153277201789]

In [0]:
T

Out[93]: [16.60153277201789]

In [0]:
make_prediction(testss, 'AZ', 'COKE')

Out[95]: [16.60153277201789]

In [0]:
experiments = mlflow.MlflowClient().search_experiments()
#mlf_client
experiment_ids = [dict(x).get('experiment_id') for x in experiments if dict(x).get('name').find('keyBrand') >=0]
runs_df = mlflow.search_runs(experiment_ids, order_by=["tags.mlflow.runName ASC"]).\
            rename({'tags.mlflow.runName':'run_name'}, axis=1)[['run_id','run_name']]
display(runs_df)

run_id,run_name
4435f593e60c4dc8afec8c6996ec97fe,AZ_CHERRY_COKE
2e31412305ca41f280a981bc4fb9b178,AZ_CINNAMON_COKE
5839cc0e725a49e08210e7e0f59b8110,AZ_COCA-COLA_DREAMWORLD
23bcb4eed9c24bca8566e04afbe63c4a,AZ_COKE
7cebcd6f493c4273be56c78ec37a264b,AZ_COKE_CF
d37fa577d28a4309849ca2e269755534,AZ_COKE_CHERRY_VANILLA
fba7fa3ceb634959bc39cbb64348bb50,AZ_COKE_STARLIGHT
6856e99931ee4fa19c89b4263354bd7a,AZ_COKE_W/LIME
4cf082c7ffc74ce69637864a4db63a1a,AZ_DIET_DR_PEPPER
f1a692a181e24ef7b776e2b579c74ee5,AZ_DIET_DR_PEPPER_10


In [0]:
runs_df[runs_df['run_name'] == 'AZ_CHERRY_COKE']['run_name'][0]

Out[64]: 'AZ_CHERRY_COKE'

In [0]:
pd.DataFrame(testss)

Unnamed: 0,weekofyear,month,quarter,year
0,3,1,1,2021


In [0]:
testss = {'weekofyear': [3], 'month': [1], 'quarter':[1], 'year':[2021]}

#logged_model = 'runs:/e8444b254d1c4e22a475cf130203110a/SparkML-linear-regression'
logged_model = 'runs:/b1ddd20d82f14ac5a850176d5d0908ce/SparkML-linear-regression'


# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
loaded_model.predict(pd.DataFrame(testss))


Out[2]: [1.0633873032303427]

In [0]:
runs_df.drop_duplicates()

Unnamed: 0,run_id,tags.mlflow.runName
0,4435f593e60c4dc8afec8c6996ec97fe,AZ_CHERRY_COKE
1,2e31412305ca41f280a981bc4fb9b178,AZ_CINNAMON_COKE
2,5839cc0e725a49e08210e7e0f59b8110,AZ_COCA-COLA_DREAMWORLD
3,23bcb4eed9c24bca8566e04afbe63c4a,AZ_COKE
4,7cebcd6f493c4273be56c78ec37a264b,AZ_COKE_CF
...,...,...
237,1a0466e0b4df4f6a98c4e00bd66577c4,WY_DR_PEPPER_ZERO_SUGAR
238,3eea4f56c6a14bb4a5587f2a20143f51,WY_MEXICAN_COKE
239,9e39df539a264219ab775c5e422bb3a3,WY_ORANGE_VANILLA_COKE
240,2d66db2be2184dafbd9f9045232cfe54,WY_REMAINING_BRAND


In [0]:
df[]

run_id,tags.mlflow.runName
4435f593e60c4dc8afec8c6996ec97fe,AZ_CHERRY_COKE
2e31412305ca41f280a981bc4fb9b178,AZ_CINNAMON_COKE
5839cc0e725a49e08210e7e0f59b8110,AZ_COCA-COLA_DREAMWORLD
23bcb4eed9c24bca8566e04afbe63c4a,AZ_COKE
7cebcd6f493c4273be56c78ec37a264b,AZ_COKE_CF
d37fa577d28a4309849ca2e269755534,AZ_COKE_CHERRY_VANILLA
fba7fa3ceb634959bc39cbb64348bb50,AZ_COKE_STARLIGHT
6856e99931ee4fa19c89b4263354bd7a,AZ_COKE_W/LIME
4cf082c7ffc74ce69637864a4db63a1a,AZ_DIET_DR_PEPPER
f1a692a181e24ef7b776e2b579c74ee5,AZ_DIET_DR_PEPPER_10


In [0]:
experiment_id = 142448669977004
df = mlflow.search_runs([experiment_id], order_by=["metrics.rmse DESC"])
df['tags.mlflow.runName'].values

Out[90]: array(['NM_COKE', 'AZ_COKE', 'NM_DIET_DR_PEPPER_CREAM_SODA',
       'CA_DR_PEPPER_ZERO_SUGAR', 'OR_COKE', 'ID_COKE', 'NE_COKE',
       'CA_DR_PEPPER_CREAM_SODA', 'WY_COKE',
       'CA_DR_PEPPER_CREAM_SODA_ZERO_SUGAR', 'NM_DR_PEPPER', 'WA_COKE',
       'NE_DR_PEPPER', 'CA_COKE', 'WY_DR_PEPPER', 'CO_COKE',
       'NM_DIET_DR_PEPPER_10', 'NM_DR_PEPPER_DARK_BERRY', 'CA_DR_PEPPER',
       'ID_DIET_DR_PEPPER_CREAM_SODA', 'CA_DIET_DR_PEPPER_CREAM_SODA',
       'WY_DIET_DR_PEPPER_CREAM_SODA',
       'ID_DR_PEPPER_CREAM_SODA_ZERO_SUGAR', 'OR_DIET_DR_PEPPER_CF',
       'WY_DR_PEPPER_CREAM_SODA_ZERO_SUGAR', 'AZ_DR_PEPPER',
       'NM_DR_PEPPER_CREAM_SODA', 'CO_DIET_DR_PEPPER_CREAM_SODA',
       'CO_DIET_DR_PEPPER_CHERRY', 'NM_DR_PEPPER_CREAM_SODA_ZERO_SUGAR',
       'OR_DR_PEPPER_CREAM_SODA_ZERO_SUGAR',
       'NM_DR_PEPPER_CHERRY_ZERO_SUGAR', 'WA_DR_PEPPER_DARK_BERRY',
       'WA_DR_PEPPER_CREAM_SODA_ZERO_SUGAR', 'NE_DIET_DR_PEPPER_CF',
       'WA_DIET_DR_PEPPER_CREAM_SODA', 'ID_DR_PEPP

Out[80]: ['142448669977004']

In [0]:
[x for x in if dict(experiments[0]).get('name').find('keyBrand') > 0]

Out[75]: 50

In [0]:
msk = 'model_state_keyBrand'
mlflow.get_experiment_by_name(msk).name

Unexpected exception formatting exception. Falling back to standard exception
Traceback (most recent call last):
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<command-4394589272978372>", line 2, in <module>
    mlflow.get_experiment_by_name(msk).name
AttributeError: 'NoneType' object has no attribute 'name'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 1997, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/ultratb.py", line 1006, in structured_traceback
    return



In [0]:
RN = 'WY_VANILLA_COKE'
mlflow.search_runs()

[0;31m---------------------------------------------------------------------------[0m
[0;31mRestException[0m                             Traceback (most recent call last)
File [0;32m<command-4394589272978406>:2[0m
[1;32m      1[0m RN [38;5;241m=[39m [38;5;124m'[39m[38;5;124mWY_VANILLA_COKE[39m[38;5;124m'[39m
[0;32m----> 2[0m mlflow[38;5;241m.[39msearch_runs(RN)

File [0;32m/databricks/python/lib/python3.9/site-packages/mlflow/tracking/fluent.py:1448[0m, in [0;36msearch_runs[0;34m(experiment_ids, filter_string, run_view_type, max_results, order_by, output_format, search_all_experiments, experiment_names)[0m
[1;32m   1438[0m [38;5;28;01mdef[39;00m [38;5;21mpagination_wrapper_func[39m(number_to_get, next_page_token):
[1;32m   1439[0m     [38;5;28;01mreturn[39;00m MlflowClient()[38;5;241m.[39msearch_runs(
[1;32m   1440[0m         experiment_ids,
[1;32m   1441[0m         filter_string,
[0;32m   (...)[0m
[1;32m   1445[0m         next_page_token,
[

In [0]:
experiment_id = 142448669977004
df = mlflow.search_runs([experiment_id], order_by=["metrics.rmse DESC"])
df['tags.mlflow.runName'].values

Out[34]: array(['NM_COKE', 'AZ_COKE', 'NM_DIET_DR_PEPPER_CREAM_SODA',
       'CA_DR_PEPPER_ZERO_SUGAR', 'OR_COKE', 'ID_COKE', 'NE_COKE',
       'CA_DR_PEPPER_CREAM_SODA', 'WY_COKE',
       'CA_DR_PEPPER_CREAM_SODA_ZERO_SUGAR', 'NM_DR_PEPPER', 'WA_COKE',
       'NE_DR_PEPPER', 'CA_COKE', 'WY_DR_PEPPER', 'CO_COKE',
       'NM_DIET_DR_PEPPER_10', 'NM_DR_PEPPER_DARK_BERRY', 'CA_DR_PEPPER',
       'ID_DIET_DR_PEPPER_CREAM_SODA', 'CA_DIET_DR_PEPPER_CREAM_SODA',
       'WY_DIET_DR_PEPPER_CREAM_SODA',
       'ID_DR_PEPPER_CREAM_SODA_ZERO_SUGAR', 'OR_DIET_DR_PEPPER_CF',
       'WY_DR_PEPPER_CREAM_SODA_ZERO_SUGAR', 'AZ_DR_PEPPER',
       'NM_DR_PEPPER_CREAM_SODA', 'CO_DIET_DR_PEPPER_CREAM_SODA',
       'CO_DIET_DR_PEPPER_CHERRY', 'NM_DR_PEPPER_CREAM_SODA_ZERO_SUGAR',
       'OR_DR_PEPPER_CREAM_SODA_ZERO_SUGAR',
       'NM_DR_PEPPER_CHERRY_ZERO_SUGAR', 'WA_DR_PEPPER_DARK_BERRY',
       'WA_DR_PEPPER_CREAM_SODA_ZERO_SUGAR', 'NE_DIET_DR_PEPPER_CF',
       'WA_DIET_DR_PEPPER_CREAM_SODA', 'ID_DR_PEPP

In [0]:
run_names_list = df['tags.mlflow.runName'].values
print(len(run_names_list))

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-4394589272978350>:1[0m
[0;32m----> 1[0m run_names_list [38;5;241m=[39m df[[38;5;124m'[39m[38;5;124mtags.mlflow.runName[39m[38;5;124m'[39m][38;5;241m.[39mvalues
[1;32m      2[0m [38;5;28mprint[39m([38;5;28mlen[39m(run_names_list))

[0;31mNameError[0m: name 'df' is not defined

In [0]:
import requests
response = requests.get("https://adb-4647227049398975.15.azuredatabricks.net/?o=4647227049398975#mlflow/experiments")

In [0]:
b1ddd20d82f14ac5a850176d5d0908ce

[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-4394589272978411>:1[0m
[0;32m----> 1[0m [38;5;28;01mfrom[39;00m [38;5;21;01mdatabricks_api[39;00m [38;5;28;01mimport[39;00m DatabricksAPI

File [0;32m/databricks/python_shell/dbruntime/PythonPackageImportsInstrumentation/__init__.py:171[0m, in [0;36m_create_import_patch.<locals>.import_patch[0;34m(name, globals, locals, fromlist, level)[0m
[1;32m    166[0m thread_local[38;5;241m.[39m_nest_level [38;5;241m+[39m[38;5;241m=[39m [38;5;241m1[39m
[1;32m    168[0m [38;5;28;01mtry[39;00m:
[1;32m    169[0m     [38;5;66;03m# Import the desired module. If you’re seeing this while debugging a failed import,[39;00m
[1;32m    170[0m     [38;5;66;03m# look at preceding stack frames for relevant error information.[39;00m
[0;32m--> 171[0m     original_result [38;5;241m=[3