# Azure AutoML 

## Install the Azure AutoML SDK

Add the `azureml-sdk[automl_databricks]` library to your Databricks cluster before running this notebook.

In [2]:
# Declare the notebook's text widgets (parameter name -> default value).
# ACCOUNT_KEY defaults to an empty string.
for _widget_name, _widget_default in [
    ("STORAGE_ACCOUNT", "azureailabs"),
    ("CONTAINER", "ingest"),
    ("ACCOUNT_KEY", ""),
]:
    dbutils.widgets.text(_widget_name, _widget_default)

In [3]:
# Load the raw CSV data from Azure Blob Storage.

# Fetch the three widget values in one pass, trimming surrounding whitespace.
STORAGE_ACCOUNT, CONTAINER, ACCOUNT_KEY = (
    dbutils.widgets.get(widget_name).strip()
    for widget_name in ("STORAGE_ACCOUNT", "CONTAINER", "ACCOUNT_KEY")
)

# The Spark account-key setting is only written when a key was supplied;
# with an empty key the configuration is left untouched.
if ACCOUNT_KEY:
    conf_key = "fs.azure.account.key.{storage_acct}.blob.core.windows.net".format(
        storage_acct=STORAGE_ACCOUNT
    )
    spark.conf.set(conf_key, ACCOUNT_KEY)

# wasbs:// URI pointing at the root of the container.
source_str = "wasbs://{container}@{storage_acct}.blob.core.windows.net/".format(
    container=CONTAINER,
    storage_acct=STORAGE_ACCOUNT,
)

# First look at the data: read the CSV files under the container root,
# using the header row for column names and letting Spark infer the types.
df = spark.read.options(header=True, inferSchema=True).csv(source_str)
display(df)


In [4]:
# Import only the type classes this cell needs instead of a wildcard import,
# so it is clear where each name comes from.
from pyspark.sql.types import DoubleType, StringType, StructField, StructType

# Explicit schema for the churn dataset: numeric measures are read as doubles,
# identifiers and categorical attributes as strings.
# NOTE(review): Spark's CSV reader applies a user-supplied schema by column
# position by default, so the field order below must match the column order
# of the source files - confirm against the file header.
schema = StructType([
  StructField("age", DoubleType()),
  StructField("annualincome", DoubleType()),
  StructField("calldroprate", DoubleType()),
  StructField("callfailurerate", DoubleType()),
  StructField("callingnum", StringType()),
  StructField("customerid", StringType()),
  StructField("customersuspended",  StringType()),
  StructField("education",  StringType()),
  StructField("gender", StringType()),
  StructField("homeowner", StringType()),
  StructField("maritalstatus", StringType()),
  StructField("monthlybilledamount", DoubleType()),
  StructField("noadditionallines", StringType()),
  StructField("numberofcomplaints", DoubleType()),
  StructField("numberofmonthunpaid", DoubleType()),
  StructField("numdayscontractequipmentplanexpiring", DoubleType()),
  StructField("occupation", StringType()),
  StructField("penaltytoswitch", DoubleType()),
  StructField("state", StringType()),
  StructField("totalminsusedinlastmonth", DoubleType()),
  StructField("unpaidbalance", DoubleType()),
  StructField("usesinternetservice", StringType()),
  StructField("usesvoiceservice", StringType()),
  StructField("percentagecalloutsidenetwork", DoubleType()),
  StructField("totalcallduration", DoubleType()),
  StructField("avgcallduration", DoubleType()),
  StructField("churn", DoubleType()),
  StructField("year", DoubleType()),
  StructField("month", DoubleType())
])

# Re-read the same source with the explicit schema in place of type inference.
df = (spark.read
     .option("header", True)
     .schema(schema)
     .csv(source_str))

display(df)

In [5]:
# Rows to discard: age below 14 combined with an annual income above 10000.
implausible_record = (df["age"] < 14) & (df["annualincome"] > 10000)

# Drop the identifier and date columns, remove exact duplicate rows,
# then keep only the rows that are not flagged above.
clean_df = (
    df.drop("customerid", "callingnum", "year", "month")
      .dropDuplicates()
      .where(~implausible_record)
)

# Row count per churn value.
display(clean_df.groupBy("churn").count())

In [6]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
# Explicit import instead of a wildcard: StringType is the only type used here.
from pyspark.sql.types import StringType
from pyspark.ml import Pipeline

# Names of all string-typed columns in the cleaned frame.
# isinstance() is used rather than comparing against a fresh StringType()
# instance, so the check does not depend on instance-equality semantics.
stringCols = [field.name for field in clean_df.schema if isinstance(field.dataType, StringType)]

# One StringIndexer per string column; each writes to "<column>_idx".
stringIndexers = [StringIndexer().setInputCol(name).setOutputCol(name + "_idx") for name in stringCols]

# The pipeline consists solely of the indexers.
stages = stringIndexers
pipeline = Pipeline(stages=stages)

# Fit on the cleaned data and apply the fitted pipeline to the same data,
# producing a frame that carries the original columns plus the *_idx columns.
indexed = pipeline.fit(clean_df).transform(clean_df)

In [7]:
# Keep every column except the raw string columns (their *_idx counterparts
# remain), then split randomly 85% / 15% into train / test with seed 0.
non_string_columns = [name for name in indexed.columns if name not in stringCols]
train, test = indexed.select(non_string_columns).randomSplit([0.85, 0.15], seed=0)

print("We have {} training examples and {} test examples.".format(train.count(), test.count()))

In [8]:
# Convert the Spark training split into a pandas DataFrame.
pdTrain=train.toPandas()

In [9]:
# Separate the training data into a feature frame and a target array.
#
# Both pieces are sliced from the pandas copy that already exists (pdTrain)
# instead of calling train.toPandas() two more times. This avoids two extra
# Spark jobs and guarantees that row i of x_train and element i of y_train
# come from the same record; Spark does not promise that two independent
# actions return rows in the same order.
label = ['churn']
feature_columns = [column for column in pdTrain.columns if column not in label]

x_train = pdTrain[feature_columns]   # pandas DataFrame of every non-label column
y_train = pdTrain['churn'].values    # numpy array holding the churn column

In [10]:
# Sanity check: print the shapes of the feature data and of the target array.
print(x_train.shape)
print(y_train.shape)

In [11]:
from azureml.core import Workspace

# Placeholder values - replace each one with your own Azure details
# before running this cell.
subscription_id = "USE YOUR SUB ID"
resource_group = "USE YOUR RG NAEM"
workspace_name = "USE YOUR WS"
workspace_region = "USE YOUR REGION"

# Create the Azure ML workspace described above.
# NOTE(review): per the Azure ML SDK documentation, exist_ok=True makes
# create() return the existing workspace instead of failing when one with
# this name already exists - confirm for the installed SDK version.
ws = Workspace.create(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
    location=workspace_region,
    exist_ok=True,
)

ws.get_details()

In [12]:
import azureml.core
from azureml.core.experiment import Experiment
import pandas as pd

experiment_name = 'customer-churn-automl-exp'  # choose a name for experiment
project_folder = './customer-churn-automl'  # project folder

# Experiment object in the workspace under the chosen name.
experiment = Experiment(ws, experiment_name)

# Summary of the environment this notebook is running against.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
    'Experiment Name': experiment.name,
}

# Show full cell contents instead of truncating long values.
# Passing -1 is deprecated since pandas 1.0 in favour of None and is rejected
# by newer pandas releases, while older releases accept only integers - so
# try None first and fall back to -1.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# One row per setting (the single-row frame is transposed for readability).
pd.DataFrame(data=output, index=['']).T

In [13]:
from azureml.train.automl import AutoMLConfig

# AutoML settings for a run on local compute, kept in one dictionary so they
# are easy to review and tune.
# NOTE(review): iteration_timeout_minutes=12000 is roughly 8.3 days per
# iteration - confirm that this is intended.
automl_settings = {
    "task": 'classification',
    "primary_metric": 'AUC_weighted',
    "iteration_timeout_minutes": 12000,
    "iterations": 10,
    "n_cross_validations": 3,
    "preprocess": False,
    "experiment_exit_score": 0.9000,
    "blacklist_models": ['LightGBM', 'kNN', 'LinearSVM'],
}

# Training data and project folder are passed alongside the settings above.
Automl_config = AutoMLConfig(
    X=x_train,
    y=y_train,
    path=project_folder,
    **automl_settings
)

In [14]:
# Build the Experiment from ws and experiment_name, then submit the AutoML
# configuration to it. show_output=True is passed to submit().
from azureml.core.experiment import Experiment
experiment=Experiment(ws, experiment_name)
local_run = experiment.submit(Automl_config, show_output=True)

## Install additional libraries on the cluster

```
azureml-widgets
azureml-explain-model
```

In [16]:
# Show the RunDetails widget for the submitted AutoML run.
from azureml.widgets import RunDetails
RunDetails(local_run).show()

In [17]:
# Unpack the two values returned by get_output().
# NOTE(review): presumably the best-scoring child run and its fitted model - confirm against the SDK docs.
best_run, fitted_model = local_run.get_output()

In [18]:
# Display the details of the selected run as the cell output.
best_run.get_details()