## Here we will set up all the steps for the AKS cluster

In [1]:
from environs import Env

ENV = Env()
ENV.read_env()


True

In [127]:
subscription_id = ENV("AZURE_SUBSCRIPTION_ID")
resource_group = ENV("AZURE_RESOURCE_GROUP") #  i.e.'kuberg'
cluster_name = "kubeflow-labs-test"
location = 'eastus'
agent_size = "Standard_D1_v2" # i.e. 'Standard_NC6', Standard_D1_v2
aks_name = ENV("AKS_NAME") # i.e. 'kubeaks'
agent_count = 2 # agent count is the number of VMs that will be provisioned in the cluster, you can pick any number.
storage_account = "fenocerakubestorage"#ENV("STORAGE_ACCOUNT_NAME") # i.e. 'kubest'
storage_container = ENV("AKS_CONTAINER") # i.e. 'blobfuse'

In [126]:
# Create resource group 

!az group create --name {resource_group} --location {location}

{
  "id": "/subscriptions/fb45cb39-23ee-447d-a047-4c8ba0a5d527/resourceGroups/fenocera_rg_2",
  "location": "eastus",
  "managedBy": null,
  "name": "fenocera_rg_2",
  "properties": {
    "provisioningState": "Succeeded"
  },
  "tags": null,
  "type": null
}


In [10]:
# Check kubernetes versions 

!az aks get-versions --location eastus --output table

KubernetesVersion    Upgrades
-------------------  ------------------------
1.13.5               None available
1.12.8               1.13.5
1.12.7               1.12.8, 1.13.5
1.11.9               1.12.7, 1.12.8
1.11.8               1.11.9, 1.12.7, 1.12.8
1.10.13              1.11.8, 1.11.9
1.10.12              1.10.13, 1.11.8, 1.11.9
1.9.11               1.10.12, 1.10.13
1.9.10               1.9.11, 1.10.12, 1.10.13


In [128]:
# Create AKS cluster

!az aks create --node-vm-size {agent_size} --resource-group {resource_group} --name {cluster_name} --location {location} --kubernetes-version 1.12.7 --node-count {agent_count} --generate-ssh-keys

[K{- Finished ..
  "aadProfile": null,
  "addonProfiles": null,
  "agentPoolProfiles": [
    {
      "availabilityZones": null,
      "count": 3,
      "enableAutoScaling": null,
      "maxCount": null,
      "maxPods": 110,
      "minCount": null,
      "name": "nodepool1",
      "orchestratorVersion": "1.12.7",
      "osDiskSizeGb": 100,
      "osType": "Linux",
      "provisioningState": "Succeeded",
      "type": "AvailabilitySet",
      "vmSize": "Standard_D1_v2",
      "vnetSubnetId": null
    }
  ],
  "apiServerAuthorizedIpRanges": null,
  "dnsPrefix": "kubeflow-l-fenocerarg2-fb45cb",
  "enablePodSecurityPolicy": null,
  "enableRbac": true,
  "fqdn": "kubeflow-l-fenocerarg2-fb45cb-2e64e56f.hcp.eastus.azmk8s.io",
  "id": "/subscriptions/fb45cb39-23ee-447d-a047-4c8ba0a5d527/resourcegroups/fenocera_rg_2/providers/Microsoft.ContainerService/managedClusters/kubeflow-labs-test",
  "kubernetesVersion": "1.12.7",
  "linuxProfile": {
    "adminUsername": "azureuser",
    "ssh": {
      

In [16]:
# Get the kubeconfig file 

!az aks get-credentials --name {cluster_name} --resource-group {resource_group}

A different object named kubeflow-labs-test already exists in your kubeconfig file.
Overwrite? (y/n): 

In [19]:
# NVIDIA PLUGIN

!kubectl apply -f https://raw.githubusercontent.com/nvidia/k8s-device-plugin/v1.11/nvidia-device-plugin.yml

daemonset.extensions "nvidia-device-plugin-daemonset" created


In [20]:
!kubectl get nodes

NAME                       STATUS    ROLES     AGE       VERSION
aks-nodepool1-11457415-0   Ready     agent     21m       v1.12.7
aks-nodepool1-11457415-1   Ready     agent     21m       v1.12.7


In [21]:
!kubectl describe node aks-nodepool1-11457415-1

Name:               aks-nodepool1-11457415-1
Roles:              agent
Labels:             agentpool=nodepool1
                    beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/instance-type=Standard_D1_v2
                    beta.kubernetes.io/os=linux
                    failure-domain.beta.kubernetes.io/region=eastus
                    failure-domain.beta.kubernetes.io/zone=1
                    kubernetes.azure.com/cluster=MC_fenocera_rg_2_kubeflow-labs-test_eastus
                    kubernetes.io/hostname=aks-nodepool1-11457415-1
                    kubernetes.io/role=agent
                    node-role.kubernetes.io/agent=
                    storageprofile=managed
                    storagetier=Standard_LRS
Annotations:        node.alpha.kubernetes.io/ttl=0
                    volumes.kubernetes.io/controller-managed-attach-detach=true
CreationTimestamp:  Wed, 29 May 2019 15:06:34 -0400
Taints:             <none>
Unschedulable:      fa

In [12]:
!kubectl create -f 2-mnist-training.yaml

job.batch "2-mnist-training" created


In [22]:
!kubectl get job

NAME               DESIRED   SUCCESSFUL   AGE
2-mnist-training   1         0            1m


In [25]:
!kubectl get po

NAME                     READY     STATUS              RESTARTS   AGE
2-mnist-training-dqvrk   0/1       ContainerCreating   0          1m


In [26]:
!kubectl logs 2-mnist-training-dqvrk

2019-05-23 17:55:18.035020: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-05-23 17:55:18.148748: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 601b:00:00.0
totalMemory: 11.17GiB freeMemory: 11.11GiB
2019-05-23 17:55:18.148792: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla K80, pci bus id: 601b:00:00.0, compute capability: 3.7)
2019-05-23 17:55:22.747308: I tensorflow/stream_executor/dso_loader.cc:139] successfully opened CUDA library libcupti.so.8.0 locally
Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/tensorflow/input_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /

In [28]:
!kubectl get job

NAME               DESIRED   SUCCESSFUL   AGE
2-mnist-training   1         1            16m


In [29]:
!kubectl get pod --all-namespaces | grep tiller

In [102]:
# If there is no tiller running have to install/set up help and tiller in the aks cluster 

!kubectl apply -f helm-rbac.yaml

serviceaccount "tiller" unchanged
clusterrolebinding.rbac.authorization.k8s.io "tiller" configured


In [103]:
!helm init --service-account tiller

$HELM_HOME has been configured at /Users/federica/.helm.
(Use --client-only to suppress this message, or --upgrade to upgrade Tiller to the current version.)
Happy Helming!


In [104]:
!kubectl get pod --all-namespaces | grep tiller

kube-system   tiller-deploy-7b65c7bff9-fndck              1/1       Running   0          18h


In [36]:
!helm install stable/wordpress

NAME:   named-cardinal
LAST DEPLOYED: Thu May 23 14:32:53 2019
NAMESPACE: default
STATUS: DEPLOYED

RESOURCES:
==> v1/ConfigMap
NAME                          DATA  AGE
named-cardinal-mariadb        1     0s
named-cardinal-mariadb-tests  1     0s

==> v1/Deployment
NAME                      READY  UP-TO-DATE  AVAILABLE  AGE
named-cardinal-wordpress  0/1    1           0          0s

==> v1/PersistentVolumeClaim
NAME                      STATUS   VOLUME   CAPACITY  ACCESS MODES  STORAGECLASS  AGE
named-cardinal-wordpress  Pending  default  0s

==> v1/Pod(related)
NAME                                       READY  STATUS   RESTARTS  AGE
named-cardinal-mariadb-0                   0/1    Pending  0         0s
named-cardinal-wordpress-5456db8f66-8nnsh  0/1    Pending  0         0s

==> v1/Secret
NAME                      TYPE    DATA  AGE
named-cardinal-mariadb    Opaque  2     0s
named-cardinal-wordpress  Opaque  1     0s

==> v1/Service
NAME                      TYPE          CLUSTER-IP   E

In [37]:
# To create your own helm chart

!helm create my_helm_chart

Creating my_helm_chart


In [None]:
# To deploy your helm chart to the cluster

!helm install . --name my-custom-chart

In [38]:
!helm install stable/dokuwiki --set dokuwikiWikiName="Hello FED"


NAME:   flippant-rat
LAST DEPLOYED: Thu May 23 14:59:05 2019
NAMESPACE: default
STATUS: DEPLOYED

RESOURCES:
==> v1/PersistentVolumeClaim
NAME                            STATUS   VOLUME   CAPACITY  ACCESS MODES  STORAGECLASS  AGE
flippant-rat-dokuwiki-apache    Pending  default  1s
flippant-rat-dokuwiki-dokuwiki  Pending  default  1s

==> v1/Pod(related)
NAME                                   READY  STATUS   RESTARTS  AGE
flippant-rat-dokuwiki-8fbfddd6f-bdmxw  0/1    Pending  0         1s

==> v1/Secret
NAME                   TYPE    DATA  AGE
flippant-rat-dokuwiki  Opaque  1     1s

==> v1/Service
NAME                   TYPE          CLUSTER-IP  EXTERNAL-IP  PORT(S)                     AGE
flippant-rat-dokuwiki  LoadBalancer  10.0.7.53   <pending>    80:31415/TCP,443:31901/TCP  1s

==> v1beta1/Deployment
NAME                   READY  UP-TO-DATE  AVAILABLE  AGE
flippant-rat-dokuwiki  0/1    1           0          1s


NOTES:

** Please be patient while the chart is being deployed **

1

In [40]:
!kubectl get svc --namespace default -w flippant-rat-dokuwiki

NAME                    TYPE           CLUSTER-IP   EXTERNAL-IP    PORT(S)                      AGE
flippant-rat-dokuwiki   LoadBalancer   10.0.7.53    13.82.31.171   80:31415/TCP,443:31901/TCP   49m
^C


In [51]:
KUBEFLOW_SRC = "Users/federica/Documents/Github/kubeflow-labs/kubeflow"
!mkdir {KUBEFLOW_SRC}
!cd {KUBEFLOW_SRC}
!export KUBEFLOW_TAG=v0.4.1
!curl https://raw.githubusercontent.com/kubeflow/kubeflow/{KUBEFLOW_TAG}/scripts/download.sh | bash

mkdir: /Users/federica/Documents/GitHub/kubeflow: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    15  100    15    0     0     97      0 --:--:-- --:--:-- --:--:--    98
bash: line 1: 404:: command not found


In [107]:
KFAPP="mykubeflowapp"
!{KUBEFLOW_SRC}/scripts/kfctl.sh init {KFAPP} --platform none
!cd {KUBEFLOW_SRC}
!{KUBEFLOW_SRC}/scripts/kfctl.sh generate k8s
!{KUBEFLOW_SRC}/scripts/kfctl.sh apply k8s

/bin/sh: Users/federica/Documents/Github/kubeflow-labs/kubeflow/scripts/kfctl.sh: No such file or directory
/bin/sh: Users/federica/Documents/Github/kubeflow-labs/kubeflow/scripts/kfctl.sh: No such file or directory
/bin/sh: Users/federica/Documents/Github/kubeflow-labs/kubeflow/scripts/kfctl.sh: No such file or directory


In [None]:
KFAPP=mykubeflowapp
!{KUBEFLOW_SRC}/scripts/kfctl.sh init {KFAPP} --platform none

# Generate kubeflow app
!cd {KFAPP}
!{KUBEFLOW_SRC}/scripts/kfctl.sh generate k8s

# Deploy Kubeflow app
!{KUBEFLOW_SRC}/scripts/kfctl.sh apply k8s


In [64]:
!kubectl get pods -n kubeflow

NAME                                                        READY     STATUS              RESTARTS   AGE
ambassador-5cdff47f4d-cdhxv                                 0/1       Running             0          1m
ambassador-5cdff47f4d-cxxv8                                 1/1       Running             0          1m
ambassador-5cdff47f4d-lw6wt                                 1/1       Running             0          1m
argo-ui-789c8577d5-rg4xh                                    1/1       Running             0          1m
centraldashboard-6f9948dc6d-ll697                           0/1       ContainerCreating   0          1m
jupyter-0                                                   1/1       Running             0          1m
jupyter-web-app-5c64c4f4cb-bp72s                            1/1       Running             0          1m
katib-ui-78f445bf8f-z779c                                   1/1       Running             0          1m
metacontroller-0                                      

In [None]:
# delete deployment 

!cd {KUBEFLOW_SRC}/{KFAPP}
!{KUBEFLOW_SRC}/scripts/kfctl.sh delete k8s # need to be in the app directory (as above suggests)

In [None]:
# We are now going to run Jupyter on Kubernetes

# We run this again to get the deployment up from the app container
!../scripts/kfctl.sh apply k8s


In [67]:
!kubectl get svc -n kubeflow

NAME                                     TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)             AGE
ambassador                               ClusterIP   10.0.49.249    <none>        80/TCP              1m
ambassador-admin                         ClusterIP   10.0.132.181   <none>        8877/TCP            1m
argo-ui                                  NodePort    10.0.25.117    <none>        80:30013/TCP        1m
centraldashboard                         ClusterIP   10.0.155.216   <none>        80/TCP              1m
jupyter-0                                ClusterIP   None           <none>        8000/TCP            1m
jupyter-lb                               ClusterIP   10.0.10.240    <none>        80/TCP              1m
jupyter-web-app                          ClusterIP   10.0.60.253    <none>        80/TCP              1m
katib-ui                                 ClusterIP   10.0.228.10    <none>        80/TCP              1m
minio-service                            Clus

In [217]:
# have to go into the ksonnet app part before setting the param

!cd ks_app
!ks param set jupyter serviceType LoadBalancer
!cd ..
!kubeflow/scripts/kfctl.sh apply k8s

/bin/sh: line 0: cd: ks_app: No such file or directory
[31mERROR[0m finding app root from starting path: : unable to find ksonnet project 
/bin/sh: kubeflow/scripts/kfctl.sh: No such file or directory


In [115]:
# Port forward the Kubeflow dashboard so can see whats going on 

!kubectl port-forward svc/ambassador -n kubeflow 8080:80

Forwarding from 127.0.0.1:8080 -> 80
Forwarding from [::1]:8080 -> 80
Handling connection for 8080
Handling connection for 8080
Handling connection for 8080
Handling connection for 8080
Handling connection for 8080
^C


In [83]:
!kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.alpha\.kubernetes\.io\/nvidia-gpu"

NAME                       GPU
aks-nodepool1-11457415-0   <none>
aks-nodepool1-11457415-1   <none>
aks-nodepool1-11457415-2   <none>


In [114]:
!kubectl -n kubeflow describe pods jupyter

Name:               jupyter-0
Namespace:          kubeflow
Priority:           0
PriorityClassName:  <none>
Node:               aks-nodepool1-11457415-0/10.240.0.5
Start Time:         Fri, 24 May 2019 08:44:57 -0400
Labels:             app=jupyter
                    app.kubernetes.io/name=mykubeflowapp
                    controller-revision-hash=jupyter-54549d585b
                    statefulset.kubernetes.io/pod-name=jupyter-0
Annotations:        <none>
Status:             Running
IP:                 10.244.1.51
Controlled By:      StatefulSet/jupyter
Containers:
  jupyter:
    Container ID:  docker://78f25ad8923ecc2ed02db7cd3c667a87aa610f8463f5b6c3159abdab0dc79e15
    Image:         gcr.io/kubeflow/jupyterhub-k8s:v20180531-3bb991b1
    Image ID:      docker-pullable://gcr.io/kubeflow/jupyterhub-k8s@sha256:5e2c71d050bec85c258a31aa4507ca8adb3b2f5158a4dc919a39118b8879a5ce
    Ports:         8000/TCP, 8081/TCP
    Host Ports:    0/TCP, 0/TCP
    Command:
      jupyterhub
      -f
     

In [212]:
!kubectl get svc -n kubeflow

No resources found.


In [251]:
# Making a TFJOB - had to create it with kubeflow.org/v1beta2 rather than v1beta1

!kubectl create -f tfjob.yaml


tfjob.kubeflow.org "tfjob" created


In [252]:
!kubectl get tfjob #If has a kubernetes namespace then have to use it 

NAME      AGE
tfjob     3s


In [253]:
!kubectl get pods

NAME                       READY     STATUS    RESTARTS   AGE
named-cardinal-mariadb-0   1/1       Running   0          8m
tfjob-master-0             1/1       Running   0          5s


In [255]:
!kubectl logs tfjob-master-0

2019-05-24 21:57:31.690859: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-05-24 21:57:31.805821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 2bde:00:00.0
totalMemory: 11.17GiB freeMemory: 11.11GiB
2019-05-24 21:57:31.805861: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla K80, pci bus id: 2bde:00:00.0, compute capability: 3.7)
2019-05-24 21:57:36.557517: I tensorflow/stream_executor/dso_loader.cc:139] successfully opened CUDA library libcupti.so.8.0 locally


In [211]:
!kubectl get pvc

NAME                             STATUS    VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS   AGE
azurefile                        Bound     pvc-0ba98925-7e65-11e9-9597-8ea533332f11   5Gi        RWX            azurefile      43m
data-named-cardinal-mariadb-0    Bound     pvc-2e75d2ed-7d89-11e9-9597-8ea533332f11   8Gi        RWO            default        1d
flippant-rat-dokuwiki-apache     Bound     pvc-d78faf92-7d8c-11e9-9597-8ea533332f11   1Gi        RWO            default        1d
flippant-rat-dokuwiki-dokuwiki   Bound     pvc-d7915238-7d8c-11e9-9597-8ea533332f11   8Gi        RWO            default        1d
named-cardinal-wordpress         Bound     pvc-2e508c27-7d89-11e9-9597-8ea533332f11   10Gi       RWO            default        1d


In [None]:
!kubectl delete -f 

In [218]:
# Need to create and mount storage

!kubectl apply -f azure-file-sc.yaml
!kubectl apply -f azure-pvc-roles.yaml
!kubectl apply -f azure-file-pvc.yaml

storageclass.storage.k8s.io "azurefile" configured
clusterrole.rbac.authorization.k8s.io "system:azure-cloud-provider" configured
clusterrolebinding.rbac.authorization.k8s.io "system:azure-cloud-provider" configured
persistentvolumeclaim "azurefile" unchanged


In [245]:
!kubectl create -f tfjob.yaml

error: unable to recognize "tfjob.yaml": no matches for kind "TFJob" in version "kubeflow.org/v1beta2"


In [246]:
# Deleting the deployments we dont want! 

In [256]:
!kubectl get pods

NAME                       READY     STATUS      RESTARTS   AGE
named-cardinal-mariadb-0   1/1       Running     1          1h
tfjob-master-0             0/1       Completed   0          1h


In [250]:
!kubectl get deployments --all-namespaces

NAMESPACE     NAME                   DESIRED   CURRENT   UP-TO-DATE   AVAILABLE   AGE
kube-system   coredns                2         2         2            2           1d
kube-system   coredns-autoscaler     1         1         1            1           1d
kube-system   heapster               1         1         1            1           1d
kube-system   kubernetes-dashboard   1         1         1            1           1d
kube-system   metrics-server         1         1         1            1           1d
kube-system   tiller-deploy          1         1         1            1           1d
kube-system   tunnelfront            1         1         1            1           1d


In [402]:
!kubectl delete pods,services,deployments named-cardinal-mariadb-0

pod "named-cardinal-mariadb-0" deleted
Error from server (NotFound): services "named-cardinal-mariadb-0" not found
Error from server (NotFound): deployments.extensions "named-cardinal-mariadb-0" not found


In [233]:
!kubectl delete pod named-cardinal-wordpress-5456db8f66-8nnsh

!kubectl delete pods --all

Error from server (NotFound): pods "named-cardinal-wordpress-5456db8f66-8nnsh" not found
pod "flippant-rat-dokuwiki-8fbfddd6f-zwtrm" deleted
pod "named-cardinal-mariadb-0" deleted
pod "named-cardinal-wordpress-5456db8f66-qdt46" deleted


In [260]:
!kubectl describe tfjob-master-0

error: the server doesn't have a resource type "tfjob-master-0"


In [258]:
!kubectl logs tfjob-master-0

2019-05-24 21:57:31.690859: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-05-24 21:57:31.805821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 2bde:00:00.0
totalMemory: 11.17GiB freeMemory: 11.11GiB
2019-05-24 21:57:31.805861: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla K80, pci bus id: 2bde:00:00.0, compute capability: 3.7)
2019-05-24 21:57:36.557517: I tensorflow/stream_executor/dso_loader.cc:139] successfully opened CUDA library libcupti.so.8.0 locally
Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/tensorflow/input_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /tmp/tensor

In [208]:
!kubectl -n kubeflow exec -it tfjob-master-0 -- bash

Error from server (BadRequest): pod tfjob-master-0 does not have a host assigned


In [261]:
!kubectl get pods

NAME                       READY     STATUS      RESTARTS   AGE
named-cardinal-mariadb-0   1/1       Running     1          3h
tfjob-master-0             0/1       Completed   0          2h


In [270]:
cd ../7-distributed-tensorflow/solution-src



In [339]:
!kubectl create -f tfjob-dist.yaml

tfjob.kubeflow.org "tfjobdist" created


In [342]:
!kubectl apply -f tensorboard-tfjob-dist.yaml

deployment.extensions "tensorboard" created


In [356]:
!kubectl get pods

NAME                           READY     STATUS      RESTARTS   AGE
named-cardinal-mariadb-0       1/1       Running     0          23m
tensorboard-5f96cf6b45-rbw8d   1/1       Running     0          24m
tfjob-master-0                 0/1       Completed   0          3h
tfjobdist-master-0             1/1       Running     0          24m
tfjobdist-ps-0                 0/1       Completed   0          24m
tfjobdist-worker-0             1/1       Running     0          24m


In [357]:
!kubectl logs tfjobdist-master-0 

2019-05-25 01:21:36.762429: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-05-25 01:21:36.881302: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1405] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 2bde:00:00.0
totalMemory: 11.17GiB freeMemory: 11.11GiB
2019-05-25 01:21:36.881344: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1484] Adding visible gpu devices: 0
2019-05-25 01:21:37.158675: I tensorflow/core/common_runtime/gpu/gpu_device.cc:965] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-05-25 01:21:37.158731: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971]      0 
2019-05-25 01:21:37.158740: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] 0:   N 
2019-05-25 01:21:37.159270: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1097] Created TensorFlow device (/job:master

In [349]:
# Run this in terminal

!PODNAME=$(kubectl get pod -l app=tensorboard -o jsonpath='{.items[0].metadata.name}')
!kubectl port-forward ${PODNAME} 6006:6006

error: TYPE/NAME and list of ports are required for port-forward
See 'kubectl port-forward -h' for help and examples.


In [358]:
!kubectl delete -f tensorboard-tfjob-dist.yaml

deployment.extensions "tensorboard" deleted


In [363]:
cd ../../8-hyperparam-sweep/solution-chart/

/Users/federica/Documents/GitHub/kubeflow-labs/8-hyperparam-sweep/solution-chart


In [370]:
# This did not work :( 

!kubectl create -f values.yaml

error: error validating "values.yaml": error validating data: [apiVersion not set, kind not set]; if you choose to ignore these errors, turn validation off with --validate=false


In [376]:
!helm install .

NAME:   lanky-bobcat
LAST DEPLOYED: Fri May 24 21:55:21 2019
NAMESPACE: default
STATUS: DEPLOYED

RESOURCES:
==> v1/Pod(related)
NAME                                 READY  STATUS             RESTARTS  AGE
module8-tensorboard-57b9c9565-jz256  0/1    ContainerCreating  0         0s

==> v1/Service
NAME                 TYPE          CLUSTER-IP  EXTERNAL-IP  PORT(S)       AGE
module8-tensorboard  LoadBalancer  10.0.38.97  <pending>    80:32302/TCP  1s

==> v1beta1/Deployment
NAME                 READY  UP-TO-DATE  AVAILABLE  AGE
module8-tensorboard  0/1    1           0          1s

==> v1beta2/TFJob
NAME                  AGE
module8-tf-paint-0-0  0s
module8-tf-paint-0-1  0s
module8-tf-paint-0-2  1s
module8-tf-paint-1-0  0s
module8-tf-paint-1-1  1s
module8-tf-paint-1-2  0s
module8-tf-paint-2-0  0s
module8-tf-paint-2-1  0s
module8-tf-paint-2-2  0s




In [433]:
!kubectl get pods

NAME                                  READY     STATUS      RESTARTS   AGE
module8-tensorboard-57b9c9565-jz256   1/1       Running     0          9m
module8-tf-paint-0-0-master-0         0/1       Pending     0          9m
module8-tf-paint-0-1-master-0         0/1       Pending     0          9m
module8-tf-paint-0-2-master-0         1/1       Running     0          9m
module8-tf-paint-1-0-master-0         0/1       Pending     0          9m
module8-tf-paint-1-1-master-0         0/1       Pending     0          9m
module8-tf-paint-1-2-master-0         0/1       Pending     0          9m
module8-tf-paint-2-0-master-0         1/1       Running     0          9m
module8-tf-paint-2-1-master-0         0/1       Pending     0          9m
module8-tf-paint-2-2-master-0         0/1       Pending     0          9m
named-cardinal-mariadb-0              1/1       Running     0          5m
tfjob-master-0                        0/1       Completed   0          4h
tfjobdist-ps-0          

In [434]:
!kubectl get service

NAME                            TYPE           CLUSTER-IP   EXTERNAL-IP     PORT(S)        AGE
kubernetes                      ClusterIP      10.0.0.1     <none>          443/TCP        4h
module8-tensorboard             LoadBalancer   10.0.38.97   13.68.227.187   80:32302/TCP   9m
module8-tf-paint-0-0-master-0   ClusterIP      None         <none>          2222/TCP       9m
module8-tf-paint-0-1-master-0   ClusterIP      None         <none>          2222/TCP       9m
module8-tf-paint-0-2-master-0   ClusterIP      None         <none>          2222/TCP       9m
module8-tf-paint-1-0-master-0   ClusterIP      None         <none>          2222/TCP       9m
module8-tf-paint-1-1-master-0   ClusterIP      None         <none>          2222/TCP       9m
module8-tf-paint-1-2-master-0   ClusterIP      None         <none>          2222/TCP       9m
module8-tf-paint-2-0-master-0   ClusterIP      None         <none>          2222/TCP       9m
module8-tf-paint-2-1-master-0   ClusterIP      No

In [438]:
!kubectl delete pods --all

pod "module8-tensorboard-57b9c9565-cndlj" deleted
pod "named-cardinal-mariadb-0" deleted


In [442]:
!kubectl get pods

Unable to connect to the server: dial tcp: lookup kubeflow-l-fenocerarg2-fb45cb-1cf097e8.hcp.eastus.azmk8s.io on [2001:4898::1050:5050]:53: no such host


In [440]:
# Spin down Kubernetes cluster 

!az aks delete --yes --name {aks_name} --resource-group {resource_group}

[K - Finished ..[0m

## Hyperparameter Sweep

In [None]:
# Create Resource Group
!az group create --name {resource_group} --location {location}
# Create AKS cluster
!az aks create --node-vm-size {agent_size} --resource-group {resource_group} --name {cluster_name} --location {location} --kubernetes-version 1.12.7 --node-count {agent_count} --generate-ssh-keys

In [None]:
# Get the kubeconfig file 
# Run this in terminal !az aks get-credentials --name {cluster_name} --resource-group {resource_group}

# This is setuop for GPU 
# !kubectl apply -f https://raw.githubusercontent.com/nvidia/k8s-device-plugin/v1.11/nvidia-device-plugin.yml

In [129]:
!kubectl get nodes

NAME                       STATUS    ROLES     AGE       VERSION
aks-nodepool1-11457415-0   Ready     agent     6m        v1.12.7
aks-nodepool1-11457415-1   Ready     agent     6m        v1.12.7
aks-nodepool1-11457415-2   Ready     agent     6m        v1.12.7


In [134]:
cd ..

/Users/federica/Documents/GitHub/kubeflow-labs/kubeflow


In [135]:
# Setup Helm/Tiller
!kubectl apply -f helm-rbac.yaml

serviceaccount "tiller" created
clusterrolebinding.rbac.authorization.k8s.io "tiller" created


In [136]:
!helm init --service-account tiller
!kubectl get pod --all-namespaces | grep tiller

$HELM_HOME has been configured at /Users/federica/.helm.

Tiller (the Helm server-side component) has been installed into your Kubernetes Cluster.

Please note: by default, Tiller is deployed with an insecure 'allow unauthenticated users' policy.
To prevent this, run `helm init` with the --tiller-tls-verify flag.
For more information on securing your installation see: https://docs.helm.sh/using_helm/#securing-your-helm-installation
Happy Helming!
kube-system   tiller-deploy-7b65c7bff9-nwtsd          0/1       ContainerCreating   0          0s


In [None]:
# SETTING KUBEFLOW UP: 
# If don't have Kubeflow set up run this: 

# KUBEFLOW_SRC = "Users/federica/Documents/Github/kubeflow-labs/kubeflow"
# !mkdir {KUBEFLOW_SRC}
# !cd {KUBEFLOW_SRC}
# !export KUBEFLOW_TAG=v0.4.1
# !curl https://raw.githubusercontent.com/kubeflow/kubeflow/{KUBEFLOW_TAG}/scripts/download.sh | bash

# have to go into the ksonnet/kubeflow app part before setting the param
# cd ks_app
# !ks param set jupyter serviceType LoadBalancer
# cd ..
# !../scripts/kfctl.sh apply k8s

#USING KUBEFLOW: 
# To start a kubeflow deployment run this from inside the kubeflow app folder
# ../scripts/kfctl.sh apply k8s


# To delete kubeflow deployment run this from inside the kubeflow app folder 
# ../scripts/kfctl.sh delete all --delete_storage


# To mount storage run: 

# !kubectl apply -f azure-file-sc.yaml
# !kubectl apply -f azure-pvc-roles.yaml
# !kubectl apply -f azure-file-pvc.yaml

In [27]:
cd ../kubeflow/

/Users/federica/Documents/GitHub/kubeflow-labs/kubeflow


In [31]:
cd mykubeflowapp/

/Users/federica/Documents/GitHub/kubeflow-labs/kubeflow/mykubeflowapp


In [32]:
# Creating the kubeflow instance 
!../scripts/kfctl.sh apply k8s

+ ENV_FILE=env.sh
+ SKIP_INIT_PROJECT=false
+ GKE_API_VERSION=v1beta1
+ GCP_DEFAULT_ZONE=us-east1-d
+++ dirname ../scripts/kfctl.sh
++ cd ../scripts
++ pwd
+ DIR=/Users/federica/Documents/GitHub/kubeflow-labs/kubeflow/scripts
+ source /Users/federica/Documents/GitHub/kubeflow-labs/kubeflow/scripts/util.sh
+ source /Users/federica/Documents/GitHub/kubeflow-labs/kubeflow/scripts/gke/util.sh
++ set -xe
+ source /Users/federica/Documents/GitHub/kubeflow-labs/kubeflow/scripts/azure/util.sh
++ set -xe
+ source /Users/federica/Documents/GitHub/kubeflow-labs/kubeflow/scripts/aws/util.sh
++ set -xe
+ source /Users/federica/Documents/GitHub/kubeflow-labs/kubeflow/scripts/util-minikube.sh
++ RED='\033[0;31m'
++ GREEN='\033[0;32m'
++ YELLOW='\033[0;33m'
++ NC='\033[0m'
++ MOUNT_LOCAL=false
+ INPUT=()
+ FORMAT=()
+ export 'KUBEFLOW_COMPONENTS="ambassador","jupyter","notebook-controller","jupyter-web-app","profiles","centraldashboard","tf-job-operator","pytorch-operator","spartakus","argo","pipeline

clusterrolebinding.rbac.authorization.k8s.io "spartakus" created
clusterrolebinding.rbac.authorization.k8s.io "tf-job-dashboard" created
clusterrolebinding.rbac.authorization.k8s.io "tf-job-operator" created
compositecontroller.metacontroller.k8s.io "mykubeflowapp-controller" created
secret "mlpipeline-minio-artifact" created
secret "vizier-db-secrets" created
configmap "jupyter-config" created
configmap "jupyter-web-app-config" created
configmap "metricscollector-template" created
configmap "mykubeflowapp-controller-hooks" created
configmap "pytorch-operator-config" created
configmap "tf-job-operator-config" created
configmap "worker-template" created
configmap "workflow-controller-configmap" created
persistentvolumeclaim "katib-mysql" created
persistentvolumeclaim "minio-pvc" created
persistentvolumeclaim "mysql-pv-claim" created
serviceaccount "ambassador" created
serviceaccount "argo" created
serviceaccount "argo-ui" created
serviceaccount "centraldashboard" created
serviceaccount 

In [149]:
!kubectl get pods -n kubeflow

NAME                                                        READY     STATUS    RESTARTS   AGE
ambassador-5cdff47f4d-mvw4b                                 1/1       Running   0          4m
ambassador-5cdff47f4d-ssj6x                                 1/1       Running   0          4m
ambassador-5cdff47f4d-vswk2                                 1/1       Running   0          4m
argo-ui-789c8577d5-hwxsn                                    1/1       Running   0          4m
centraldashboard-6f9948dc6d-txrr8                           1/1       Running   0          4m
jupyter-0                                                   1/1       Running   0          4m
jupyter-web-app-5c64c4f4cb-xthpm                            1/1       Running   0          4m
katib-ui-78f445bf8f-r2vzn                                   1/1       Running   0          4m
metacontroller-0                                            1/1       Running   0          4m
minio-648694bc46-hvqt2                           

In [46]:
# have to go into the ksonnet/kubeflow app part before setting the param
# cd ks_app
# !ks param set jupyter serviceType LoadBalancer
# cd ..
# !kubeflow/scripts/kfctl.sh apply k8s


azure-file-pvc.yaml   helm-rbac.yaml        [34mscripts[m[m/
azure-file-sc.yaml    [34mhyperparam-chart[m[m/     tfjob.yaml
azure-pvc-roles.yaml  [34mkubeflow[m[m/
[34mdeployment[m[m/           [34mmykubeflowapp[m[m/


In [152]:
# We now have to deploy our helm charts for the parameter sweep to our cluster. 

!helm install hyperparam-chart/


NAME:   impressive-hydra
LAST DEPLOYED: Thu May 30 11:22:33 2019
NAMESPACE: default
STATUS: DEPLOYED

RESOURCES:
==> v1/Pod(related)
NAME                                 READY  STATUS             RESTARTS  AGE
module8-tensorboard-57b9c9565-vnz2h  0/1    ContainerCreating  0         0s

==> v1/Service
NAME                 TYPE          CLUSTER-IP   EXTERNAL-IP  PORT(S)       AGE
module8-tensorboard  LoadBalancer  10.0.115.45  <pending>    80:31190/TCP  1s

==> v1beta1/Deployment
NAME                 READY  UP-TO-DATE  AVAILABLE  AGE
module8-tensorboard  0/1    1           0          1s

==> v1beta2/TFJob
NAME                  AGE
module8-tf-paint-0-0  0s
module8-tf-paint-0-1  0s
module8-tf-paint-0-2  0s
module8-tf-paint-1-0  0s
module8-tf-paint-1-1  1s
module8-tf-paint-1-2  0s
module8-tf-paint-2-0  1s
module8-tf-paint-2-1  0s
module8-tf-paint-2-2  0s




In [99]:
!az aks show --resource-group fenocera_rg_2 --name kubeflow-labs-test --query kubernetesVersion

"1.12.7"


In [172]:
!kubectl get deployments

NAME                  DESIRED   CURRENT   UP-TO-DATE   AVAILABLE   AGE
module8-tensorboard   1         1         1            1           3m


In [173]:
# This did not work!! 

# !kubectl autoscale deployment module8-tensorboard --cpu-percent=50 --min=1 --max=10


In [182]:
!kubectl get pods

NAME                                  READY     STATUS      RESTARTS   AGE
module8-tensorboard-57b9c9565-vnz2h   1/1       Running     0          56m
module8-tf-paint-0-0-master-0         0/1       Completed   0          56m
module8-tf-paint-0-1-master-0         0/1       Completed   0          56m
module8-tf-paint-0-2-master-0         0/1       Completed   0          56m
module8-tf-paint-1-0-master-0         0/1       Completed   0          56m
module8-tf-paint-1-1-master-0         0/1       Completed   0          56m
module8-tf-paint-1-2-master-0         0/1       Completed   0          56m
module8-tf-paint-2-0-master-0         0/1       Completed   0          56m
module8-tf-paint-2-1-master-0         0/1       Completed   0          56m
module8-tf-paint-2-2-master-0         0/1       Completed   0          56m


In [204]:
!kubectl get service

NAME                            TYPE           CLUSTER-IP    EXTERNAL-IP     PORT(S)        AGE
kubernetes                      ClusterIP      10.0.0.1      <none>          443/TCP        2h
module8-tensorboard             LoadBalancer   10.0.115.45   104.45.133.25   80:31190/TCP   1h
module8-tf-paint-0-0-master-0   ClusterIP      None          <none>          2222/TCP       1h
module8-tf-paint-0-1-master-0   ClusterIP      None          <none>          2222/TCP       1h
module8-tf-paint-0-2-master-0   ClusterIP      None          <none>          2222/TCP       1h
module8-tf-paint-1-0-master-0   ClusterIP      None          <none>          2222/TCP       1h
module8-tf-paint-1-1-master-0   ClusterIP      None          <none>          2222/TCP       1h
module8-tf-paint-1-2-master-0   ClusterIP      None          <none>          2222/TCP       1h
module8-tf-paint-2-0-master-0   ClusterIP      None          <none>          2222/TCP       1h
module8-tf-paint-2-1-master-0   Cluster

In [202]:
!kubectl get deployments

NAME                  DESIRED   CURRENT   UP-TO-DATE   AVAILABLE   AGE
module8-tensorboard   1         1         1            1           1h


In [178]:
!kubectl describe module8-tf-paint-1-0-master-0

error: the server doesn't have a resource type "module8-tf-paint-1-0-master-0"


In [179]:
!kubectl exec -it module8-tf-paint-0-0-master-0 -- bash

]0;root@module8-tf-paint-0-0-master-0: /approot@module8-tf-paint-0-0-master-0:/app# 

In [205]:
!helm list

NAME            	REVISION	UPDATED                 	STATUS  	CHART                         	APP VERSION	NAMESPACE
impressive-hydra	1       	Thu May 30 11:22:33 2019	DEPLOYED	module8-hyperparam-sweep-0.1.0	           	default  


In [206]:
# delete helm deployment 

!helm delete --purge impressive-hydra

release "impressive-hydra" deleted


## Serving the model

In [207]:
# ACCESS_SECRET_KEY=hDrossgmaLFd7C6bRYqrQ6w8XTn+o2pQxpkhfKoyieD3mKKjkmesSpW7zpFxPkIze9R711pbOO1A0wjw1LQFJQ==
# ACCESS_KEY=f2c8637c082ee11e9a394d6

In [209]:
# Ran this in terminal 
# helm install --name minio --set accessKey=$ACCESS_KEY,secretKey=$ACCESS_SECRET_KEY,service.type=LoadBalancer stable/minio

/bin/sh: stable/minio: No such file or directory


In [None]:
# Then run this in terminal 

# SERVICE_IP=$(kubectl get svc minio --template="{{range .status.loadBalancer.ingress}}{{.ip}}{{end}}")
# S3_ENDPOINT=http://${SERVICE_IP}:9000   ##THIS WAS WRONG IN THE TUTORIAL 

# Followed by 

# mc config host add minio $S3_ENDPOINT $ACCESS_KEY $ACCESS_SECRET_KEY

# Finally: 

# BUCKET_NAME=kubeflow

# mc mb minio/$BUCKET_NAME

# mc cp --recursive ../model_tf/$BUCKET_NAME # THIS IS WRONG IT NEEDS A SPACE
# ie: 
# mc cp --recursive model_tf/ $BUCKET_NAME



# export NAMESPACE=serving

# kubectl create namespace $NAMESPACE


# kubectl create secret generic serving-creds --from-literal=accessKeyID=${ACCESS_KEY} \
#  --from-literal=secretAccessKey=${ACCESS_SECRET_KEY} -n $NAMESPACE

In [None]:
S3_USE_HTTPS=0
S3_VERIFY_SSL=0
JOB_NAME=myjob
MODEL_COMPONENT=serveInception
MODEL_NAME=inception
MODEL_PATH=s3://${BUCKET_NAME}/models/${JOB_NAME}/export/${MODEL_NAME}/
MODEL_SERVER_IMAGE=sozercan/tensorflow-model-server

In [None]:
MODEL_SERVER_IMAGE=tensorflow/serving

In [212]:
!az aks delete --yes --name {aks_name} --resource-group {resource_group}

[K - Finished ..[0m