# Transform your Data
The point of transforming your data is to make it as easy as possible for your algorithms to distinguish signal from noise.  Each data science project will require different transformations, and creative transformations are often the key to a successful AI project.  First, load in your Workspace, Datastore and Datasets.<br><br>


In [None]:
# Load Azure Libraries
from azureml.core import Datastore
from azureml.core.dataset import Dataset
from azureml.core.workspace import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication
from azure.storage.blob import BlockBlobService
import pandas as pd
import numpy as np
import json
import os
import math
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Look up your existing workspace by name. Replace each lower-case placeholder
# between double quotes with your own value.
ws = Workspace.get(
    name="<my-workspace>",
    subscription_id="<my-azure-subscription-id>",
    resource_group="<my-resource-group>",
)

In [None]:
# Look up your Datastore in the workspace by name. Replace the lower-case
# placeholder between double quotes with your own value.
datastore_name = "<my-datastore-name>"
datastore = Datastore.get(workspace=ws, datastore_name=datastore_name)

In [None]:
# Names of your registered Refined Datasets. Replace the lower-case
# placeholders between double quotes with your own values.
dataset_name_test = "<my-refined-dataset-name>"
dataset_name_train = "<my-refined-dataset-name>"

# Fetch the latest registered version of each dataset from the workspace
testing_data = Dataset.get_by_name(
    workspace=ws, name=dataset_name_test, version='latest')
training_data = Dataset.get_by_name(
    workspace=ws, name=dataset_name_train, version='latest')

In [None]:
# Materialize both tabular datasets as pandas data frames (test first, then train)
testDF, trainDF = (
    tabular.to_pandas_dataframe() for tabular in (testing_data, training_data)
)

# It's time to start working with your data!
This next step should be customized to fit each project based on your understanding of the problem and the data.<br>
You should understand your data, transform your data, and save it as a Dataset so everyone on your workspace can access it.<br>

Common transformations include, but are not limited to:<br>
- Rebalancing your Dependent Variable for Classification Problems (The column you are trying to predict)
- Transforming your Dependent Variable to follow a Normal Distribution for Regression Problems
- Removing outliers
- Smoothing Noisy Numeric Independent Variables (IVs)
- Binning Numeric IVs
- Binning Categorical IVs
- Create New Columns based on your refined data
- Impute Null Values across the Dataset
- Create New Columns to indicate where Null Values existed before imputation for each column

In [None]:
# Since each transformation is problem specific, you will have to research your data to determine what is appropriate.
# For an example, look at the filled-in version of this notebook.

In [None]:
# Write your own transformation code here. The final lines should assign your
# transformed test and train data frames to the two names below, which the
# upload cells later in this notebook convert to CSV.
# Fill in both right-hand sides; this cell is a syntax error until you do.
testTransformedDF = 
trainTransformedDF = 

### Write your Transformed Data back to your Data Lake
After transforming your data, save it to a place on a data lake where it is accessible to all of the data workers who require access.<br>
Make sure that you indicate in the folder path and file name that it is transformed data for a specific data science project.

In [None]:
# Show the essential workspace and datastore details in one table so they are
# on hand when filling in the upload cells below.
dsoutput = {
    'Workspace Name': ws.name,
    'Datastore Name': datastore.name,
    'Container Name': datastore.container_name,
    'Resource Group': ws.resource_group,
    'Storage Account': datastore.account_name,
}

# Do not truncate long cell values when the table is displayed.
# pandas 1.0+ spells "no limit" as None and newer releases reject the old -1
# sentinel; pre-1.0 releases only accept an integer, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# One unnamed row, transposed so each detail is displayed on its own line
dsoutputDf = pd.DataFrame(data=dsoutput, index=[''])
dsoutputDf.T

In [None]:
# Test Data

# Create the BlockBlobService client that will write the data.
# Fill in the storage account name and account key for your datastore.
block_blob_service = BlockBlobService(
    account_name='<my-storage-account>',
    account_key='<my-storage-account-key>',
)

# Container in the storage account where the Transformed Data will be saved.
# If it differs from the container used by your primary datastore, register it
# as another datastore.
container_name = '<my-container>'

# Serialize the transformed test data frame to CSV text, without the index
testTransformedCSV = testTransformedDF.to_csv(index=False)

# Path and file name of the blob inside the container
data_lake_file_path = "<my-path-on-datastore-to-transformed-test-data>"

# Upload the CSV text into the storage account
block_blob_service.create_blob_from_text(
    container_name=container_name,
    blob_name=data_lake_file_path,
    text=testTransformedCSV,
)

In [None]:
# Train Data

# Create the BlockBlobService client that will write the data.
# Fill in the storage account name and account key for your datastore.
block_blob_service = BlockBlobService(
    account_name='<my-storage-account>',
    account_key='<my-storage-account-key>',
)

# Container in the storage account where the Transformed Data will be saved.
# If it differs from the container used by your primary datastore, register it
# as another datastore.
container_name = '<my-container>'

# Serialize the transformed train data frame to CSV text, without the index
trainTransformedCSV = trainTransformedDF.to_csv(index=False)

# Path and file name of the blob inside the container
data_lake_file_path = "<my-path-on-datastore-to-transformed-train-data>"

# Upload the CSV text into the storage account
block_blob_service.create_blob_from_text(
    container_name=container_name,
    blob_name=data_lake_file_path,
    text=trainTransformedCSV,
)

### Register your Transformed Datasets
This allows you to share your Transformed Datasets with others in your workspace, to version and keep track of them.

In [None]:
# If you stored your Transformed Datasets in a separate container, first create another Datastore for that container.

In [2]:
# Point at the files and/or directories in your datastore. Each list may hold
# one or several (datastore, path) pairs.
# Testing data and training data are referenced separately here.

datastore_path_transformed_test = [(datastore, "<my-path-on-datastore-to-transformed-test-data>")]

datastore_path_transformed_train = [(datastore, "<my-path-on-datastore-to-transformed-train-data>")]

NameError: name 'datastore' is not defined

In [None]:
# Build one Tabular Dataset per split from the delimited files referenced above
test_data_transformed = Dataset.Tabular.from_delimited_files(
    path=datastore_path_transformed_test,
)
train_data_transformed = Dataset.Tabular.from_delimited_files(
    path=datastore_path_transformed_train,
)

In [None]:
# Register the transformed test dataset in the workspace with a description and
# tags; create_new_version=True is passed so registration requests a new version.
test_data_transformed.register(
    workspace=ws,
    name="<my-dataset-name>",
    description="<my-dataset-description>",
    tags={
        "<my-tag-name>": "<my-tag-value>",
        "<my-tag-name2>": "<my-tag-value2>",
    },
    create_new_version=True,
)

In [None]:
# Register the transformed train dataset in the workspace with a description and
# tags; create_new_version=True is passed so registration requests a new version.
train_data_transformed.register(
    workspace=ws,
    name="<my-dataset-name>",
    description="<my-dataset-description>",
    tags={
        "<my-tag-name>": "<my-tag-value>",
        "<my-tag-name2>": "<my-tag-value2>",
    },
    create_new_version=True,
)

<br><br><br><br><br>



Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.