<h1>Copy raw data to your S3 bucket</h1>

In [2]:
import boto3
import sagemaker
import time

# Execution context: the region of the default boto3 session and the
# SageMaker execution role this notebook runs under.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
print(region, role, sep='\n')

# Workshop bucket. The first assignment is the template to edit (replace
# [your-initials] with the suffix chosen when the bucket was created); the
# second assignment overrides it and is the value actually in effect.
bucket_name = 'endtoendml-workshop-[your-initials]'
bucket_name = 'endtoendml-workshop-ad82'
prefix = '02'  # not referenced by any other cell shown in this notebook
print(bucket_name)

eu-west-1
arn:aws:iam::041631420165:role/service-role/AmazonSageMaker-ExecutionRole-20180507T143636
endtoendml-workshop-ad82


In [3]:
import boto3

# Copy the public raw dataset into the data/ prefix of the workshop bucket.
s3 = boto3.resource('s3')

file_name = 'windturbine_raw_data.csv'
file_key = 'data/' + file_name
copy_source = dict(Bucket='gianpo-public', Key=file_name)

s3.Bucket(bucket_name).copy(copy_source, file_key)

<h1>Create a Glue Crawler to infer schema for your data</h1>


In [4]:
# Create the Glue database that will hold the crawled table.
# create_database raises AlreadyExistsException when the database is already
# there, so that case is tolerated to keep the cell re-runnable.
glue_client = boto3.client('glue')
try:
    glue_client.create_database(DatabaseInput={'Name': 'endtoendml-db'})
except glue_client.exceptions.AlreadyExistsException:
    pass  # left over from a previous run; reuse it

# Confirm the database is visible in the catalog before going on.
response = glue_client.get_database(Name='endtoendml-db')
assert response['Database']['Name'] == 'endtoendml-db'


In [5]:
# Register a crawler over the data/ prefix of the workshop bucket. Its tables
# are written to the endtoendml-db database.
# The target is given as a full s3:// URI, and an already-existing crawler is
# reused so the cell can be executed more than once.
try:
    response = glue_client.create_crawler(
        Name='endtoendml-crawler',
        Role='GlueServiceRole-endtoendml',
        DatabaseName='endtoendml-db',
        Targets={'S3Targets': [{'Path': 's3://{0}/data/'.format(bucket_name)}]}
    )
except glue_client.exceptions.AlreadyExistsException:
    print('Crawler endtoendml-crawler already exists; reusing it.')

In [17]:
# Start the crawler; the cells below poll its metrics until a table shows up.
glue_client.start_crawler(
    Name='endtoendml-crawler'
)

{'ResponseMetadata': {'RequestId': 'd765a599-c98f-11e9-a417-353584e62773',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Wed, 28 Aug 2019 12:32:15 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'd765a599-c98f-11e9-a417-353584e62773'},
  'RetryAttempts': 0}}

In [18]:
# Display the current metrics for the crawler (runtimes and the number of
# tables created / updated / deleted).
glue_client.get_crawler_metrics(
    CrawlerNameList=['endtoendml-crawler']
)

{'CrawlerMetricsList': [{'CrawlerName': 'endtoendml-crawler',
   'TimeLeftSeconds': 0.0,
   'StillEstimating': True,
   'LastRuntimeSeconds': 48.223,
   'MedianRuntimeSeconds': 48.223,
   'TablesCreated': 0,
   'TablesUpdated': 0,
   'TablesDeleted': 0}],
 'ResponseMetadata': {'RequestId': 'df198acb-c98f-11e9-9559-d74bf715f71f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Wed, 28 Aug 2019 12:32:17 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '218',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'df198acb-c98f-11e9-9559-d74bf715f71f'},
  'RetryAttempts': 0}}

In [19]:
def tables_created(crawler_name='endtoendml-crawler'):
    """Return the TablesCreated metric Glue currently reports for the crawler."""
    metrics = glue_client.get_crawler_metrics(CrawlerNameList=[crawler_name])
    return metrics['CrawlerMetricsList'][0]['TablesCreated']


# Poll every 15 seconds until the crawler reports a created table. The wait is
# bounded: if the crawler fails, or creates nothing (for instance because the
# table already exists), the cell raises instead of looping forever.
deadline = time.monotonic() + 15 * 60
while tables_created() == 0:
    if time.monotonic() >= deadline:
        raise TimeoutError('Crawler endtoendml-crawler created no table within 15 minutes')
    print('RUNNING')
    time.sleep(15)

# Exactly one table is expected for the single CSV under data/.
assert tables_created() == 1


RUNNING
RUNNING
RUNNING
RUNNING
RUNNING
RUNNING
RUNNING
RUNNING
RUNNING
RUNNING


In [20]:
# Fetch the catalog entry named 'data' from endtoendml-db and display it.
table = glue_client.get_table(
    DatabaseName='endtoendml-db',
    Name='data',
)
table

{'Table': {'Name': 'data',
  'DatabaseName': 'endtoendml-db',
  'Owner': 'owner',
  'CreateTime': datetime.datetime(2019, 8, 28, 12, 33, 6, tzinfo=tzlocal()),
  'UpdateTime': datetime.datetime(2019, 8, 28, 12, 33, 6, tzinfo=tzlocal()),
  'LastAccessTime': datetime.datetime(2019, 8, 28, 12, 33, 6, tzinfo=tzlocal()),
  'Retention': 0,
  'StorageDescriptor': {'Columns': [{'Name': 'col0', 'Type': 'string'},
    {'Name': 'col1', 'Type': 'string'},
    {'Name': 'col2', 'Type': 'bigint'},
    {'Name': 'col3', 'Type': 'bigint'},
    {'Name': 'col4', 'Type': 'double'},
    {'Name': 'col5', 'Type': 'bigint'},
    {'Name': 'col6', 'Type': 'bigint'},
    {'Name': 'col7', 'Type': 'bigint'},
    {'Name': 'col8', 'Type': 'bigint'},
    {'Name': 'col9', 'Type': 'bigint'},
    {'Name': 'col10', 'Type': 'string'},
    {'Name': 'col11', 'Type': 'string'}],
   'Location': 's3://endtoendml-workshop-ad82/data/',
   'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
   'OutputFormat': 'org.apache.had

In [21]:
# Replace the crawler's generic col0..col11 schema with meaningful column
# names; every numeric column is declared as double.
table['Table']['StorageDescriptor']['Columns'] = [
    {'Name': column_name, 'Type': column_type}
    for column_name, column_type in (
        ('turbine_id', 'string'),
        ('turbine_type', 'string'),
        ('wind_speed', 'double'),
        ('RPM_blade', 'double'),
        ('oil_temperature', 'double'),
        ('oil_level', 'double'),
        ('temperature', 'double'),
        ('humidity', 'double'),
        ('vibrations_frequency', 'double'),
        ('pressure', 'double'),
        ('wind_direction', 'string'),
        ('breakdown', 'string'),
    )
]


In [22]:
# get_table returns read-only fields (DatabaseName, CreateTime, UpdateTime,
# CreatedBy, IsRegisteredWithLakeFormation, ...) that update_table rejects.
# Copy only the keys TableInput accepts instead of popping a fixed list of
# known read-only ones, so fields added to the get_table response later do not
# break the call. The fetched `table` dict is left unmodified.
table_input_keys = (
    'Name', 'Description', 'Owner', 'LastAccessTime', 'LastAnalyzedTime',
    'Retention', 'StorageDescriptor', 'PartitionKeys', 'ViewOriginalText',
    'ViewExpandedText', 'TableType', 'Parameters', 'TargetTable',
)
updated_table = {
    key: value
    for key, value in table['Table'].items()
    if key in table_input_keys
}

# Push the renamed/retyped columns back to the Glue Data Catalog.
glue_client.update_table(
    DatabaseName='endtoendml-db',
    TableInput=updated_table
)

{'ResponseMetadata': {'RequestId': '1a1cf6f7-c990-11e9-bf24-b1a67eeb407f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Wed, 28 Aug 2019 12:33:56 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': '1a1cf6f7-c990-11e9-bf24-b1a67eeb407f'},
  'RetryAttempts': 0}}

In [23]:
!pip install pyathena

[33mYou are using pip version 10.0.1, however version 19.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [24]:
import pyathena
from pyathena import connect
import pandas as pd

# Open an Athena connection; query results are staged under staging/ in the
# workshop bucket. The region comes from the boto3 session resolved in the
# first cell rather than being hard-coded, so the notebook also works outside
# eu-west-1.
conn = connect(s3_staging_dir='s3://{0}/staging/'.format(bucket_name),
               region_name=region)

# Preview the first 8 rows of the crawled table.
df = pd.read_sql('SELECT * FROM "endtoendml-db".data limit 8;', conn)
df


Unnamed: 0,turbine_id,turbine_type,wind_speed,rpm_blade,oil_temperature,oil_level,temperature,humidity,vibrations_frequency,pressure,wind_direction,breakdown
0,TID003,HAWT,80.0,61.0,,34.0,33.0,26.0,1.0,77.0,E,no
1,TID010,HAWT,85.0,78.0,36.0,28.0,35.0,43.0,15.0,62.0,NE,yes
2,TID007,HAWT,47.0,31.0,31.0,23.0,46.0,62.0,15.0,32.0,N,no
3,TID008,VAWT,73.0,70.0,38.0,8.0,17.0,66.0,6.0,80.0,SW,yes
4,TID003,HAWT,16.0,23.0,46.0,9.0,76.0,53.0,14.0,29.0,W,no
5,TID001,HAWT,78.0,71.0,30.0,11.0,66.0,79.0,1.0,81.0,SW,no
6,TID009,HAWT,80.0,25.0,37.0,31.0,40.0,75.0,4.0,56.0,NW,no
7,TID002,VAWT,59.0,29.0,37.0,10.0,25.0,83.0,13.0,55.0,SE,no


Another SQL query to count how many records we have

In [25]:
# Total number of rows in the table.
pd.read_sql(
    'SELECT COUNT(*) FROM "endtoendml-db".data;',
    conn,
)

Unnamed: 0,_col0
0,1000000


Let's look at the possible values of the "breakdown" field and how frequently each one occurs across the entire dataset

In [26]:
# Percentage of rows for each value of `breakdown`: the per-group count is
# divided by the total row count from the scalar subquery.
# The trailing backslash continues the string literal onto the next line, so
# the SQL text sent to Athena includes that line's leading spaces.
pd.read_sql('SELECT breakdown, (COUNT(breakdown) * 100.0 / (SELECT COUNT(*) FROM "endtoendml-db".data)) \
            AS percent FROM "endtoendml-db".data GROUP BY breakdown;', conn)


Unnamed: 0,breakdown,percent
0,yes,13.6579
1,no,86.3421


In [36]:
# List the distinct values stored in turbine_type.
pd.read_sql(
    'SELECT DISTINCT(turbine_type) FROM "endtoendml-db".data',
    conn,
)

Unnamed: 0,turbine_type
0,
1,VAWT
2,HAWT


In [33]:
# Number of rows with a missing oil_temperature.
# No GROUP BY: the WHERE clause already restricts to NULL values, and a plain
# aggregate returns a single row with 0 when nothing is missing (the grouped
# form returns no rows at all in that case).
pd.read_sql('SELECT COUNT(*) FROM "endtoendml-db".data WHERE oil_temperature IS NULL', conn)

Unnamed: 0,_col0
0,38297


Now we want to see if there is a correlation between temperature and humidity. To do so we run a SQL query to select only these two columns and populate a Pandas dataframe that we will use for our analysis

In [None]:
# Load only the temperature and humidity columns into a dataframe and show the
# first rows.
temp_hum_df = pd.read_sql(
    'SELECT temperature, humidity FROM "endtoendml-db".data',
    conn,
)
temp_hum_df.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Scatter plot of humidity against temperature. Axis labels and a title make
# the figure readable on its own; the trailing semicolon suppresses the
# text repr of the last call.
plt.scatter(temp_hum_df.temperature, temp_hum_df.humidity)
plt.xlabel('temperature')
plt.ylabel('humidity')
plt.title('Humidity vs. temperature');

In [None]:
# Distribution of humidity in 10 bins. Missing values are dropped first
# because NaNs can make the automatic bin-range computation fail.
plt.hist(temp_hum_df.humidity.dropna(), bins=10)
plt.xlabel('humidity')
plt.ylabel('number of records')
plt.title('Humidity distribution');

In [None]:
# Load wind speed and blade RPM, then plot one against the other. Axis labels
# and a title are set so the figure stands alone; the trailing semicolon
# suppresses the text repr of the last call.
wind_rpm_df = pd.read_sql('SELECT wind_speed, rpm_blade FROM "endtoendml-db".data', conn)
plt.scatter(wind_rpm_df.wind_speed, wind_rpm_df.rpm_blade)
plt.xlabel('wind_speed')
plt.ylabel('rpm_blade')
plt.title('Blade RPM vs. wind speed');

Note: you can go to the Amazon Athena console and check the query duration under the History tab: queries usually execute in a few seconds, but it then takes a while for pandas to load the results into a dataframe

In [None]:
# Summary statistics for the two columns loaded above (wind_speed, rpm_blade).
wind_rpm_df.describe()

Now we select our entire dataset and populate a dataframe.  

In [None]:
# Load every row of the table; this rebinds `df`, which until now held the
# 8-row preview. info() then lists the columns with their dtypes and non-null
# counts.
df = pd.read_sql(
    'SELECT * FROM "endtoendml-db".data;',
    conn,
)
df.info()

You can notice that oil_temperature has some missing values

In [37]:
# Summary statistics over the string, integer and float columns of `df`.
# NOTE(review): the saved output below reports count = 8, which matches the
# 8-row preview query rather than the full table. Re-run this cell after the
# full load so the statistics describe the whole dataset.
df.describe(include=['object', 'int64', 'float64'])

Unnamed: 0,turbine_id,turbine_type,wind_speed,rpm_blade,oil_temperature,oil_level,temperature,humidity,vibrations_frequency,pressure,wind_direction,breakdown
count,8,8,8.0,8.0,7.0,8.0,8.0,8.0,8.0,8.0,8,8
unique,7,2,,,,,,,,,7,2
top,TID003,HAWT,,,,,,,,,SW,no
freq,2,6,,,,,,,,,2,6
mean,,,64.75,48.5,36.428571,19.25,42.25,60.875,8.625,59.0,,
std,,,23.432273,23.555406,5.255383,10.898886,19.983922,19.430738,6.254998,20.396078,,
min,,,16.0,23.0,30.0,8.0,17.0,26.0,1.0,29.0,,
25%,,,56.0,28.0,33.5,9.75,31.0,50.5,3.25,49.25,,
50%,,,75.5,46.0,37.0,17.0,37.5,64.0,9.5,59.0,,
75%,,,80.0,70.25,37.5,28.75,51.0,76.0,14.25,77.75,,


In [38]:
# Download the MLeap 0.9.6 Python package (python.zip) and Spark assembly JAR
# into the notebook's working directory; later cells upload both to S3 and
# pass them to the Glue job as extra dependencies.
!wget https://s3-us-west-2.amazonaws.com/sparkml-mleap/0.9.6/python/python.zip
!wget https://s3-us-west-2.amazonaws.com/sparkml-mleap/0.9.6/jar/mleap_spark_assembly.jar

--2019-08-28 13:04:11--  https://s3-us-west-2.amazonaws.com/sparkml-mleap/0.9.6/python/python.zip
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.252.56
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.252.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36872 (36K) [application/zip]
Saving to: ‘python.zip’


2019-08-28 13:04:12 (268 KB/s) - ‘python.zip’ saved [36872/36872]

--2019-08-28 13:04:12--  https://s3-us-west-2.amazonaws.com/sparkml-mleap/0.9.6/jar/mleap_spark_assembly.jar
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.220.184
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.220.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17319576 (17M) [application/java-archive]
Saving to: ‘mleap_spark_assembly.jar’


2019-08-28 13:04:14 (7.85 MB/s) - ‘mleap_spark_assembly.jar’ saved [17319576/17319576]



In [39]:
# Upload the downloaded MLeap artifacts under dependencies/ in the workshop
# bucket: (local file, destination key) pairs.
for local_file, object_key in (
    ('python.zip', 'dependencies/python/python.zip'),
    ('mleap_spark_assembly.jar', 'dependencies/jar/mleap_spark_assembly.jar'),
):
    s3.Bucket(bucket_name).upload_file(local_file, object_key)

In [43]:
# Upload the ETL script; the Glue job reads it from this S3 location.
s3.Bucket(bucket_name).upload_file('endtoendml_etl.py', 'code/endtoendml_etl.py')

# Define the Glue ETL job with the MLeap JAR and Python package as extra
# dependencies. create_job raises AlreadyExistsException when the job is
# already defined; in that case the existing definition is reused (it is NOT
# updated) so the cell can be re-run without deleting the job first.
try:
    ETLJob = glue_client.create_job(Name='endtoendml-job',
                                    Role='GlueServiceRole-endtoendml',
                                    Command={
                                        'Name': 'glueetl',
                                        'ScriptLocation': 's3://{0}/code/endtoendml_etl.py'.format(bucket_name)
                                    },
                                    DefaultArguments={
                                        '--job-language': 'python',
                                        '--extra-jars': 's3://{0}/dependencies/jar/mleap_spark_assembly.jar'.format(bucket_name),
                                        '--extra-py-files': 's3://{0}/dependencies/python/python.zip'.format(bucket_name)
                                    })
except glue_client.exceptions.AlreadyExistsException:
    # Keep the same shape later cells rely on (ETLJob['Name']).
    ETLJob = {'Name': 'endtoendml-job'}

glue_job_name = ETLJob['Name']
print(glue_job_name)

endtoendml-job


In [44]:
# Launch one run of the ETL job, passing the workshop bucket name as the
# --S3_BUCKET job argument, and print the response (it carries the JobRunId).
JobRun = glue_client.start_job_run(
    JobName=glue_job_name,
    Arguments={'--S3_BUCKET': bucket_name},
)
print(JobRun)


{'JobRunId': 'jr_15fdaa869b9de859992b969795117f2461192cb428295a642ab4845065413db5', 'ResponseMetadata': {'RequestId': 'abfc9269-c994-11e9-8dc2-99edd12b0f5e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 28 Aug 2019 13:06:39 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '82', 'connection': 'keep-alive', 'x-amzn-requestid': 'abfc9269-c994-11e9-8dc2-99edd12b0f5e'}, 'RetryAttempts': 0}}


In [45]:
# Poll the job run every 30 seconds until it reaches a state it cannot leave.
# TIMEOUT, ERROR and EXPIRED are included alongside FAILED / SUCCEEDED /
# STOPPED; without them a timed-out or errored run would keep this loop
# spinning forever.
terminal_states = ('SUCCEEDED', 'FAILED', 'STOPPED', 'TIMEOUT', 'ERROR', 'EXPIRED')

status = glue_client.get_job_run(JobName=glue_job_name, RunId=JobRun['JobRunId'])
while status['JobRun']['JobRunState'] not in terminal_states:
    print('Job status: ' + status['JobRun']['JobRunState'])
    time.sleep(30)
    status = glue_client.get_job_run(JobName=glue_job_name, RunId=JobRun['JobRunId'])

print(status['JobRun']['JobRunState'])
if status['JobRun']['JobRunState'] != 'SUCCEEDED':
    # Surface the failure reason, when Glue provides one, next to the state.
    print(status['JobRun'].get('ErrorMessage', 'No error message reported'))

#This will take around 15 minutes

Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
SUCCEEDED
