In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from azureml.core import Dataset, Workspace


from matplotlib.pyplot import figure

In [None]:
# Load the raw weather observations from the local dataset folder.
df = pd.read_csv(filepath_or_buffer='dataset/weather_dataset_raw.csv')

## data quality assessment

In [None]:
# Show the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Display pandas' default summary statistics for the loaded frame.
df.describe()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtypes as inferred by read_csv.
df.dtypes

### check for and fill missing values

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

In [None]:
# Count of missing cells in each column.
df.isna().sum()

In [None]:
# Forward-fill gaps in the weather label: each missing entry takes the most
# recent preceding value (a missing value in the very first row stays missing).
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the attribute-accessed Series. Under pandas
# Copy-on-Write the chained in-place form only updates a temporary object and
# leaves `df` unchanged. `Series.ffill()` also replaces the deprecated
# `fillna(method='ffill')` spelling.
df['Weather_conditions'] = df['Weather_conditions'].ffill()

In [None]:
# Re-check whether any missing cell remains in the frame.
df.isna().to_numpy().any()

In [None]:
# Frequency of each distinct weather label.
df['Weather_conditions'].value_counts()

In [None]:
# Collapse 'snow' and 'clear' into a single 'no_rain' label.
#
# Assign the replaced Series back to the column: calling
# `replace(..., inplace=True)` on `df['Weather_conditions']` is a chained
# in-place update, which under pandas Copy-on-Write modifies only a temporary
# and silently leaves `df` untouched.
df['Weather_conditions'] = df['Weather_conditions'].replace(
    {'snow': 'no_rain', 'clear': 'no_rain'}
)

In [None]:
# Label frequencies again, to inspect the result of the relabelling step.
df['Weather_conditions'].value_counts()

### convert timestamp to datetime format

In [None]:
# Parse the Timestamp column into pandas datetime values.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

### convert text data to numeric data using label encoding

In [None]:
# Pull out the text label column that will be label-encoded.
y = df.loc[:, 'Weather_conditions']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the text labels, then map each label to its integer code.
# `le` is kept so the fitted encoder remains available after this cell.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Wrap the encoded labels in a one-column frame, append it to the data
# column-wise, and drop the original text label column.
y = pd.DataFrame(y, columns=['Current_weather_condition'])
df = pd.concat([df, y], axis=1).drop(columns='Weather_conditions')

In [None]:
# Build the second label column by shifting the encoded condition 4 rows down;
# the first 4 rows of the new column become NaN.
# NOTE(review): a positive shift puts the value from 4 rows *earlier* on each
# row, so this column holds a past condition, not a future one. If the rows are
# chronological and a look-ahead target is intended, this should presumably be
# shift(-4) — confirm before changing, since downstream data was built this way.
df['Future_weather_condition'] = df['Current_weather_condition'].shift(periods=4)

In [None]:
# First five rows, now including the two encoded condition columns.
df.head(n=5)

In [None]:
# Discard every row that still contains a missing value (this includes the
# rows left without a shifted label).
df = df.dropna()

## data correlation and filtering

In [None]:
# Pearson correlation between the numeric columns.
# The frame still contains non-numeric columns (e.g. the location text), and
# since pandas 2.0 `DataFrame.corr` no longer drops them silently but raises.
# Selecting the numeric columns explicitly works on old and new pandas.
df.select_dtypes(include='number').corr(method='pearson')

In [None]:
# Visualize the correlation matrix as an annotated heatmap.
# Only numeric columns are correlated: on pandas >= 2.0, `DataFrame.corr`
# raises on the text column instead of ignoring it.
corrMatrix = df.select_dtypes(include='number').corr()
figure(num=None, figsize=(12, 10), dpi=80, facecolor='w', edgecolor='w')
sns.heatmap(corrMatrix, annot=True);  # trailing ';' hides the Axes repr

In [None]:
# Remove the serial-number and apparent-temperature columns from the frame.
df = df.drop(columns=['S_No', 'Apparent_Temperature_C'])

In [None]:
# Heatmap of the correlation matrix after the column filtering step.
# Restricting to numeric columns keeps `corr` from raising on the remaining
# text/datetime columns under pandas >= 2.0.
corrMatrix = df.select_dtypes(include='number').corr()
figure(num=None, figsize=(12, 10), dpi=80, facecolor='w', edgecolor='w')
sns.heatmap(corrMatrix, annot=True);  # trailing ';' hides the Axes repr

In [None]:
# Bar chart of each numeric feature's Pearson correlation with
# 'Future_weather_condition', sorted ascending; the target's own
# self-correlation entry is removed before plotting.
# Correlation is computed on numeric columns only, because pandas >= 2.0
# raises when `corr` meets a non-numeric column.
figure(num=None, figsize=(12, 10), dpi=80, facecolor='w', edgecolor='w')
target_corr = df.select_dtypes(include='number').corr(method='pearson')['Future_weather_condition']
target_corr.sort_values(ascending=True).drop(['Future_weather_condition']).plot(kind='bar', width=0.9);

### time series analysis

In [None]:
# Series used for the time-series plot: x = timestamps, y = temperature.
time, temp = df['Timestamp'], df['Temperature_C']

In [None]:
# Line plot of temperature against timestamp on a fresh 12x10 figure.
plt.figure(figsize=(12, 10), dpi=80, facecolor='w', edgecolor='w')
plt.plot(time, temp)

In [None]:
# Persist the processed frame so the upload/registration cells below have a
# 'weather_dataset_processed.csv' to work with on a fresh Restart & Run All
# (this write was previously commented out, leaving those cells dependent on
# a file produced by an earlier session).
# The index is written on purpose (pandas default) to keep the file layout
# unchanged for consumers of the already-registered dataset.
df.to_csv('dataset/weather_dataset_processed.csv')

### Data registration and versioning

In [None]:
import json

# Read the Azure ML workspace coordinates from the local config file.
with open('config.json') as config_file:
    config = json.load(config_file)

# Unpack the three identifiers needed to connect to the workspace.
subscription_id, resource_group, workspace_name = (
    config[key] for key in ('subscription_id', 'resource_group', 'workspace_name')
)

In [None]:
# Connect to the Azure ML workspace identified by the values read from config.json.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [None]:
# Get the workspace's default datastore; the prepared data is uploaded to it.
datastore = workspace.get_default_datastore()

# Upload the local data folder to 'data/' in the datastore.
# The folder name is spelled 'dataset' (lowercase) to match the path the CSV
# files are read from and written to elsewhere in this notebook; the previous
# 'Dataset' spelling fails on case-sensitive filesystems.
# NOTE(review): upload() is called with its defaults; if files already present
# in the datastore must be refreshed, check whether overwrite=True is needed.
datastore.upload(src_dir='dataset', target_path='data')

# Build a tabular dataset from the uploaded processed CSV.
dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/weather_dataset_processed.csv'))

In [None]:
# Register the tabular dataset in the workspace under a fixed name.
# The description previously read 'weatehr' (typo); it is corrected here so the
# notebook agrees with the manual fix already applied in the web UI.
weather_ds = dataset.register(
    workspace=workspace,
    name='processed_data_portofTurku',
    description='processed weather data')

In [39]:
# Materialize the first three records of the dataset as a pandas frame.
preview = dataset.take(3)
preview.to_pandas_dataframe()

Unnamed: 0,Column1,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,4,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1.0
1,5,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1.0
2,6,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1.0
