In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default theme to the figures created in this notebook.
sns.set_theme()

In [None]:
# Load the processed weather readings; pandas infers the zip compression
# from the file extension.
df = pd.read_csv(
    filepath_or_buffer="../../../csv/v2/010_weather_data_processed.csv.zip",
)
df

# Process date and datetime

In [None]:
# Convert the collectedAt column to timezone-aware (UTC) datetimes so the
# .dt accessor can be used in the following cells.
collected_at_utc = pd.to_datetime(df["collectedAt"], utc=True)
df["collectedAt"] = collected_at_utc
df.head()

In [None]:
# Derive calendar/time-of-day features from the reading timestamp.
collected = df["collectedAt"].dt

df["month"] = collected.month_name()
df["day"] = collected.day
df["day_of_week"] = collected.day_of_week
df["day_of_week_name"] = collected.day_name()
df["hour"] = collected.hour
df["minute"] = collected.minute

df.head()

## Get time span

In [None]:
# Earliest and latest reading timestamps, printed one per line.
first_reading = df["collectedAt"].min()
last_reading = df["collectedAt"].max()
print(first_reading, last_reading, sep="\n")

In [None]:
# Inspect the column dtypes, including the date/time features added above.
df.dtypes

# Analyze "weather"

In [None]:
# Frequency of each raw weather condition code.
df["weather"].value_counts()

## Group categories
- Leave 800 as is (clear)
- Map 801-804 (cloudy)
- Map 7xx: atmosphere (conditions that may decrease visibility)
- Map 6xx: snowy conditions
- Map 5xx: rainy conditions
- Map 3xx: drizzle (can be included as wet condition in rain)
- Map 2xx: thunderstorm (can also be included as wet conditions in rain)

In [None]:
# Codes counted as "rainy": thunderstorm (2xx), drizzle (3xx) and rain (5xx).
# Kept as a module-level array because later cells reference it.
rainy_condition_codes = np.concatenate(
    [np.arange(200, 300, 1), np.arange(300, 400, 1), np.arange(500, 600, 1)]
)

# Build the 0/1 indicator columns with vectorized comparisons instead of a
# row-wise `df.apply(..., axis=1)`: same values, much faster, and it also
# works when the frame is empty. `between` is inclusive on both ends.
weather_code = df["weather"]

df["cloudy"] = weather_code.between(801, 804).astype("int64")
df["clear"] = weather_code.eq(800).astype("int64")
df["atmosphere"] = weather_code.between(700, 799).astype("int64")
df["snowy"] = weather_code.between(600, 699).astype("int64")
df["rainy"] = weather_code.isin(rainy_condition_codes).astype("int64")

df.head()

### How many readings are cloudy?

In [None]:
# Count of readings flagged (1) / not flagged (0) as cloudy.
df["cloudy"].value_counts()

### How many readings are clear?

In [None]:
# Count of readings flagged (1) / not flagged (0) as clear.
df["clear"].value_counts()

### How many readings are atmosphere (fog, haze, mist, etc.; anything that lowers visibility)?

In [None]:
# Count of readings flagged (1) / not flagged (0) as atmosphere.
df["atmosphere"].value_counts()

### How many readings are snowy?

In [None]:
# Count of readings flagged (1) / not flagged (0) as snowy.
df["snowy"].value_counts()

### How many readings are rainy?

In [None]:
# Count of readings flagged (1) / not flagged (0) as rainy.
df["rainy"].value_counts()

## Plot weather histogram

In [None]:
def convert_weather_code_to_weather_name(x):
    """Translate a numeric weather condition code into a coarse category name.

    Mapping: 801-804 -> "Clouds", 800 -> "Clear", 700-799 -> "Fog/Mist",
    600-699 -> "Snowy", and the integer codes 200-399 and 500-599 -> "Rainy".

    The function is self-contained: it does not rely on any variable defined
    in another notebook cell, so it works regardless of cell execution order.

    Parameters
    ----------
    x : int or float
        Weather condition code of a single reading.

    Returns
    -------
    str or None
        The category name, or ``None`` when the code belongs to none of the
        groups above (for example a 4xx code or NaN).
    """
    if 801 <= x <= 804:
        return "Clouds"
    if x == 800:
        return "Clear"
    if 700 <= x <= 799:
        return "Fog/Mist"
    if 600 <= x <= 699:
        return "Snowy"
    # Membership in integer ranges (not an interval comparison) so that only
    # whole-number codes match, exactly like a lookup in an array of codes.
    if x in range(200, 400) or x in range(500, 600):
        return "Rainy"
    # Unmapped code: make the fall-through explicit.
    return None
        

# Work on an independent (deep) copy so the category label column is not
# added to `df` itself.
tmp_df = df.copy(deep=True)

weather_names = tmp_df["weather"].apply(convert_weather_code_to_weather_name)
tmp_df["Weather Condition"] = weather_names
tmp_df

In [None]:
# Order the categories from most to least frequent so the histogram bars
# appear in descending order.
condition_counts = tmp_df.groupby("Weather Condition")["weather"].count()
weather_condition_order = condition_counts.sort_values(ascending=False).index.tolist()
tmp_df["weather_condition_ordered"] = pd.Categorical(
    tmp_df["Weather Condition"], categories=weather_condition_order
)

ax = sns.histplot(data=tmp_df, x="weather_condition_ordered")
ax.set_ylabel("Num. Observations")
ax.set_xlabel("Weather Condition")

plt.savefig("./plots/000_weather_category_histogram.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_weather_category_histogram.png", bbox_inches="tight")
plt.savefig("./plots/000_weather_category_histogram.pdf", bbox_inches="tight")

### Drop "weather" attribute because it has been dealt with already

In [None]:
# Drop the raw code column now that it has been encoded as indicator columns.
# `errors="ignore"` keeps the cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
df = df.drop(columns="weather", errors="ignore")
df.head()

# Analyze Visibility

## Plot Visibility Histogram

In [None]:
# Distribution of visibility readings, with counts on a logarithmic y axis.
ax = sns.histplot(data=df, x="visibility", kde=True)
ax.set_yscale("log")
ax.set_ylabel("Num. Observations (log)")
ax.set_xlabel("Visibility (m)")

plt.savefig("./plots/000_visibility_histogram.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_visibility_histogram.png", bbox_inches="tight")
plt.savefig("./plots/000_visibility_histogram.pdf", bbox_inches="tight")

## Plot Visibility Line Plot

In [None]:
# Visibility over the whole observation period.
ax = sns.lineplot(data=df, x="collectedAt", y="visibility")
ax.set(ylabel="Visibility (m)", xlabel="Date")
# Rotate the date labels through tick_params rather than
# set_xticklabels(get_xticklabels(), ...): the latter freezes the label text
# with a FixedFormatter on a non-fixed locator, which matplotlib warns about
# and which can leave the labels blank if they are not yet populated.
ax.tick_params(axis="x", labelrotation=30)

plt.savefig("./plots/000_visibility.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_visibility.png", bbox_inches="tight")
plt.savefig("./plots/000_visibility.pdf", bbox_inches="tight")

# Visibility Line Plot for January

In [None]:
from matplotlib.dates import DateFormatter
from pandas.plotting import register_matplotlib_converters

# Register pandas' datetime converters with matplotlib for the date axis.
register_matplotlib_converters()

# Filter the January readings once and reuse the result for the plot.
january_df = df[df["month"] == "January"]

ax = sns.lineplot(data=january_df, x="collectedAt", y="visibility")
ax.set(ylabel="Visibility (m)")
ax.set(xlabel="Day")

# Label the ticks with the day of the month only. `date_form` is kept as a
# notebook-level name because the December plots further down reuse it.
date_form = DateFormatter("%d")
ax.xaxis.set_major_formatter(date_form)

plt.savefig("./plots/000_visibility_january.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_visibility_january.png", bbox_inches="tight")
plt.savefig("./plots/000_visibility_january.pdf", bbox_inches="tight")

# Analyze Temperature

## Plot Current Temperature

In [None]:
# Measured temperature over the whole observation period.
ax = sns.lineplot(data=df, x="collectedAt", y="temperature")
ax.set(ylabel="Temperature (Celsius)", xlabel="Date")
# Rotate via tick_params; set_xticklabels(get_xticklabels(), ...) would pin
# the label text with a FixedFormatter and trigger a matplotlib warning.
ax.tick_params(axis="x", labelrotation=30)

plt.savefig("./plots/000_current_temperature.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_current_temperature.png", bbox_inches="tight")
plt.savefig("./plots/000_current_temperature.pdf", bbox_inches="tight")

## Plot Feels Like

In [None]:
# "Feels like" temperature over the whole observation period.
ax = sns.lineplot(data=df, x="collectedAt", y="FeelsLike")
ax.set(ylabel="Feels Like (Celsius)", xlabel="Date")
# Rotate via tick_params; set_xticklabels(get_xticklabels(), ...) would pin
# the label text with a FixedFormatter and trigger a matplotlib warning.
ax.tick_params(axis="x", labelrotation=30)

plt.savefig("./plots/000_feels_like.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_feels_like.png", bbox_inches="tight")

## Plot current temperature and feels like

In [None]:
# Duplicate both temperature columns under display-friendly names, then
# reshape to long form (one row per timestamp and series) for a hue plot.
df["Temperature"] = df["temperature"]
df["Feels Like"] = df["FeelsLike"]

temperature_wide = df[["collectedAt", "Temperature", "Feels Like"]]
temp_df = pd.melt(
    temperature_wide,
    id_vars="collectedAt",
    var_name="cols",
    value_name="vals",
)
temp_df.head()

In [None]:
# Overlay measured and "feels like" temperature; alpha keeps both visible.
ax = sns.lineplot(data=temp_df, x="collectedAt", y="vals", hue="cols", alpha=0.7)
ax.set(ylabel="Temperature (Celsius)", xlabel="Date")
# Rotate via tick_params; set_xticklabels(get_xticklabels(), ...) would pin
# the label text with a FixedFormatter and trigger a matplotlib warning.
ax.tick_params(axis="x", labelrotation=30)
# Blank the legend title (it would otherwise show the melt column name).
plt.legend(title="")

plt.savefig("./plots/000_temp_and_feels_like.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_temp_and_feels_like.png", bbox_inches="tight")
plt.savefig("./plots/000_temp_and_feels_like.pdf", bbox_inches="tight")

# Analyze humidity

## Plot Humidity Histogram

In [None]:
# Distribution of relative humidity using 4-unit-wide bins.
ax = sns.histplot(data=df, x="humidity", kde=True, binwidth=4)
ax.set_ylabel("Num. Observations")
ax.set_xlabel("Humidity (%)")

plt.savefig("./plots/000_humidity_histogram.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_humidity_histogram.png", bbox_inches="tight")
plt.savefig("./plots/000_humidity_histogram.pdf", bbox_inches="tight")

## Plot Humidity Line Plot

In [None]:
# Humidity during December only.
december_df = df[df["month"] == "December"]

ax = sns.lineplot(data=december_df, x="collectedAt", y="humidity")
ax.set(ylabel="Humidity (%)", xlabel="Day")

# Show the day of the month on the x axis. `date_form` is the "%d"
# DateFormatter created in the January visibility cell. The former
# set_xticklabels(get_xticklabels()) call was dropped: this formatter
# replaced its effect immediately and it only produced a matplotlib warning.
ax.xaxis.set_major_formatter(date_form)

plt.savefig("./plots/000_humidity_december.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_humidity_december.png", bbox_inches="tight")
plt.savefig("./plots/000_humidity_december.pdf", bbox_inches="tight")

# Analyze Wind

## Wind Speed Histogram

In [None]:
# Distribution of wind speed readings with a KDE overlay.
ax = sns.histplot(data=df, x="windSpeed", kde=True)
ax.set_ylabel("Num. Observations")
ax.set_xlabel("Wind Speed (km/h)")

plt.savefig("./plots/000_wind_speed_histogram.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_wind_speed_histogram.png", bbox_inches="tight")
plt.savefig("./plots/000_wind_speed_histogram.pdf", bbox_inches="tight")

## Wind Speed Line Plot

In [None]:
# December readings taken on the hour or half hour (minute 0 or 30).
tmp_wind_df = df[(df["month"] == "December") & (df["minute"].isin([0, 30]))]

ax = sns.lineplot(data=tmp_wind_df, x="collectedAt", y="windSpeed")
ax.set(ylabel="Wind Speed (km/h)", xlabel="Day")

# Show the day of the month on the x axis. `date_form` is the "%d"
# DateFormatter created in the January visibility cell. The former
# set_xticklabels(get_xticklabels()) call was dropped: this formatter
# replaced its effect immediately and it only produced a matplotlib warning.
ax.xaxis.set_major_formatter(date_form)

plt.savefig("./plots/000_wind_speed_december.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_wind_speed_december.png", bbox_inches="tight")
plt.savefig("./plots/000_wind_speed_december.pdf", bbox_inches="tight")

## Wind Degree Histogram

In [None]:
# Distribution of wind direction readings with a KDE overlay.
ax = sns.histplot(data=df, x="windDegree", kde=True)
ax.set_xlabel("Wind Degree")

plt.savefig("./plots/000_wind_degree_histogram.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_wind_degree_histogram.png", bbox_inches="tight")

## Wind Degree Line Plot

In [None]:
# Wind direction over the whole observation period.
ax = sns.lineplot(data=df, x="collectedAt", y="windDegree")
ax.set(ylabel="Wind Degree", xlabel="Date")
# Rotate via tick_params; set_xticklabels(get_xticklabels(), ...) would pin
# the label text with a FixedFormatter and trigger a matplotlib warning.
ax.tick_params(axis="x", labelrotation=30)

plt.savefig("./plots/000_wind_degree.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_wind_degree.png", bbox_inches="tight")

# Analyze Cloudy Conditions

## Plot Cloudiness Histogram

In [None]:
# Distribution of cloud cover readings with a KDE overlay.
ax = sns.histplot(data=df, x="cloudsAll", kde=True)
ax.set_ylabel("Num. Observations")
ax.set_xlabel("Cloudiness (%)")

plt.savefig("./plots/000_cloudiness_hist.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_cloudiness_hist.png", bbox_inches="tight")
plt.savefig("./plots/000_cloudiness_hist.pdf", bbox_inches="tight")

## Plot Cloudiness Line Plot

In [None]:
# December readings taken on the hour or half hour (minute 0 or 30).
tmp_df = df[(df["month"] == "December") & (df["minute"].isin([0, 30]))]

ax = sns.lineplot(data=tmp_df, x="collectedAt", y="cloudsAll")
ax.set(ylabel="Cloudiness (%)", xlabel="Day")

# Show the day of the month on the x axis. `date_form` is the "%d"
# DateFormatter created in the January visibility cell. The former
# set_xticklabels(get_xticklabels()) call was dropped: this formatter
# replaced its effect immediately and it only produced a matplotlib warning.
ax.xaxis.set_major_formatter(date_form)

plt.savefig("./plots/000_cloudiness_december.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_cloudiness_december.png", bbox_inches="tight")
plt.savefig("./plots/000_cloudiness_december.pdf", bbox_inches="tight")

# Analyze Precipitation

## Rain 1h

### Plot Rain 1h Histogram

In [None]:
# Distribution of hourly rainfall, with counts on a logarithmic y axis.
ax = sns.histplot(data=df, x="rain1h")
ax.set_yscale("log")
ax.set_ylabel("Num. Observations (log)")
ax.set_xlabel("Rainfall (mm/h)")

plt.savefig("./plots/000_rain_1h_histogram.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_rain_1h_histogram.png", bbox_inches="tight")
plt.savefig("./plots/000_rain_1h_histogram.pdf", bbox_inches="tight")

### Plot Rain 1h Line Plot

In [None]:
# Hourly rainfall over the whole observation period.
ax = sns.lineplot(data=df, x="collectedAt", y="rain1h")
ax.set(ylabel="Rainfall (mm/h)", xlabel="Date")

# Thin out the x axis: keep the labels at even tick positions, hide the rest.
# The labels are no longer passed through set_xticklabels first, which froze
# their text with a FixedFormatter and triggered a matplotlib warning.
for idx, label in enumerate(ax.get_xticklabels()):
    label.set_visible(idx % 2 == 0)

plt.savefig("./plots/000_rain_1h.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_rain_1h.png", bbox_inches="tight")
plt.savefig("./plots/000_rain_1h.pdf", bbox_inches="tight")

## Snow 1h

### Plot Snow 1h Histogram

In [None]:
# Distribution of hourly snowfall, with counts on a logarithmic y axis.
ax = sns.histplot(data=df, x="snow1h")
ax.set_yscale("log")
ax.set_ylabel("Num. Observations (log)")
ax.set_xlabel("Snowfall (mm/h)")

plt.savefig("./plots/000_snow_1h_histogram.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_snow_1h_histogram.png", bbox_inches="tight")
plt.savefig("./plots/000_snow_1h_histogram.pdf", bbox_inches="tight")

### Plot Snow 1h Line Plot

In [None]:
# Hourly snowfall, restricted to the months November through March.
snow_months = ["November", "December", "January", "February", "March"]
snow_season_df = df[df["month"].isin(snow_months)]

ax = sns.lineplot(data=snow_season_df, x="collectedAt", y="snow1h")
ax.set(ylabel="Snowfall (mm/h)", xlabel="Date")

# Every x tick label stays visible on this plot. The earlier label loop
# tested `idx % 1 == 0`, which is always true, so it never hid anything and
# has been removed together with the label-freezing set_xticklabels call.
# NOTE(review): if every second label was meant to be hidden (as in the rain
# plot), reintroduce the loop with `idx % 2 == 0`.

plt.savefig("./plots/000_snow_1h.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_snow_1h.png", bbox_inches="tight")
plt.savefig("./plots/000_snow_1h.pdf", bbox_inches="tight")

# Analyze Visibility and Precipitation

## Visibility vs. Rain

In [None]:
# Duplicate visibility and rainfall under display-friendly names, then
# reshape to long form (one row per timestamp and series) for a hue plot.
df["Visibility"] = df["visibility"]
df["Rain"] = df["rain1h"]

visibility_rain_wide = df[["collectedAt", "Visibility", "Rain"]]
temp_df = pd.melt(
    visibility_rain_wide,
    id_vars="collectedAt",
    var_name="cols",
    value_name="vals",
)
print(len(temp_df))
temp_df.head()

In [None]:
# Visibility and rainfall on a shared logarithmic y axis.
ax = sns.lineplot(data=temp_df, x="collectedAt", y="vals", hue="cols")
ax.set(ylabel="Visibility/Rain (log)", yscale="log", xlabel="Date")
# Rotate via tick_params; set_xticklabels(get_xticklabels(), ...) would pin
# the label text with a FixedFormatter and trigger a matplotlib warning.
ax.tick_params(axis="x", labelrotation=30)
# Blank the legend title (it would otherwise show the melt column name).
plt.legend(title="")

plt.savefig("./plots/000_visibility_rain.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_visibility_rain.png", bbox_inches="tight")
plt.savefig("./plots/000_visibility_rain.pdf", bbox_inches="tight")

## Visibility vs. Snow

In [None]:
# Duplicate snowfall under a display-friendly name, then reshape visibility
# and snowfall to long form. The "Visibility" column is the one added to
# `df` in the rain comparison cell.
df["Snow"] = df["snow1h"]

visibility_snow_wide = df[["collectedAt", "Visibility", "Snow"]]
temp_df = pd.melt(
    visibility_snow_wide,
    id_vars="collectedAt",
    var_name="cols",
    value_name="vals",
)
print(len(temp_df))
temp_df.head()

In [None]:
# Visibility and snowfall on a shared logarithmic y axis.
ax = sns.lineplot(data=temp_df, x="collectedAt", y="vals", hue="cols")
ax.set(ylabel="Visibility/Snow (log)", yscale="log", xlabel="Date")
# Rotate via tick_params; set_xticklabels(get_xticklabels(), ...) would pin
# the label text with a FixedFormatter and trigger a matplotlib warning.
ax.tick_params(axis="x", labelrotation=30)
# Blank the legend title (it would otherwise show the melt column name).
plt.legend(title="")

plt.savefig("./plots/000_visibility_snow.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_visibility_snow.png", bbox_inches="tight")
plt.savefig("./plots/000_visibility_snow.pdf", bbox_inches="tight")

## Visibility vs. Precipitation

In [None]:
# (Re)create the display-named copies of visibility, rain and snow, then
# reshape all three to long form for a single hue plot.
df["Visibility"] = df["visibility"]
df["Rain"] = df["rain1h"]
df["Snow"] = df["snow1h"]

precipitation_wide = df[["collectedAt", "Visibility", "Rain", "Snow"]]
temp_df = pd.melt(
    precipitation_wide,
    id_vars="collectedAt",
    var_name="cols",
    value_name="vals",
)
print(len(temp_df))
temp_df.head()

In [None]:
# Visibility, rain and snow on a shared logarithmic y axis.
ax = sns.lineplot(data=temp_df, x="collectedAt", y="vals", hue="cols", alpha=0.7)
ax.set(ylabel="Visibility/Precipitation (log)", yscale="log", xlabel="Date")
# Blank the legend title (it would otherwise show the melt column name).
plt.legend(title="")

# Thin out the x axis: keep the labels at even tick positions, hide the rest.
# The labels are no longer passed through set_xticklabels first, which froze
# their text with a FixedFormatter and triggered a matplotlib warning.
for idx, label in enumerate(ax.get_xticklabels()):
    label.set_visible(idx % 2 == 0)

plt.savefig("./plots/000_visibility_precipitation.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_visibility_precipitation.png", bbox_inches="tight")
plt.savefig("./plots/000_visibility_precipitation.pdf", bbox_inches="tight")

# Show Correlation Between Visibility, Temperature and Precipitation

In [None]:
# Pairwise correlation (pandas' default method) between visibility,
# temperature and the two precipitation measures.
correlation_columns = ["visibility", "temperature", "rain1h", "snow1h"]
data_to_correlate = df[correlation_columns]

corr_mat = data_to_correlate.corr()

In [None]:
# Display the columns that feed the correlation matrix.
data_to_correlate

In [None]:
# Short axis labels, in the same order as the columns of `corr_mat`.
labels = ["Vis.", "Temp.", "Rain", "Snow"]

ax = sns.heatmap(
    corr_mat,
    xticklabels=labels,
    yticklabels=labels,
    annot=True,
    linewidth=0.5,
)
# Put the column labels above the matrix and tilt them.
ax.xaxis.tick_top()
plt.xticks(rotation=45)

plt.savefig("./plots/000_correlations.eps", format="eps", bbox_inches="tight")
plt.savefig("./plots/000_correlations.png", bbox_inches="tight")
plt.savefig("./plots/000_correlations.pdf", bbox_inches="tight")

- There is little sense in looking for correlations between these attributes and the time period (e.g. day of week), because nature is not bound to societal constructs such as daily routines.