# Visualizations - Pandas
## Pandas and matplotlib
You should now be able to control matplotlib charts quite well. Now see how to combine pandas with matplotlib objects.

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML

### Milan weather data
Let us begin by importing the weather dataset.

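The data come from one weather station in the Milano data sets. Each line below lists a sensor id, the station name, its latitude and longitude, the measured quantity and its unit: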

https://dandelion.eu/datagems/SpazioDati/milano-weather-station-data/resource/
* 6045,Milano - via Filippo Juvara,45.473622,9.220392,Wind Direction,degree
* 5908,Milano - via Filippo Juvara,45.473622,9.220392,Precipitation,mm
* 6502,Milano - via Filippo Juvara,45.473622,9.220392,Atmospheric Pressure,hPa
* 6457,Milano - via Filippo Juvara,45.473622,9.220392,Net Radiation,W/m^2
* 5909,Milano - via Filippo Juvara,45.473622,9.220392,Temperature,Celsius degree
* 6179,Milano - via Filippo Juvara,45.473622,9.220392,Relative Humidity,%
* 6129,Milano - via Filippo Juvara,45.473622,9.220392,Wind Speed,m/s

In [None]:
# (sensor id, column name) pairs; every sensor is stored in its own CSV file.
sets = [
    ("6045", "windDirection"),
    ("5908", "precipitation"),
    ("6502", "pressure"),
    ("6457", "radiation"),
    ("5909", "temp"),
    ("6179", "humidity"),
    ("6129", "windSpeed"),
]

# Build one wide frame by merging the per-sensor files one after another.
milano = None
for sensor_id, column in sets:
    csv_path = "/kaggle/input/intro-visual-data/datasets/Milano_WeatherPhenomena/mi_meteo_" + sensor_id + ".csv"
    # Columns are named explicitly: code, date, and the measurement itself.
    df = pd.read_csv(csv_path, names=["code", "date", column])
    print(df.shape)
    # The code column is not needed once the measurement column is named.
    df = df.drop(columns="code")
    if milano is None:
        milano = df
    else:
        # pandas sees there is only one common column to perform merge on (date)
        milano = milano.merge(df)

In [None]:
# Parse the date strings into real timestamps so the .dt accessor can be used.
milano["date"] = pd.to_datetime(milano["date"])
display(milano.head())

# Sanity checks: missing values per column, then the resulting column types.
for report in (milano.isnull().sum(), milano.dtypes):
    print(report)

Create columns which tell us something useful about the date.

In [None]:
# Derive calendar features from the timestamp through the .dt accessor;
# each attribute name doubles as the name of the new column.
for part in ("month", "weekday", "hour"):
    milano[part] = getattr(milano["date"].dt, part)

See how hourly temperatures change from November to December.

In [None]:
# Average every numeric measurement per (month, hour of day).
# numeric_only=True keeps the datetime `date` column out of the aggregation:
# pandas 2.x no longer drops non-numeric columns silently, so without it the
# averaged timestamps would end up in monthDay and distort the overview plot.
monthDay = milano.groupby(["month", "hour"]).mean(numeric_only=True)
monthDay.plot()

In [None]:
# Display the aggregated table: one row per (month, hour) pair.
monthDay

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))

# Draw one line per month: the hourly mean temperature of that month.
temperature = monthDay["temp"]
months = monthDay.index.get_level_values("month").unique()
for month in months:
    hourly = temperature.loc[month]
    hourly.plot(ax=ax, label=month)

ax.set_title("Temperature")
ax.legend()
plt.show()

It seems that December is considerably colder. How do wind speed and humidity change? Assume that these indicators are less important and you want them to take less space on the chart.

In [None]:
# Three stacked panels; the first one is taller (height ratio 3 vs. 2 and 2).
fig, ax = plt.subplots(
    3, 1, figsize=(8, 8), gridspec_kw={'height_ratios': [3, 2, 2]}
)

# (column in monthDay, panel title) for each panel, top to bottom.
panels = [
    ("temp", "Temperature"),
    ("humidity", "Humidity"),
    ("windSpeed", "Wind speed"),
]
months = monthDay.index.get_level_values("month").unique()

for panel, (column, title) in zip(ax, panels):
    # One line per month on the current panel.
    for month in months:
        monthDay[column].loc[month].plot(ax=panel, label=month)
    panel.legend()
    panel.set_title(title)

plt.tight_layout()
plt.show()
                 

You may also want to compare humidity and temperature on one chart.

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,5))
# Second y-axis on the same x-axis, so temperature and humidity each get
# their own scale.
ax1 = ax.twinx()

months = monthDay.index.get_level_values("month").unique()

# Solid lines: temperature, read against the left-hand axis.
for month in months:
    monthDay["temp"].loc[month].plot(ax=ax, label=month)
ax.legend()

# Dashed lines: humidity, read against the right-hand axis.
for month in months:
    monthDay["humidity"].loc[month].plot(ax=ax1, label=str(month) + "- Humidity", style="--")
ax1.legend(loc=2)

# The x label goes on the primary axes: twinx() hides the x-axis of the twin,
# so a label set on ax1 would never be drawn.
ax.set_xlabel('Hour')
ax.set_ylabel('Temperature [C]')
ax1.set_ylabel('Humidity [%]')

# Lay out the figure last, so the labels added above are taken into account
# and are not clipped at the figure edge.
fig.tight_layout()
plt.show()

### Airport and air routes data
Now you will use a well-known and interesting dataset about airports and air routes.
* https://openflights.org/data.html

As usual, let us begin with reading the dataset, checking its shape and column types, which we improve if necessary. We will also get rid of unnecessary columns.

In [None]:
# Airports: name every column of the file, then keep all but the last two.
# airportID is read as object (string) - presumably to match the ID columns
# of the routes file it is merged with later; confirm against the data.
cols = ['airportID', 'name', 'city', 'country', 'IATA', 'ICAO', 'lat', 'lon', 'altitude', 'timezone', 'DST', 'tz', 'type', 'source']
airports = pd.read_csv(
    "/kaggle/input/intro-visual-data/datasets/air/airports.bin",
    sep=',',
    names=cols,
    dtype={'airportID': object},
)
cols = [name for name in cols if name not in ('type', 'source')]
airports = airports[cols]

# Routes: same approach, dropping only the codeshare column.
cols = ['airline', 'airlineID', 'sourceAirport', 'sourceAirportID', 'destAirport', 'destAirportID', 'codeshare', 'stops', 'equipment']
routes = pd.read_csv(
    "/kaggle/input/intro-visual-data/datasets/air/routes.bin",
    sep=',',
    names=cols,
)
cols = [name for name in cols if name != 'codeshare']
routes = routes[cols]

In [None]:
# Quick look at both frames: sizes first, then sample rows, then column types.
print("Airports", airports.shape, "Routes", routes.shape)
for frame in (airports, routes):
    display(frame.head())
for frame in (airports, routes):
    print(frame.dtypes)

Just in case, check before merging how many routes are without a corresponding airport. For 67000 routes, 850 unknown airports is not that bad.

In [None]:
# For each (route key, airport key) pair, count the routes whose key has no
# counterpart among the airports. Checked by numeric ID and by IATA code,
# for both the departure and the arrival side.
key_pairs = [
    (routes.sourceAirportID, airports.airportID),
    (routes.sourceAirport, airports.IATA),
    (routes.destAirportID, airports.airportID),
    (routes.destAirport, airports.IATA),
]
for route_keys, airport_keys in key_pairs:
    is_known = route_keys.isin(airport_keys)
    print((~is_known).sum())

Now merge on the proper columns, using the "inner" option: in this case we do not care about routes with an unidentified airport. Do a double merge, so that you know the latitude and longitude of both the departure and the arrival airport.
* Why do we choose "inner" instead of "right"? What would the shape of the routAir dataframe be if you chose "right"? Would it make sense?

In [None]:
# First merge: attach the departure airport's details to every route.
routAir = routes.merge(airports, left_on="sourceAirportID", right_on="airportID", how="inner")
# Second merge: attach the arrival airport's details; its columns clash with
# the departure ones and therefore receive the "_dest" suffix.
# This merge is "inner" as well, so routes with an unidentified destination
# are dropped just like routes with an unidentified origin (how="left" kept
# them with missing destination coordinates).
routAir = routAir.merge(airports, left_on="destAirportID", right_on="airportID", how="inner", suffixes=("", "_dest"))
print(routAir.shape)
display(routAir.head())

You may want to know the distance of the routes. They are not available directly in the dataset, but fortunately we have geographical coordinates of both airports (disregard stopovers).
The geopy library will be useful in this case.

In [None]:
from geopy.distance import distance

In [None]:
# If the geopy import fails, uncomment the next line, run it, and re-run the import:
#!pip install geopy

In [None]:
def _route_km(origin, destination):
    """Return the distance in kilometres between two (lat, lon) pairs.

    Returns NaN when geopy cannot accept the coordinates (it signals this
    with ValueError/TypeError), so a single bad route does not abort the
    whole computation. Any other exception is allowed to propagate instead
    of being silently swallowed.
    """
    try:
        return distance(origin, destination).km
    except (TypeError, ValueError):
        return np.nan


# One distance per route, between the departure and the arrival airport.
distances = [
    _route_km((src_lat, src_lon), (dst_lat, dst_lon))
    for src_lat, src_lon, dst_lat, dst_lon in zip(
        routAir.lat, routAir.lon, routAir.lat_dest, routAir.lon_dest
    )
]
routAir["distance"] = distances

Choose only flights departing from European airports.

In [None]:
# Keep routes whose departure airport has a timezone name containing "Europe".
# na=False treats a missing timezone as "no match"; otherwise the mask would
# contain NaN and boolean indexing would raise.
euro = routAir.loc[routAir.tz.str.contains("Europe", na=False)]
euro.shape

Count some interesting aggregate values.

In [None]:
# Per departure airport: number of routes (non-null airline entries) and the
# mean route distance.
aggregations = {"airline": "count", "distance": "mean"}
euroAir = euro.groupby("airportID").agg(aggregations)

In [None]:
# Square-root versions of both aggregates: (new column, source column).
for target, source in (("size", "airline"), ("sqrDistance", "distance")):
    euroAir[target] = np.sqrt(euroAir[source])

In [None]:
# Turn the group key (airportID) from the index back into a regular column.
euroAir = euroAir.reset_index()

In [None]:
# No key is given, so pandas joins on the columns the two frames have in
# common - here that is only one column.
euroAir = euroAir.merge(right=airports)


Now create a more interesting plot. Draw the airports at their geographical coordinates, using color for the size of an airport (the square root of its number of routes) and marker size for the mean distance of its flights.

In [None]:
import matplotlib.cm as cmaps

# Airports at their coordinates: marker area follows the mean route distance,
# marker colour follows the airport "size" column.
euroAir.plot(
    x="lon",
    y="lat",
    kind="scatter",
    figsize=(13, 8),
    s=euroAir["distance"] / 25,
    c=euroAir["size"],
    cmap=cmaps.viridis,
    alpha=0.7,
)

This chart may look pretty, but as long as coordinates are not exact and you do not see country borders, it is not that useful.

In [None]:
# Airport size against the square root of the mean route distance.
euroAir.plot(x="sqrDistance", y="size", kind="scatter", figsize=(12, 8))