-
Notifications
You must be signed in to change notification settings - Fork 20
/
homework-solutions.rmd
128 lines (105 loc) · 4.5 KB
/
homework-solutions.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
---
title: "CEU DV3: Example homework solutions"
author: "Gergely Daroczi"
date: "January 24, 2024"
output: html_document
---
```{r suppress, echo = FALSE}
# Document-wide chunk defaults: hide the source code of every chunk and
# render all figures 15 wide by 10 high.
knitr::opts_chunk$set(
  echo = FALSE,
  fig.width = 15,
  fig.height = 10
)
```
## 0. Load the `nycflights13` package and check what datasets are bundled.
```{r}
# Attach the package so that its datasets can be referenced by name.
library(nycflights13)
# data(package = ...) returns a listing whose `results` element describes
# every bundled dataset; render that element as a table.
knitr::kable(
  data(package = 'nycflights13')[['results']]
)
```
## 1. Visualize the distribution of arrival delays per origin airport!
```{r}
library(ggplot2)
# One box per origin airport, summarising the arrival delays of its flights.
ggplot(data = flights, mapping = aes(x = origin, y = arr_delay)) +
  geom_boxplot()
```
## 2. Visualize the distribution of arrival delays per destination airport! Note that the x axis labels need to be rotated, and spend some time cleaning up the axis and plot titles! You might need to tweak the `fig.height` and `fig.width` parameters of the code chunk in the Rmd!
```{r}
# One box per destination airport, summarising the arrival delays.
ggplot(flights, aes(dest, arr_delay)) +
  geom_boxplot() +
  # Turn the destination codes by 90 degrees so the x axis labels stay
  # readable despite the large number of destinations.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # The destination codes are self-explanatory, so the x axis title is blank.
  xlab('') +
  ylab('Arrival delays (minutes)') +
  ggtitle('Flights departing from NY in 2013')
```
## 3. Compute and visualize the average arrival delay per destination! Make sure to handle `NA`s and order the barplot by the average delay.
```{r}
library(data.table)
# Mean arrival delay per destination; `na.rm = TRUE` skips the flights that
# have no recorded arrival delay.
dt <- data.table(flights)[, .(delay = mean(arr_delay, na.rm = TRUE)), by = dest]
# If none of a destination's flights has an arrival delay, the mean is NaN.
# Drop such rows here instead of letting ggplot2 discard them with a warning.
dt <- dt[!is.na(delay)][order(delay)]
# Store the sorted order in the factor levels so the bars are drawn in
# increasing order of the average delay.
dt[, dest := factor(dest, levels = dest)]
ggplot(dt, aes(dest, delay)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab('') +
  ylab('Average arrival delays (minutes)') +
  ggtitle('Flights departing from NY in 2013')
```
Using `reorder` without an explicit `factor` call:
```{r}
# Mean arrival delay per destination; `na.rm = TRUE` skips the flights that
# have no recorded arrival delay.
dt <- data.table(flights)[, .(delay = mean(arr_delay, na.rm = TRUE)), by = dest]
# If none of a destination's flights has an arrival delay, the mean is NaN.
# Drop such rows here instead of letting ggplot2 discard them with a warning.
dt <- dt[!is.na(delay)]
# reorder() derives the factor levels from `delay` inside the aesthetic
# mapping, so the table itself does not need to be sorted beforehand.
ggplot(dt, aes(reorder(dest, delay), delay)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab('') +
  ylab('Average arrival delays (minutes)') +
  ggtitle('Flights departing from NY in 2013')
```
Note that this is not a `ggplot2` trick: `reorder` comes from base R and sets the factor levels on the fly, instead of storing them in the dataset.
## 4. Redo the above plot by showing the actual name of the destination airport instead of its FAA ID!
```{r}
# Attach the airport name to every flight. The join column is spelled out
# instead of being inferred from the column names the two tables share.
# merge() keeps only matching rows by default, so flights to destinations
# that are missing from `airports` are left out.
dt <- merge(
  data.table(flights),
  data.table(airports)[, .(dest = faa, name)],
  by = 'dest')
# Mean arrival delay per airport name, ignoring missing delays.
dt <- dt[, .(delay = mean(arr_delay, na.rm = TRUE)), by = name]
# If none of an airport's flights has an arrival delay, the mean is NaN.
# Drop such rows here instead of letting ggplot2 discard them with a warning.
dt <- dt[!is.na(delay)][order(delay)]
# Store the sorted order in the factor levels so the bars are drawn in
# increasing order of the average delay.
dt[, name := factor(name, levels = name)]
ggplot(dt, aes(name, delay)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  xlab('') +
  ylab('Average arrival delays (minutes)') +
  ggtitle('Flights departing from NY in 2013')
```
## 5. Color the bars by the timezone of the airport! Make sure to render the legend on the top, using a single line.
```{r}
# Attach the airport name and timezone to every flight. The join column is
# spelled out instead of being inferred from the shared column names.
# merge() keeps only matching rows by default, so flights to destinations
# that are missing from `airports` are left out.
dt <- merge(
  data.table(flights),
  data.table(airports)[, .(dest = faa, name, tzone)],
  by = 'dest')
# Mean arrival delay per airport; `tzone` is part of the grouping so that it
# is carried along for the fill colour.
dt <- dt[, .(delay = mean(arr_delay, na.rm = TRUE)), by = .(name, tzone)]
# If none of an airport's flights has an arrival delay, the mean is NaN.
# Drop such rows here instead of letting ggplot2 discard them with a warning.
dt <- dt[!is.na(delay)][order(delay)]
# Store the sorted order in the factor levels so the bars are drawn in
# increasing order of the average delay.
dt[, name := factor(name, levels = name)]
ggplot(dt, aes(name, delay, fill = tzone)) +
  geom_col() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = 'top') +
  xlab('') +
  ylab('Average arrival delays (minutes)') +
  # nrow = 1 keeps all legend keys on a single line.
  guides(fill = guide_legend(title = 'Timezone', nrow = 1)) +
  ggtitle('Flights departing from NY in 2013')
```
## 6. Geocode the destination airports, then visualize them on a world map with the point sizes reflecting the number of flights arriving there from NY!
```{r}
# Number of flights per destination, plus a free-text query for the geocoder
# (the destination code followed by the word "airport").
dt <- data.table(flights)[, .N, by = dest]
dt[, address := paste(dest, 'airport')]

# Geocoding looks up every destination through tidygeocoder, so reuse the
# result stored by an earlier run instead of repeating the lookups each time
# the document is knitted.
# NOTE(review): the file is written with saveRDS() despite its `.rda`
# extension; the path is kept unchanged so that already stored results stay
# usable. Delete the file to force a fresh lookup.
geocoded_file <- '/tmp/dt.rda'
if (file.exists(geocoded_file)) {
  dt <- readRDS(geocoded_file)
} else {
  dt <- tidygeocoder::geocode(dt, 'address')
  saveRDS(dt, geocoded_file)
}

# World outline as the background layer, destinations on top with the point
# size mapped to the number of flights.
world <- map_data('world')
ggplot() +
  geom_map(data = world, map = world, aes(long, lat, map_id = region)) +
  geom_point(data = dt, aes(long, lat, size = N), color = 'orange') +
  coord_fixed(1.3) +
  theme_void()
```
## 7. Compute the average departure and arrival delays, also the average air time and distance per destination, then pass the scaled dataframe through MDS to visualize the similarities/dissimilarities of the destination airports!
```{r}
# Per-destination averages of the four numeric features; missing values are
# ignored when averaging.
dt <- data.table(flights)[
  ,
  list(
    dep_delay = mean(dep_delay, na.rm = TRUE),
    arr_delay = mean(arr_delay, na.rm = TRUE),
    air_time = mean(air_time, na.rm = TRUE),
    distance = mean(distance, na.rm = TRUE)
  ),
  by = dest
]
# Standardise the four feature columns, compute the pairwise distances
# between destinations and project them with classical MDS.
mds <- cmdscale(dist(scale(dt[, .(dep_delay, arr_delay, air_time, distance)])))
mds <- as.data.frame(mds)
# Keep the destination code next to the coordinates for labelling.
mds[['faa']] <- dt[['dest']]
library(ggrepel)
# Text labels only; ggrepel pushes overlapping labels apart.
ggplot(mds, aes(x = V1, y = V2, label = faa)) +
  geom_text_repel() +
  theme_void()
```