-
Notifications
You must be signed in to change notification settings - Fork 0
/
3 Practice.R
135 lines (104 loc) · 4.03 KB
/
3 Practice.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# 1. Term frequency in Agatha Christie's books --------------------
# Setup: tidyverse text-mining stack plus gutenbergr for Project Gutenberg
# downloads and data.table for fast metadata filtering.
library(dplyr)
library(tidytext)
library(ggplot2)
library(data.table)
library(gutenbergr)
# Disable scientific notation so term frequencies print as plain decimals
options(scipen = 999)
# Hoping to find some Agatha Christie books and analyze them
authors_dt <- data.table(gutenberg_authors)
authors_dt[author == "Christie, Agatha"]
# pull all books written by her
gutenberg_metadata_dt <- data.table(gutenberg_metadata)
gutenberg_metadata_dt[author == "Christie, Agatha"]
# A bit of a disappointment - only two are found. Let's see what they have
# Extract just the Gutenberg ids for the download call below
ids_christie_books <- gutenberg_metadata_dt[
author == "Christie, Agatha",
gutenberg_id]
# Download full text of both books (one row per line of text)
books <- gutenberg_download(ids_christie_books)
# saving in case I need to work while on the road
# (cache to disk so the script can be re-run offline)
saveRDS(books, file = "books.RDS")
books <- readRDS("books.RDS")
# Counting the words by book: tokenize each text line into single words,
# then count occurrences of each word within each book
book_words <- books %>%
  unnest_tokens(word, text) %>%
  count(gutenberg_id, word, sort = TRUE)
# Total number of words per book -- the denominator for term frequency
total_words <- book_words %>%
  group_by(gutenberg_id) %>%
  summarize(total = sum(n))
# Explicit join key: avoids dplyr's implicit key guessing (and its
# "Joining, by = ..." message), and stays stable if either table
# later gains extra shared columns
book_words <- left_join(book_words, total_words, by = "gutenberg_id")
# One of the books is a lot longer than the other
# As expected, most of the words are very widespread, there are some, which
# are not at all
ggplot(book_words, aes(n / total, fill = gutenberg_id)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~gutenberg_id, ncol = 2, scales = "free_y")
# Zipf's Law ------------
# Rank must be computed WITHIN each book: book_words is sorted by n across
# both books combined, so a global row_number() would interleave the two
# books' ranks and distort the rank/frequency relationship for each one.
freq_by_rank <- book_words %>%
  group_by(gutenberg_id) %>%
  mutate(rank = row_number(),
         term_frequency = n / total) %>%
  ungroup()
# About the same as the previous one
ggplot(freq_by_rank,
       aes(rank, term_frequency, col = as.factor(gutenberg_id))) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10()
# when we try to fit a line on it, we get about the same
# Drop the extreme head and tail of the rank range before fitting, as is
# conventional for Zipf fits
rank_subset <- freq_by_rank %>%
  filter(rank < 500, rank > 10)
# Pooled fit, then one fit per book (ids 1155 and 863)
lm(log10(rank) ~ log10(term_frequency), data = rank_subset)
lm(log10(rank) ~ log10(term_frequency),
   data = rank_subset[rank_subset$gutenberg_id == 1155, ])
lm(log10(rank) ~ log10(term_frequency),
   data = rank_subset[rank_subset$gutenberg_id == 863, ])
# The fitted slopes and intercepts differ noticeably depending on which data
# the regression is run on -- the two books are quite different in their
# logged frequencies. Overlay all three fitted lines as dashed gray
# references on top of the rank/frequency curves.
zipf_plot <- ggplot(freq_by_rank,
                    aes(rank, term_frequency,
                        col = as.factor(gutenberg_id))) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(intercept = -0.45, slope = -0.96,
              color = "gray50", linetype = 2) +
  geom_abline(intercept = -0.58, slope = -0.99,
              color = "gray50", linetype = 2) +
  geom_abline(intercept = -0.44, slope = -0.987,
              color = "gray50", linetype = 2)
zipf_plot
# Calculating the tf_idf: words frequent in one book but rare in the other
book_words <- book_words %>%
  bind_tf_idf(word, gutenberg_id, n)
# Inspect the highest tf-idf terms (total column is no longer informative)
book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
# Plot the 15 highest tf-idf words per book.
# slice_max() replaces the superseded top_n() and names the ranking column
# explicitly instead of silently selecting the table's last column.
book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(gutenberg_id) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = gutenberg_id)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~gutenberg_id, ncol = 2, scales = "free") +
  coord_flip()
# We do have some words in the Tommy and Tuppence book that we shouldn't have
# Some clean up required
# Fast fix for the encoding problem: iconv() returns NA for any word that
# cannot be converted to ASCII, so keep only the rows that convert cleanly.
# A logical mask replaces the original -grep("dat2", ...) negative index,
# which had two bugs: (1) grep() returns integer(0) when nothing matches,
# and book_words[-integer(0), ] selects ZERO rows, wiping out the data;
# (2) the "dat2" sentinel would also drop any genuine word containing
# the substring "dat2".
book_words <- book_words[
  !is.na(iconv(book_words$word, "latin1", "ASCII")), ]
# Re-draw the top-15 tf-idf words per book after the cleanup
book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(gutenberg_id) %>%
  top_n(15) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = gutenberg_id)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~gutenberg_id, ncol = 2, scales = "free") +
  coord_flip()
# We still talk about mostly names, which is good. Surprising amount of taxis in
# the Tommy and Tuppence book as opposed to the Poirot one, but given how he
# likes to sit tight and let the little gray cells do their job, perhaps not
# that surprising.