-
Notifications
You must be signed in to change notification settings - Fork 1
/
final project_v2.html
1477 lines (1451 loc) · 178 KB
/
final project_v2.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.3.353">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="author" content="Carlos San Juan, Eric Hausken-Brates">
<meta name="dcterms.date" content="2024-03-26">
<title>Final Project: A text mining analysis of the Harry Potter films</title>
<style>
/* Base rules for generic document elements. */
/* Code keeps its whitespace but is allowed to wrap at line ends. */
code{white-space: pre-wrap;}
/* Spans marked as small caps. */
span.smallcaps{font-variant: small-caps;}
/* Multi-column layout: a flex row whose gap is capped at 1.5em. */
div.columns{display: flex; gap: min(4vw, 1.5em);}
/* Each column scrolls horizontally rather than overflowing. */
div.column{flex: auto; overflow-x: auto;}
/* Hanging indent: the negative text-indent cancels the left margin on the
   first line only, so every following line is indented by 1.5em. */
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
/* Task lists have no bullet markers; the rule below sizes and positions
   their checkboxes. */
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
vertical-align: middle;
}
/* CSS for syntax highlighting */
/* Highlighted code does not wrap; each direct child span of the code
   element is laid out as an inline-block with a fixed line height. */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
/* Empty spans get an explicit height so they still occupy vertical space. */
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
/* Spans inherit colour and text decoration from the enclosing code element. */
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
/* On screen, the wrapper of a code block scrolls when its content overflows
   (this later rule overrides the generic overflow: visible above). */
@media screen {
div.sourceCode { overflow: auto; }
}
/* In print, code wraps instead; continuation lines hang 5em to the right of
   the first line (negative text-indent plus matching left padding). */
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
/* Numbered listings: a CSS counter named source-line is reset on the code
   element, incremented once per span, and printed through the ::before
   pseudo-element of the first anchor in each span. */
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
/* The generated number is right-aligned in a 4em-wide box; the user-select
   declarations (with vendor prefixes) keep it out of text selections. */
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
}
pre.numberSource { margin-left: 3em; padding-left: 4px; }
/* Empty rule: it contains no declarations and therefore has no effect. */
div.sourceCode
{ }
/* On screen, the ::before content of each span's first anchor is underlined. */
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>
<script src="final project_v2_files/libs/clipboard/clipboard.min.js"></script>
<script src="final project_v2_files/libs/quarto-html/quarto.js"></script>
<script src="final project_v2_files/libs/quarto-html/popper.min.js"></script>
<script src="final project_v2_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="final project_v2_files/libs/quarto-html/anchor.min.js"></script>
<link href="final project_v2_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="final project_v2_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="final project_v2_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="final project_v2_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="final project_v2_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">Table of contents</h2>
<ul class="collapse">
<li><a href="#libraries" id="toc-libraries" class="nav-link active" data-scroll-target="#libraries"><span class="header-section-number">0.1</span> Libraries</a></li>
<li><a href="#introduction" id="toc-introduction" class="nav-link" data-scroll-target="#introduction"><span class="header-section-number">0.2</span> Introduction</a></li>
<li><a href="#databases" id="toc-databases" class="nav-link" data-scroll-target="#databases"><span class="header-section-number">0.3</span> Databases</a></li>
<li><a href="#initial-hypothesis" id="toc-initial-hypothesis" class="nav-link" data-scroll-target="#initial-hypothesis"><span class="header-section-number">0.4</span> Initial Hypothesis</a></li>
<li><a href="#tf-idf" id="toc-tf-idf" class="nav-link" data-scroll-target="#tf-idf"><span class="header-section-number">1</span> TF-IDF</a></li>
<li><a href="#sentiment-analysis" id="toc-sentiment-analysis" class="nav-link" data-scroll-target="#sentiment-analysis"><span class="header-section-number">2</span> SENTIMENT ANALYSIS</a>
<ul class="collapse">
<li><a href="#bigrams" id="toc-bigrams" class="nav-link" data-scroll-target="#bigrams"><span class="header-section-number">2.1</span> Bigrams</a></li>
<li><a href="#trigrams" id="toc-trigrams" class="nav-link" data-scroll-target="#trigrams"><span class="header-section-number">2.2</span> Trigrams</a></li>
</ul></li>
<li><a href="#topic-modelling" id="toc-topic-modelling" class="nav-link" data-scroll-target="#topic-modelling"><span class="header-section-number">3</span> TOPIC MODELLING</a></li>
</ul>
</nav>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Final Project: A text mining analysis of the Harry Potter films</h1>
<p class="subtitle lead">Text Mining - UC3M</p>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Author</div>
<div class="quarto-title-meta-contents">
<p>Carlos San Juan, Eric Hausken-Brates </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">March 26, 2024</p>
</div>
</div>
</div>
</header>
<section id="libraries" class="level2" data-number="0.1">
<h2 data-number="0.1" class="anchored" data-anchor-id="libraries"><span class="header-section-number">0.1</span> Libraries</h2>
<p>The libraries we are going to use for the work are the following:</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse)</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(dplyr)</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(magrittr)</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(scales)</span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(RColorBrewer)</span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggsci)</span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggthemes)</span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(lubridate)</span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(viridis)</span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggrepel)</span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(reshape)</span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(gridExtra)</span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse)</span>
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(reshape)</span>
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(viridis)</span>
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tm)</span>
<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(SnowballC)</span>
<span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(wordcloud)</span>
<span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(NLP)</span>
<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(reshape)</span>
<span id="cb1-21"><a href="#cb1-21" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(widyr)</span>
<span id="cb1-22"><a href="#cb1-22" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(wordcloud2)</span>
<span id="cb1-23"><a href="#cb1-23" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidytext)</span>
<span id="cb1-24"><a href="#cb1-24" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(janeaustenr)</span>
<span id="cb1-25"><a href="#cb1-25" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(htmlwidgets)</span>
<span id="cb1-26"><a href="#cb1-26" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(topicmodels)</span>
<span id="cb1-27"><a href="#cb1-27" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(stringr)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="introduction" class="level2" data-number="0.2">
<h2 data-number="0.2" class="anchored" data-anchor-id="introduction"><span class="header-section-number">0.2</span> Introduction</h2>
<p>In this paper we will apply several text mining techniques to the scripts of the Harry Potter films to reveal patterns and trends in the narrative, the characters and the emotions they may have experienced. Using natural language processing, we will look for insights into plot evolution and emotional development throughout the saga, offering a new perspective on one of the most iconic universes of literature and cinema.</p>
<p>Before we continue, a word of warning: this work was made by big fans of the saga, so we have done it with great affection, and we apologise in advance if our enthusiasm shows too much. We should also warn that we may include some spoilers, but we promise that they will be small (not related to the plot). For that reason, we recommend that readers watch the movies first or, better yet, read the books. You will thank us when you finish them.</p>
</section>
<section id="databases" class="level2" data-number="0.3">
<h2 data-number="0.3" class="anchored" data-anchor-id="databases"><span class="header-section-number">0.3</span> Databases</h2>
<p>The database we will use in this project has been compiled from GitHub and is directly accessible through the following link: <a href="https://github.com/Kornflex28/hp-dataset/tree/main/datasets">GitHub - HP Dataset</a>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>hp1 <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"hp1.csv"</span>)</span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>hp2 <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"hp2.csv"</span>)</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a>hp3 <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"hp3.csv"</span>)</span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a>hp4 <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"hp4.csv"</span>)</span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a>hp5 <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"hp5.csv"</span>)</span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a>hp6 <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"hp6.csv"</span>)</span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a>hp7 <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"hp7.csv"</span>)</span>
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a>hp8 <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"hp8.csv"</span>)</span>
<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Fix misspelling of movie #4</span></span>
<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a>hp4 <span class="ot"><-</span> hp4 <span class="sc">|></span> </span>
<span id="cb2-12"><a href="#cb2-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">movie =</span> <span class="fu">str_replace_all</span>(<span class="at">string =</span> movie, <span class="at">pattern =</span> <span class="st">"Gobelt"</span>, <span class="at">replacement =</span> <span class="st">"Goblet"</span>))</span>
<span id="cb2-13"><a href="#cb2-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-14"><a href="#cb2-14" aria-hidden="true" tabindex="-1"></a>df <span class="ot"><-</span> <span class="fu">rbind</span>(hp1,hp2,hp3,hp4,hp5,hp6,hp7,hp8)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<section id="data-wrangling" class="level3">
<h3 class="anchored" data-anchor-id="data-wrangling">Data wrangling</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>movie_order <span class="ot"><-</span> <span class="fu">tribble</span>(<span class="sc">~</span>num, <span class="sc">~</span>movie, <span class="sc">~</span>film.name,</span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a> <span class="dv">1</span>, <span class="st">"Harry Potter and the Philosopher's Stone"</span>, <span class="st">"1-Philosopher's Stone"</span>,</span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a> <span class="dv">2</span>, <span class="st">"Harry Potter and the Chamber of Secrets"</span>, <span class="st">"2-Chamber of Secrets"</span>,</span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a> <span class="dv">3</span>, <span class="st">"Harry Potter and the Prisoner of Azkaban"</span>, <span class="st">"3-Prisoner of Azkaban"</span>,</span>
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a> <span class="dv">4</span>, <span class="st">"Harry Potter and the Goblet of Fire"</span>, <span class="st">"4-Goblet of Fire"</span>,</span>
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a> <span class="dv">5</span>, <span class="st">"Harry Potter and the Order of the Phoenix"</span>, <span class="st">"5-Order of the Phoenix"</span>,</span>
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a> <span class="dv">6</span>, <span class="st">"Harry Potter and the Half-Blood Prince"</span>, <span class="st">"6-Half-Blood Prince"</span>,</span>
<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a> <span class="dv">7</span>, <span class="st">"Harry Potter and the Deathly Hallows Part 1"</span>, <span class="st">"7-Deathly Hallows Part 1"</span>,</span>
<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a> <span class="dv">8</span>, <span class="st">"Harry Potter and the Deathly Hallows Part 2"</span>, <span class="st">"8-Deathly Hallows Part 2"</span></span>
<span id="cb3-10"><a href="#cb3-10" aria-hidden="true" tabindex="-1"></a> )</span>
<span id="cb3-11"><a href="#cb3-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb3-12"><a href="#cb3-12" aria-hidden="true" tabindex="-1"></a>df <span class="ot"><-</span> df <span class="sc">|></span> </span>
<span id="cb3-13"><a href="#cb3-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">left_join</span>(movie_order, <span class="at">by =</span> <span class="st">"movie"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
</section>
<section id="initial-hypothesis" class="level2" data-number="0.4">
<h2 data-number="0.4" class="anchored" data-anchor-id="initial-hypothesis"><span class="header-section-number">0.4</span> Initial Hypothesis</h2>
<p>Some of the questions we are going to address in the paper are:</p>
<ul>
<li>Who are the characters that have made the greatest impact on popular culture?</li>
<li>Is the number of words related to the length of the films?</li>
<li>What are the most distinctive words or characters in each film?</li>
<li>How do the most frequently used words differ from the most common bigrams and trigrams?</li>
<li>Which films, scenes, and characters have the most positive and negative sentiment throughout the series?</li>
<li>Does the grouping size of dialogue chunks affect our results in sentiment analysis?</li>
</ul>
</section>
<section id="tf-idf" class="level1" data-number="1">
<h1 data-number="1"><span class="header-section-number">1</span> TF-IDF</h1>
<section id="most-sentences-in-movies" class="level3">
<h3 class="anchored" data-anchor-id="most-sentences-in-movies">Most sentences in movies</h3>
<p>One of the most useful tools in Text Mining is the word count in each text to determine how relevant a certain word or topic may be within a corpus. This approach allows us to identify key terms, frequencies and patterns that emerge in the discourse, offering a solution to discover the predominant themes and relative importance of different concepts throughout the narrative.</p>
<p>Counting is also useful for identifying the main characters in a novel or, in this case, a film: the characters with the highest number of scripted lines should be the main ones.</p>
<p>That is the first step in our work: identifying the main characters of the different films. Luckily, we are big fans of the saga, so we can check the results quite easily, but the same approach could be applied to any script to gauge how important each character is.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Save df as a dataframe with variables 'character' and 'movie'</span></span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>Char_Dial <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="fu">table</span>(df<span class="sc">$</span>character, df<span class="sc">$</span>movie))</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Sum lines for each character throughout all movies</span></span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a>Char_Dial_Sum <span class="ot"><-</span> Char_Dial <span class="sc">%>%</span></span>
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(Var1) <span class="sc">%>%</span></span>
<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">Total_Freq =</span> <span class="fu">sum</span>(Freq)) <span class="sc">%>%</span></span>
<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">ungroup</span>()</span>
<span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Select top 10 characters with the most spoken lines</span></span>
<span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a>Char_Dial_Top10 <span class="ot"><-</span> Char_Dial_Sum <span class="sc">%>%</span></span>
<span id="cb4-12"><a href="#cb4-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(<span class="fu">desc</span>(Total_Freq)) <span class="sc">%>%</span></span>
<span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice_max</span>(Total_Freq, <span class="at">n =</span> <span class="dv">10</span>)</span>
<span id="cb4-14"><a href="#cb4-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-15"><a href="#cb4-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a graph for the top 10 characters with the most lines</span></span>
<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(Char_Dial_Top10, <span class="fu">aes</span>(<span class="at">x =</span> <span class="fu">reorder</span>(Var1, Total_Freq), <span class="at">y =</span> Total_Freq)) <span class="sc">+</span></span>
<span id="cb4-17"><a href="#cb4-17" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">width =</span> <span class="fl">0.62</span>, <span class="at">fill =</span> <span class="st">"steelblue"</span>) <span class="sc">+</span></span>
<span id="cb4-18"><a href="#cb4-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">coord_flip</span>() <span class="sc">+</span></span>
<span id="cb4-19"><a href="#cb4-19" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Characters with the most sentences"</span>,</span>
<span id="cb4-20"><a href="#cb4-20" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Top 10 across all parts of a movie series"</span>,</span>
<span id="cb4-21"><a href="#cb4-21" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Character"</span>, <span class="at">y =</span> <span class="st">"Number of sentences"</span>) <span class="sc">+</span></span>
<span id="cb4-22"><a href="#cb4-22" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb4-23"><a href="#cb4-23" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"none"</span>) <span class="co"># Remove legend because it is not relevant</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Plot produced by code cell cb4 above; the alt text summarises the chart for readers who cannot see the image. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-3-1.png" class="img-fluid" width="672" alt="Horizontal bar chart of the ten characters with the most sentences across all films; Harry Potter has the most, followed by Ron Weasley and Hermione Granger."></p>
</div>
</div>
<p>In this graph we can see who the 10 most important characters in the Harry Potter saga are.</p>
<p>As you might expect, the character who has the most lines in the films is the one who appears in the title of the films and is known as ‘The Chosen One’ or Harry Potter, to those who aren’t such big fans. Harry is followed by his best friends, <code>Ron Weasley</code> and <code>Hermione Granger</code>, completing the <code>Golden Trio</code>.</p>
<p>However, when reviewing the results as fans of the saga, we are struck by the appearance of one particular character: <code>Horace Slughorn</code>. This character appears in the sixth installment, where he is very prominent, and is largely forgotten in the last two. In addition, there are notable absences from this list, such as <code>Draco Malfoy</code>, who, despite being a very important character in the saga and one of Harry Potter’s main enemies, is not in the top 10. This could be an indication of the great impact that this character had on popular culture: everyone who has seen the films or read the books remembers him, yet according to the results obtained he speaks comparatively little on screen.</p>
<p>Next, we are going to break this analysis down by film, to observe how the lines are distributed throughout the saga. To do this, we will store the names of the characters mentioned above in the following vector, taking the liberty of replacing <code>Horace Slughorn</code> with <code>Draco Malfoy</code>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>top_characters <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"Harry Potter"</span>, <span class="st">"Ron Weasley"</span>, <span class="st">"Hermione Granger"</span>, <span class="st">"Albus Dumbledore"</span>, <span class="st">"Rubeus Hagrid"</span>, <span class="st">"Severus Snape"</span>, <span class="st">"Minerva McGonagall"</span>, <span class="st">"Voldemort"</span>,<span class="st">"Neville Longbottom"</span>, <span class="st">"Draco Malfoy"</span>)</span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a>Char_Dial <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="fu">table</span>(df<span class="sc">$</span>character, df<span class="sc">$</span>film.name))</span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a>Char_Dial <span class="sc">%>%</span></span>
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(<span class="fu">desc</span>(Freq)) <span class="sc">%>%</span></span>
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(Var1 <span class="sc">%in%</span> top_characters) <span class="sc">%>%</span></span>
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="fu">reorder</span>(Var1, <span class="sc">+</span>Freq), Freq, <span class="at">fill =</span> Var2)) <span class="sc">+</span></span>
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">width =</span> <span class="fl">0.62</span>)<span class="sc">+</span></span>
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_brewer</span>(<span class="at">type =</span> <span class="st">"div"</span>, ) <span class="sc">+</span></span>
<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">coord_flip</span>()<span class="sc">+</span></span>
<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">guides</span>(<span class="at">fill =</span> <span class="fu">guide_legend</span>(<span class="at">title.position =</span> <span class="st">"top"</span>, <span class="at">reverse =</span> T))<span class="sc">+</span></span>
<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Characters with the most sentences"</span>,</span>
<span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Top 10, by movie"</span>, <span class="at">fill =</span> <span class="st">"Movie"</span>,</span>
<span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Character"</span>, <span class="at">y =</span> <span class="st">"Number of sentences"</span>)<span class="sc">+</span></span>
<span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>()<span class="sc">+</span></span>
<span id="cb5-18"><a href="#cb5-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.title.align =</span> <span class="fl">0.5</span>, <span class="at">legend.position =</span> <span class="st">"right"</span>, <span class="at">legend.direction =</span> <span class="st">"vertical"</span>) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the preceding code cell (unnamed-chunk-4); alt text mirrors the plot's title and axis labels. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-4-1.png" class="img-fluid" width="1056" alt="Horizontal stacked bar chart titled ‘Characters with the most sentences’: number of sentences for the top 10 characters, coloured by movie."></p>
</div>
</div>
</section>
<section id="most-used-spells" class="level3">
<h3 class="anchored" data-anchor-id="most-used-spells">Most used Spells</h3>
<p>In this section, we are going to talk about magic, more specifically spells. In the world of Harry Potter, in order to do magic, you have to cast a spell in a certain way. That is why we are going to see which are the most used spells in the saga.</p>
<p>To do this, we first store in a vector all the spells that are mentioned in the films. The spells are taken from <a href="https://screenrant.com/harry-potter-spells-list-from-movies-and-books/">Screen Rant’s list of Harry Potter spells from the movies and books</a>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a>spells <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">'Accio'</span>, <span class="st">'Alohomora'</span>, <span class="st">'Avada Kedavra'</span>, <span class="st">'Crucio'</span>, <span class="st">'Expecto Patronum'</span>, <span class="st">'Expelliarmus'</span>, <span class="st">'Imperio'</span>,</span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a> <span class="st">'Lumos'</span>, <span class="st">'Obliviate'</span>, <span class="st">'Petrificus Totalus'</span>, <span class="st">'Reparo'</span>, <span class="st">'Riddikulus'</span>, <span class="st">'Sectumsempra'</span>, <span class="st">'Wingardium Leviosa'</span>)</span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co"># We add a column to identify the spell mentioned in each dialogue.</span></span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>df<span class="sc">$</span>spell <span class="ot"><-</span> <span class="cn">NA</span> <span class="co"># Initialize the variable with `NA`</span></span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Loop through the `spells` vector. For each spell, check if it is written in each line of the script. If it is present in that line, add that spell to the variable `spell` in `df` for that row. </span></span>
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span>(spell <span class="cf">in</span> spells) {</span>
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a> df<span class="sc">$</span>spell <span class="ot"><-</span> <span class="fu">ifelse</span>(<span class="fu">grepl</span>(spell, df<span class="sc">$</span>dialog, <span class="at">ignore.case =</span> <span class="cn">TRUE</span>), spell, df<span class="sc">$</span>spell)</span>
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a>}</span>
<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a><span class="co"># We calculate the count of each spell.</span></span>
<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a>spell_counts <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(<span class="sc">!</span><span class="fu">is.na</span>(spell)) <span class="sc">%>%</span> <span class="co"># Exclude lines without spells</span></span>
<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(spell, <span class="at">sort =</span> <span class="cn">TRUE</span>) <span class="co"># Count occurrences for each spell and sort in order</span></span>
<span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(viridis) <span class="co"># Make sure to have this package installed</span></span>
<span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-20"><a href="#cb6-20" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(spell_counts, <span class="fu">aes</span>(<span class="at">x =</span> <span class="fu">reorder</span>(spell, n), <span class="at">y =</span> n, <span class="at">fill =</span> spell)) <span class="sc">+</span></span>
<span id="cb6-21"><a href="#cb6-21" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>) <span class="sc">+</span></span>
<span id="cb6-22"><a href="#cb6-22" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_text</span>(<span class="fu">aes</span>(<span class="at">label =</span> n), <span class="at">position =</span> <span class="fu">position_dodge</span>(<span class="at">width =</span> <span class="fl">0.9</span>), <span class="at">hjust =</span> <span class="sc">-</span><span class="fl">0.1</span>, <span class="at">size =</span> <span class="fl">3.5</span>) <span class="sc">+</span> </span>
<span id="cb6-23"><a href="#cb6-23" aria-hidden="true" tabindex="-1"></a> <span class="fu">coord_flip</span>() <span class="sc">+</span></span>
<span id="cb6-24"><a href="#cb6-24" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_viridis</span>(<span class="at">discrete =</span> <span class="cn">TRUE</span>, <span class="at">option =</span> <span class="st">"D"</span>) <span class="sc">+</span> </span>
<span id="cb6-25"><a href="#cb6-25" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">'Spells most commonly used'</span>,</span>
<span id="cb6-26"><a href="#cb6-26" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Frequency of mentioning spells in dialogues"</span>,</span>
<span id="cb6-27"><a href="#cb6-27" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">'Spells'</span>,</span>
<span id="cb6-28"><a href="#cb6-28" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="st">'Frequency'</span>) <span class="sc">+</span></span>
<span id="cb6-29"><a href="#cb6-29" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb6-30"><a href="#cb6-30" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.title =</span> <span class="fu">element_blank</span>(), </span>
<span id="cb6-31"><a href="#cb6-31" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">12</span>, <span class="at">face =</span> <span class="st">"bold"</span>),</span>
<span id="cb6-32"><a href="#cb6-32" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">12</span>, <span class="at">face =</span> <span class="st">"bold"</span>),</span>
<span id="cb6-33"><a href="#cb6-33" aria-hidden="true" tabindex="-1"></a> <span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">14</span>, <span class="at">face =</span> <span class="st">"bold"</span>),</span>
<span id="cb6-34"><a href="#cb6-34" aria-hidden="true" tabindex="-1"></a> <span class="at">plot.subtitle =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">10</span>),</span>
<span id="cb6-35"><a href="#cb6-35" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.position =</span> <span class="st">"none"</span>) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the preceding code cell (unnamed-chunk-5); alt text mirrors the plot's title and labels. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-5-1.png" class="img-fluid" width="672" alt="Horizontal bar chart titled ‘Spells most commonly used’: frequency of each spell being mentioned in dialogues, with the count labelled on each bar."></p>
</div>
</div>
<p>The most used spells are <code>Expelliarmus</code> and <code>Expecto Patronum</code>, with a total of 12 mentions throughout all 8 movies. But let’s see how they are distributed across the series.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>Spels_df <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="fu">table</span>(df<span class="sc">$</span>character, df<span class="sc">$</span>film.name, df<span class="sc">$</span>spell))</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a>Spels_df <span class="sc">%>%</span></span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(<span class="fu">desc</span>(Freq)) <span class="sc">%>%</span></span>
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(Var3 <span class="sc">%in%</span> spells) <span class="sc">%>%</span></span>
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="fu">reorder</span>(Var3, <span class="sc">+</span>Freq), Freq, <span class="at">fill =</span> Var2)) <span class="sc">+</span></span>
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">width =</span> <span class="fl">0.62</span>) <span class="sc">+</span></span>
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_brewer</span>(<span class="at">palette =</span> <span class="st">"Set2"</span>) <span class="sc">+</span> </span>
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">coord_flip</span>() <span class="sc">+</span></span>
<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">guides</span>(<span class="at">fill =</span> <span class="fu">guide_legend</span>(<span class="at">title.position =</span> <span class="st">"top"</span>, <span class="at">title =</span> <span class="st">"Movie Part"</span>)) <span class="sc">+</span></span>
<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Spells most commonly used"</span>,</span>
<span id="cb7-12"><a href="#cb7-12" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Frequency of mentioning spells by movie"</span>,</span>
<span id="cb7-13"><a href="#cb7-13" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Spells"</span>,</span>
<span id="cb7-14"><a href="#cb7-14" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="st">"Number of appareances"</span>) <span class="sc">+</span></span>
<span id="cb7-15"><a href="#cb7-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb7-16"><a href="#cb7-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.title.align =</span> <span class="fl">0.5</span>, </span>
<span id="cb7-17"><a href="#cb7-17" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.position =</span> <span class="st">"right"</span>, </span>
<span id="cb7-18"><a href="#cb7-18" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.direction =</span> <span class="st">"vertical"</span>,</span>
<span id="cb7-19"><a href="#cb7-19" aria-hidden="true" tabindex="-1"></a> <span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">14</span>, <span class="at">face =</span> <span class="st">"bold"</span>),</span>
<span id="cb7-20"><a href="#cb7-20" aria-hidden="true" tabindex="-1"></a> <span class="at">plot.subtitle =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">12</span>),</span>
<span id="cb7-21"><a href="#cb7-21" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">12</span>),</span>
<span id="cb7-22"><a href="#cb7-22" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">12</span>),</span>
<span id="cb7-23"><a href="#cb7-23" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.text =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">10</span>)) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the preceding code cell (unnamed-chunk-6); alt text mirrors the plot's title and labels. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-6-1.png" class="img-fluid" width="672" alt="Horizontal stacked bar chart titled ‘Spells most commonly used’: frequency of spell mentions, coloured by movie."></p>
</div>
</div>
<p>However, this overview can be misleading. For example, the spell <code>Riddikulus</code> might seem to be of great importance to the plot, but in reality it is not (again, we take advantage of the fact that we are fans to notice this). The spell appears so often because numerous characters say it in a single scene set in a magic class, but it is not used again in any other film; that is why it only appears in ‘Prisoner of Azkaban’.</p>
<p>Consequently, we repeat the analysis, but this time, instead of splitting the mentions by film, we count the number of films each spell appears in:</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb8"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>Spels_df <span class="sc">|></span> </span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(Freq <span class="sc">></span> <span class="dv">0</span>) <span class="sc">|></span> </span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(Var2, Var3) <span class="sc">|></span> </span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">distinct</span>() <span class="sc">|></span> </span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>( Var3) <span class="sc">|></span> </span>
<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">y =</span> <span class="fu">reorder</span>(Var3, <span class="sc">+</span>n), <span class="at">x =</span> n, <span class="at">fill =</span> Var3 <span class="sc">==</span> <span class="st">"Riddikulus"</span>) ) <span class="sc">+</span></span>
<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">width =</span> <span class="fl">0.62</span>, <span class="at">color =</span> <span class="st">"steelblue"</span>) <span class="sc">+</span></span>
<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_manual</span>(<span class="at">values =</span> <span class="fu">c</span>( <span class="st">"steelblue"</span>, <span class="st">"peru"</span>)) <span class="sc">+</span></span>
<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Riddikulus is in only one movie"</span>,</span>
<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Number of movies mentioning this spell"</span>,</span>
<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="st">"Spells"</span>,</span>
<span id="cb8-12"><a href="#cb8-12" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Number of movies appearing in"</span>) <span class="sc">+</span> </span>
<span id="cb8-13"><a href="#cb8-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb8-14"><a href="#cb8-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.title.align =</span> <span class="fl">0.5</span>, </span>
<span id="cb8-15"><a href="#cb8-15" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.position =</span> <span class="st">"none"</span>, </span>
<span id="cb8-16"><a href="#cb8-16" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.direction =</span> <span class="st">"vertical"</span>,</span>
<span id="cb8-17"><a href="#cb8-17" aria-hidden="true" tabindex="-1"></a> <span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">14</span>, <span class="at">face =</span> <span class="st">"bold"</span>),</span>
<span id="cb8-18"><a href="#cb8-18" aria-hidden="true" tabindex="-1"></a> <span class="at">plot.subtitle =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">12</span>),</span>
<span id="cb8-19"><a href="#cb8-19" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">12</span>),</span>
<span id="cb8-20"><a href="#cb8-20" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">12</span>),</span>
<span id="cb8-21"><a href="#cb8-21" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.text =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">10</span>),</span>
<span id="cb8-22"><a href="#cb8-22" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.grid.minor =</span> <span class="fu">element_blank</span>()</span>
<span id="cb8-23"><a href="#cb8-23" aria-hidden="true" tabindex="-1"></a> ) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the preceding code cell (unnamed-chunk-7); alt text mirrors the plot's title and labels. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-7-1.png" class="img-fluid" width="672" alt="Horizontal bar chart titled ‘Riddikulus is in only one movie’: number of movies each spell appears in, with Riddikulus highlighted in a different colour."></p>
</div>
</div>
<p>Having seen how the spells are distributed by film, another way to visualize them is to break them down by the characters who cast them. Let’s get down to it:</p>
<div class="cell" data-fig.asp="0.65" data-preview="true">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>spell_character_counts <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(spell <span class="sc">%in%</span> spells) <span class="sc">%>%</span> </span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(spell, character) <span class="sc">%>%</span></span>
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(spell, <span class="fu">desc</span>(n))</span>
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Create the graph</span></span>
<span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(spell_character_counts, <span class="fu">aes</span>(<span class="at">y =</span> <span class="fu">reorder</span>(character, n), <span class="at">x =</span> n, <span class="at">fill =</span> character, <span class="at">label =</span> n)) <span class="sc">+</span></span>
<span id="cb9-8"><a href="#cb9-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>) <span class="sc">+</span></span>
<span id="cb9-9"><a href="#cb9-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_text</span>(<span class="at">hjust =</span> <span class="sc">-</span><span class="dv">1</span>) <span class="sc">+</span></span>
<span id="cb9-10"><a href="#cb9-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">facet_wrap</span>(<span class="sc">~</span> spell, <span class="at">scales =</span> <span class="st">"free_y"</span>) <span class="sc">+</span> </span>
<span id="cb9-11"><a href="#cb9-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_viridis_d</span>(<span class="at">begin =</span> <span class="fl">0.2</span>, <span class="at">end =</span> <span class="fl">0.8</span>, <span class="at">direction =</span> <span class="sc">-</span><span class="dv">1</span>, <span class="at">option =</span> <span class="st">"C"</span>) <span class="sc">+</span> </span>
<span id="cb9-12"><a href="#cb9-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Character Spell Usage"</span>,</span>
<span id="cb9-13"><a href="#cb9-13" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="cn">NULL</span>,</span>
<span id="cb9-14"><a href="#cb9-14" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Frequency of Spell Usage"</span>) <span class="sc">+</span></span>
<span id="cb9-15"><a href="#cb9-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb9-16"><a href="#cb9-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">angle =</span> <span class="dv">45</span>, <span class="at">hjust =</span> <span class="dv">1</span>),</span>
<span id="cb9-17"><a href="#cb9-17" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.text.x =</span> <span class="fu">element_text</span>(<span class="at">face =</span> <span class="st">"bold"</span>, <span class="at">hjust =</span> <span class="dv">0</span>, <span class="at">size =</span> <span class="dv">12</span>),</span>
<span id="cb9-18"><a href="#cb9-18" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.position =</span> <span class="st">"none"</span>,</span>
<span id="cb9-19"><a href="#cb9-19" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.border =</span> <span class="fu">element_rect</span>(<span class="at">fill =</span> <span class="cn">NA</span>, <span class="at">color =</span> <span class="st">"gray20"</span>)</span>
<span id="cb9-20"><a href="#cb9-20" aria-hidden="true" tabindex="-1"></a> ) <span class="sc">+</span> </span>
<span id="cb9-21"><a href="#cb9-21" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_x_continuous</span>(<span class="at">breaks =</span> <span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">2</span>,<span class="dv">4</span>,<span class="dv">6</span>,<span class="dv">8</span>,<span class="dv">10</span>), <span class="at">limits =</span> <span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">10</span>)) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the preceding code cell (unnamed-chunk-8); alt text mirrors the plot's title and facet layout. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-8-1.png" class="img-fluid" style="width:100.0%" alt="Faceted horizontal bar charts titled ‘Character Spell Usage’: one panel per spell showing how often each character uses it."></p>
</div>
</div>
<p>Here we can see the distribution of the different spells cast by the different characters. We can see how most of them are dominated by either <code>Harry Potter</code> or <code>Hermione Granger</code>. In the case of the ‘Unforgivable Curses’ (for non-fans, we are referring to spells that are forbidden in the Harry Potter world, such as <code>Avada Kedavra</code>), <code>Voldemort</code> and other dark wizards from the saga predominate. This serves as a great indicator of the relevance of these characters, as well as showing who the other influential characters in the plot are.</p>
<p>At this point, we have already found out which spells are used the most and which characters speak the most. The next step is to observe which words are repeated the most and calculate their frequency.</p>
</section>
<section id="most-used-words" class="level3">
<h3 class="anchored" data-anchor-id="most-used-words">Most used Words</h3>
<p>The first step is to check which film is the most scripted or the longest. That is to say, we will assume that the films that have the most dialogue are those that offer us the most minutes on the big screen.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(dplyr)</span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a>total_dialogs <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(film.name) <span class="sc">%>%</span></span>
<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarize</span>(<span class="at">total_dialogs =</span> <span class="fu">n</span>()) <span class="co"># Count the number of rows per group, which is equivalent to counting dialogues.</span></span>
<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a>total_dialogs</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 8 × 2
film.name total_dialogs
<chr> <int>
1 1-Philosopher's Stone 885
2 2-Chamber of Secrets 987
3 3-Prisoner of Azkaban 881
4 4-Goblet of Fire 741
5 5-Order of the Phoenix 1157
6 6-Half-Blood Prince 1070
7 7-Deathly Hallows Part 1 1012
8 8-Deathly Hallows Part 2 712</code></pre>
</div>
</div>
<p>Let’s represent it in a graph.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(total_dialogs, <span class="fu">aes</span>(<span class="at">y =</span> <span class="fu">reorder</span>(film.name, <span class="sc">-</span>total_dialogs), <span class="at">x =</span> total_dialogs, <span class="at">fill =</span> film.name)) <span class="sc">+</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>) <span class="sc">+</span></span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Total Dialogues by Harry Potter Movie"</span>,</span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="cn">NULL</span>,</span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Total Dialogues"</span>) <span class="sc">+</span></span>
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"none"</span>) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the preceding code cell (unnamed-chunk-10); alt text mirrors the plot's title and labels. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-10-1.png" class="img-fluid" width="672" alt="Horizontal bar chart titled ‘Total Dialogues by Harry Potter Movie’: total number of dialogues for each movie."></p>
</div>
</div>
<p>Looking at the results and comparing them with the length of the films, we can see that they do not match: the film with the most dialogue is <code>Harry Potter and the Order of the Phoenix</code>, even though it is the second shortest film of the saga. The running times of the films are listed below; more information is available in <a href="https://www.pottertalk.net/harry-potter-movie-lengths/">this overview of Harry Potter movie lengths</a>.</p>
<ul>
<li><p>Philosopher’s Stone = 152 minutes = 2 hours 32 minutes</p></li>
<li><p>Chamber of Secrets = 161 minutes = 2 hours 41 minutes</p></li>
<li><p>Prisoner of Azkaban = 142 minutes = 2 hours 22 minutes</p></li>
<li><p>Goblet of Fire = 157 minutes = 2 hours 37 minutes</p></li>
<li><p>Order of the Phoenix = 138 minutes = 2 hours 18 minutes</p></li>
<li><p>Half Blood Prince = 153 minutes = 2 hours 33 minutes</p></li>
<li><p>Deathly Hallows pt 1 = 146 minutes = 2 hours 26 minutes</p></li>
<li><p>Deathly Hallows pt 2 = 130 minutes = 2 hours 10 minutes</p></li>
</ul>
<p>There doesn’t seem to be that much of a relationship between the amount of dialogue and the number of minutes in the film. Let’s do the same, but counting words instead of dialogue lines.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>words <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">unnest_tokens</span>(word, dialog) <span class="sc">%>%</span></span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(movie, word, <span class="at">sort =</span> <span class="cn">TRUE</span>)</span>
<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a>movie_words <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a> <span class="co"># we tokenize as usual (as an exception we won't be filtering stopwords now)</span></span>
<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">unnest_tokens</span>(word, dialog) <span class="sc">%>%</span></span>
<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(movie, word, <span class="at">sort =</span> <span class="cn">TRUE</span>) <span class="sc">|></span> </span>
<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(movie) <span class="sc">%>%</span> </span>
<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarize</span>(<span class="at">total_words =</span> <span class="fu">sum</span>(n))</span>
<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a>movie_words</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 8 × 2
movie total_words
<chr> <int>
1 Harry Potter and the Chamber of Secrets 10814
2 Harry Potter and the Deathly Hallows Part 1 10212
3 Harry Potter and the Deathly Hallows Part 2 6504
4 Harry Potter and the Goblet of Fire 8230
5 Harry Potter and the Half-Blood Prince 11330
6 Harry Potter and the Order of the Phoenix 9672
7 Harry Potter and the Philosopher's Stone 9818
8 Harry Potter and the Prisoner of Azkaban 9720</code></pre>
</div>
<div class="sourceCode cell-code" id="cb15"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(movie_words, <span class="fu">aes</span>(<span class="at">x =</span> <span class="fu">reorder</span>(movie, <span class="sc">-</span>total_words), <span class="at">y =</span> total_words, <span class="at">fill =</span> movie)) <span class="sc">+</span></span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>) <span class="sc">+</span></span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">coord_flip</span>() <span class="sc">+</span> <span class="co"># Horizontal bars</span></span>
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Total Words by Harry Potter Movie"</span>,</span>
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Movie"</span>,</span>
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="st">"Total Words"</span>) <span class="sc">+</span></span>
<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"none"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-11-1.png" class="img-fluid" width="672" alt="Horizontal bar chart of the total number of words in each Harry Potter movie."></p>
</div>
</div>
<p>Again, we see that although the number of words is more closely related to the length of the film, the two do not coincide, so we can discard the hypothesis that the more dialogue a film has, the longer it is.</p>
<p>Once we have checked this, the next step is to look at the term frequency of each word, to see which words are the most representative of each film. This is calculated as the number of times the word is repeated in the film divided by the total number of words in the film.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb16"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a>movie_words <span class="ot"><-</span> words <span class="sc">%>%</span></span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">left_join</span>(movie_words, <span class="at">by =</span> <span class="st">"movie"</span>) <span class="sc">|></span> </span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a> <span class="co"># we add a column with the term frequency of each word within its movie</span></span>
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">term_frequency =</span> n<span class="sc">/</span>total_words)</span>
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a>movie_words</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 14,022 × 5
movie word n total_words term_frequency
<chr> <chr> <int> <int> <dbl>
1 Harry Potter and the Half-Blood Prince you 432 11330 0.0381
2 Harry Potter and the Half-Blood Prince i 409 11330 0.0361
3 Harry Potter and the Order of the Pho… you 382 9672 0.0395
4 Harry Potter and the Prisoner of Azka… you 382 9720 0.0393
5 Harry Potter and the Chamber of Secre… you 376 10814 0.0348
6 Harry Potter and the Deathly Hallows … you 356 10212 0.0349
7 Harry Potter and the Deathly Hallows … the 344 10212 0.0337
8 Harry Potter and the Philosopher's St… you 326 9818 0.0332
9 Harry Potter and the Chamber of Secre… the 320 10814 0.0296
10 Harry Potter and the Goblet of Fire you 315 8230 0.0383
# ℹ 14,012 more rows</code></pre>
</div>
<div class="sourceCode cell-code" id="cb18"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(movie_words, <span class="fu">aes</span>(<span class="at">x =</span> term_frequency)) <span class="sc">+</span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_histogram</span>(<span class="at">binwidth =</span> <span class="fl">0.0001</span>, <span class="at">fill =</span> <span class="st">"#0073C2FF"</span>, <span class="at">color =</span> <span class="st">"black"</span>) <span class="sc">+</span></span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">xlim</span>(<span class="cn">NA</span>, <span class="fl">0.009</span>) <span class="sc">+</span> <span class="co"># Limit the x-axis to focus on term frequencies up to 0.009</span></span>
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_y_continuous</span>(<span class="at">breaks =</span> <span class="fu">seq</span>(<span class="dv">0</span>, <span class="dv">7000</span>, <span class="at">by =</span> <span class="dv">500</span>)) <span class="sc">+</span> <span class="co"># Set the y-axis breaks</span></span>
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Distribution of Term Frequency Across All Movies"</span>,</span>
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Term Frequency (as a percentage of total words)"</span>,</span>
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="st">"Count"</span>) <span class="sc">+</span></span>
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">angle =</span> <span class="dv">45</span>, <span class="at">hjust =</span> <span class="dv">1</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-12-1.png" class="img-fluid" width="672" alt="Histogram of term frequency across all Harry Potter movies."></p>
</div>
</div>
<p>This graph is a histogram showing the distribution of the frequency of terms in dialogue across all the Harry Potter films. Each bar represents the number of terms (y-axis) that occur with a certain frequency (x-axis) within the total set of words in the films. It can be seen that a large number of words have a very low frequency, suggesting that most words are rarely repeated.</p>
<p>Now we are going to observe a frequency distribution but per film, looking at which film is richer in vocabulary.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb19"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(movie_words, <span class="fu">aes</span>(<span class="at">x =</span> term_frequency, <span class="at">fill =</span> movie)) <span class="sc">+</span></span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_histogram</span>(<span class="at">bins =</span> <span class="dv">30</span>, <span class="at">position =</span> <span class="st">"identity"</span>) <span class="sc">+</span> <span class="co"># No alpha is set, so the bars are drawn without transparency</span></span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_x_continuous</span>(<span class="at">limits =</span> <span class="fu">c</span>(<span class="cn">NA</span>, <span class="fl">0.0009</span>), <span class="at">labels =</span> scales<span class="sc">::</span><span class="fu">percent_format</span>(<span class="at">accuracy =</span> <span class="fl">0.01</span>)) <span class="sc">+</span></span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_manual</span>(<span class="at">values =</span> <span class="fu">c</span>(<span class="st">"Harry Potter and the Chamber of Secrets"</span> <span class="ot">=</span> <span class="st">"#1f77b4"</span>,</span>
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a> <span class="st">"Harry Potter and the Deathly Hallows Part 1"</span> <span class="ot">=</span> <span class="st">"#ff7f0e"</span>,</span>
<span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a> <span class="st">"Harry Potter and the Deathly Hallows Part 2"</span> <span class="ot">=</span> <span class="st">"#2ca02c"</span>,</span>
<span id="cb19-7"><a href="#cb19-7" aria-hidden="true" tabindex="-1"></a> <span class="st">"Harry Potter and the Goblet of Fire"</span> <span class="ot">=</span> <span class="st">"#d62728"</span>,</span>
<span id="cb19-8"><a href="#cb19-8" aria-hidden="true" tabindex="-1"></a> <span class="st">"Harry Potter and the Half-Blood Prince"</span> <span class="ot">=</span> <span class="st">"#9467bd"</span>,</span>
<span id="cb19-9"><a href="#cb19-9" aria-hidden="true" tabindex="-1"></a> <span class="st">"Harry Potter and the Order of the Phoenix"</span> <span class="ot">=</span> <span class="st">"#8c564b"</span>,</span>
<span id="cb19-10"><a href="#cb19-10" aria-hidden="true" tabindex="-1"></a> <span class="st">"Harry Potter and the Philosopher's Stone"</span> <span class="ot">=</span> <span class="st">"#e377c2"</span>,</span>
<span id="cb19-11"><a href="#cb19-11" aria-hidden="true" tabindex="-1"></a> <span class="st">"Harry Potter and the Prisoner of Azkaban"</span> <span class="ot">=</span> <span class="st">"#7f7f7f"</span>)) <span class="sc">+</span> <span class="co"># Add more colours as needed</span></span>
<span id="cb19-12"><a href="#cb19-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Distribution of Term Frequency Across Movies"</span>,</span>
<span id="cb19-13"><a href="#cb19-13" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Term Frequency (as a percentage of total words)"</span>,</span>
<span id="cb19-14"><a href="#cb19-14" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="st">"Count"</span>) <span class="sc">+</span></span>
<span id="cb19-15"><a href="#cb19-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb19-16"><a href="#cb19-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"right"</span>, <span class="at">legend.title =</span> <span class="fu">element_blank</span>())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-13-1.png" class="img-fluid" width="672" alt="Overlaid histograms of term frequency, coloured by movie."></p>
</div>
</div>
<p>We can observe that the film richest in vocabulary is <code>Harry Potter and the Half-Blood Prince</code>. However, the distribution of the different films is not clear.</p>
<div class="cell" data-fig.asp="0.65" data-preview="true">
<div class="sourceCode cell-code" id="cb20"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Visualization</span></span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(movie_words, <span class="fu">aes</span>(<span class="at">x =</span> term_frequency, <span class="at">fill =</span> movie)) <span class="sc">+</span></span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_histogram</span>(<span class="at">binwidth =</span> <span class="fl">0.0001</span>, <span class="at">color =</span> <span class="st">"white"</span>) <span class="sc">+</span> </span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_x_continuous</span>(<span class="at">limits =</span> <span class="fu">c</span>(<span class="cn">NA</span>, <span class="fl">0.003</span>), <span class="at">labels =</span> scales<span class="sc">::</span><span class="fu">percent_format</span>(<span class="at">accuracy =</span> <span class="fl">0.0001</span>)) <span class="sc">+</span> </span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">facet_wrap</span>(<span class="sc">~</span> movie, <span class="at">ncol =</span> <span class="dv">2</span>, <span class="at">scales =</span> <span class="st">"free_y"</span>) <span class="sc">+</span> </span>
<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_brewer</span>(<span class="at">palette =</span> <span class="st">"Set3"</span>) <span class="sc">+</span> </span>
<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Term Frequency Distribution by Movie"</span>,</span>
<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Term Frequency"</span>,</span>
<span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="st">"Count"</span>) <span class="sc">+</span></span>
<span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_light</span>() <span class="sc">+</span> <span class="co"># Apply a light theme</span></span>
<span id="cb20-11"><a href="#cb20-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"bottom"</span>, </span>
<span id="cb20-12"><a href="#cb20-12" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">angle =</span> <span class="dv">45</span>, <span class="at">hjust =</span> <span class="dv">1</span>), </span>
<span id="cb20-13"><a href="#cb20-13" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.background =</span> <span class="fu">element_rect</span>(<span class="at">fill =</span> <span class="st">"lightblue"</span>), </span>
<span id="cb20-14"><a href="#cb20-14" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.text.x =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">8</span>, <span class="at">color =</span> <span class="st">"navy"</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-14-1.png" class="img-fluid" style="width:100.0%" alt="Faceted histograms of term frequency, one panel per movie."></p>
</div>
</div>
<p>In general, we can observe that all the films present a similar distribution.</p>
</section>
<section id="tf-idf-1" class="level3">
<h3 class="anchored" data-anchor-id="tf-idf-1">TF-IDF</h3>
<p>The next step in our analysis is to use the term frequency-inverse document frequency (tf-idf) technique to highlight words that are distinctive in each film in the Harry Potter series. Tf-idf is useful because it helps us to identify not only the most frequent words, but also those that are particularly significant in a given document in relation to a collection of documents. This allows us to look beyond mere frequency and consider the relevance of a term, giving us a more nuanced view of how language is used in the different films.</p>
<p>By calculating the tf-idf of each term in the context of each film, we can filter and visualise the 20 most characteristic words per film, giving us a list of distinctive terms that define or are emblematic of each film.</p>
<div class="cell" data-fig.asp="0.65" data-preview="true">
<div class="sourceCode cell-code" id="cb21"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="co">#we create a new variable with the analysis</span></span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a>movie_tf_idf <span class="ot"><-</span> movie_words <span class="sc">%>%</span></span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">bind_tf_idf</span>(word, movie, n) <span class="sc">|></span> </span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(<span class="sc">-</span>total_words) <span class="sc">%>%</span></span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a> <span class="co">#we arrange by tf-idf in descending order</span></span>
<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(<span class="fu">desc</span>(tf_idf))</span>
<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-8"><a href="#cb21-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Visualization</span></span>
<span id="cb21-9"><a href="#cb21-9" aria-hidden="true" tabindex="-1"></a>movie_tf_idf <span class="sc">%>%</span></span>
<span id="cb21-10"><a href="#cb21-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(movie) <span class="sc">%>%</span></span>
<span id="cb21-11"><a href="#cb21-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice_max</span>(tf_idf, <span class="at">n =</span> <span class="dv">20</span>) <span class="sc">%>%</span></span>
<span id="cb21-12"><a href="#cb21-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">ungroup</span>() <span class="sc">%>%</span></span>
<span id="cb21-13"><a href="#cb21-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(tf_idf, <span class="fu">fct_reorder</span>(word, tf_idf), <span class="at">fill =</span> movie)) <span class="sc">+</span></span>
<span id="cb21-14"><a href="#cb21-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_col</span>(<span class="at">show.legend =</span> <span class="cn">FALSE</span>) <span class="sc">+</span></span>
<span id="cb21-15"><a href="#cb21-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">facet_wrap</span>(<span class="sc">~</span> movie, <span class="at">ncol =</span> <span class="dv">2</span>, <span class="at">scales =</span> <span class="st">"free"</span>) <span class="sc">+</span></span>
<span id="cb21-16"><a href="#cb21-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_brewer</span>(<span class="at">palette =</span> <span class="st">"Set2"</span>) <span class="sc">+</span> </span>
<span id="cb21-17"><a href="#cb21-17" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">"TF-IDF"</span>, <span class="at">y =</span> <span class="st">"Words"</span>) <span class="sc">+</span> </span>
<span id="cb21-18"><a href="#cb21-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span> </span>
<span id="cb21-19"><a href="#cb21-19" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">axis.text.y =</span> <span class="fu">element_text</span>(<span class="at">angle =</span> <span class="dv">0</span>), </span>
<span id="cb21-20"><a href="#cb21-20" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.background =</span> <span class="fu">element_rect</span>(<span class="at">fill =</span> <span class="st">"lightblue"</span>), </span>
<span id="cb21-21"><a href="#cb21-21" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.text.x =</span> <span class="fu">element_text</span>(<span class="at">size =</span> <span class="dv">10</span>, <span class="at">color =</span> <span class="st">"navy"</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-15-1.png" class="img-fluid" style="width:100.0%" alt="Bar charts of the words with the highest TF-IDF in each movie, one panel per movie."></p>
</div>
</div>
<p>This analysis is quite representative and is of great help in identifying very relevant factors in the plot. Moreover, when we look at the results, they are quite faithful to reality, because we as fans can state that in most cases, the words with the highest tf-idf are very important in the film.</p>
<p>For example, in <code>Harry Potter and the Deathly Hallows Part 1</code> the most distinctive word is Dobby; without going into spoilers, this character has one of the most important and moving scenes of the whole saga in this film. On the other hand, if we look at the fifth instalment of the saga (<code>Harry Potter and the Order of the Phoenix</code>), the most distinctive word is <code>prophecy</code>, which makes sense because the whole film revolves around a very important prophecy in the story. Likewise, in the third instalment, <code>Harry Potter and the Prisoner of Azkaban</code>, the most important words are <code>Pettigrew</code> and <code>dementors</code>, both of which are quite relevant to this film. If the person reading this is a fan, he or she will agree with the results.</p>
<p>Finally, at the beginning of the analysis we were surprised by the sequences of dialogue that <code>Horace Slughorn</code> had, as he only appears in 3 of the 8 films, but apparently this character is very distinctive in the sixth film, with his name and surname being the most important words in the whole film.</p>
</section>
</section>
<section id="sentiment-analysis" class="level1" data-number="2">
<h1 data-number="2"><span class="header-section-number">2</span> SENTIMENT ANALYSIS</h1>
<p>We will perform sentiment analysis on all the dialogues using the <code>Bing</code> and <code>AFINN</code> lexicons. The Bing lexicon labels each word in its dictionary as either <code>positive</code> or <code>negative</code>. The AFINN lexicon instead assigns each word a numerical score in the range [-5, +5], which will be more useful for quantifying the mood of a dialogue, a scene or a large portion of a movie.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb22"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Bing dataset </span></span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a>bing_sentiment <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">unnest_tokens</span>(<span class="at">output =</span> word, <span class="at">input =</span> dialog) <span class="sc">%>%</span></span>
<span id="cb22-4"><a href="#cb22-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">inner_join</span>(<span class="fu">get_sentiments</span>(<span class="st">"bing"</span>), <span class="st">"word"</span>)</span>
<span id="cb22-5"><a href="#cb22-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-6"><a href="#cb22-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Afinn dataset </span></span>
<span id="cb22-7"><a href="#cb22-7" aria-hidden="true" tabindex="-1"></a>AFINN_sentiments <span class="ot"><-</span> df <span class="sc">|></span></span>
<span id="cb22-8"><a href="#cb22-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">linenumber =</span> <span class="fu">row_number</span>(), <span class="at">.by =</span> film.name) <span class="sc">%>%</span> <span class="co"># adds a line number to each dialogue, starting at one for each movie. </span></span>
<span id="cb22-9"><a href="#cb22-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">unnest_tokens</span>(<span class="at">output =</span> word, <span class="at">input =</span> dialog) <span class="sc">%>%</span></span>
<span id="cb22-10"><a href="#cb22-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">inner_join</span>(<span class="fu">get_sentiments</span>(<span class="st">"afinn"</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<section id="sentiment-by-chunks-of-dialogue" class="level3">
<h3 class="anchored" data-anchor-id="sentiment-by-chunks-of-dialogue">Sentiment by chunks of dialogue</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a>AFINN_sentiments <span class="sc">|></span> </span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(film.name, <span class="at">index =</span> linenumber <span class="sc">%/%</span> <span class="dv">100</span>) <span class="sc">%>%</span> <span class="co"># split movie dialogue into chunks of lines. </span></span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">sentiment =</span> <span class="fu">sum</span>(value)) <span class="sc">|></span> <span class="co"># sum up the sentiment </span></span>
<span id="cb23-4"><a href="#cb23-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(index, sentiment, <span class="at">fill =</span> sentiment, <span class="at">label =</span> sentiment)) <span class="sc">+</span></span>
<span id="cb23-5"><a href="#cb23-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_col</span>(<span class="at">show.legend =</span> <span class="cn">FALSE</span>) <span class="sc">+</span></span>
<span id="cb23-6"><a href="#cb23-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_text</span>( ) <span class="sc">+</span></span>
<span id="cb23-7"><a href="#cb23-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_gradient2</span>(<span class="at">high =</span> <span class="st">"darkgreen"</span>, </span>
<span id="cb23-8"><a href="#cb23-8" aria-hidden="true" tabindex="-1"></a> <span class="at">midpoint =</span> <span class="dv">0</span>,</span>
<span id="cb23-9"><a href="#cb23-9" aria-hidden="true" tabindex="-1"></a> <span class="at">low =</span> <span class="st">"red3"</span>)<span class="sc">+</span></span>
<span id="cb23-10"><a href="#cb23-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">facet_wrap</span>(<span class="sc">~</span>film.name , <span class="at">ncol =</span> <span class="dv">2</span>, <span class="at">scales =</span> <span class="st">"free_x"</span>) <span class="sc">+</span></span>
<span id="cb23-11"><a href="#cb23-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb23-12"><a href="#cb23-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(</span>
<span id="cb23-13"><a href="#cb23-13" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.grid.major.x =</span> <span class="fu">element_blank</span>(),</span>
<span id="cb23-14"><a href="#cb23-14" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.grid.minor =</span> <span class="fu">element_blank</span>(),</span>
<span id="cb23-15"><a href="#cb23-15" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.text =</span> <span class="fu">element_blank</span>(), </span>
<span id="cb23-16"><a href="#cb23-16" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.text =</span> <span class="fu">element_text</span>(<span class="at">face =</span> <span class="st">"bold"</span>, <span class="at">hjust =</span> <span class="dv">0</span>),</span>
<span id="cb23-17"><a href="#cb23-17" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.border =</span> <span class="fu">element_rect</span>(<span class="at">fill =</span> <span class="cn">NA</span>, <span class="at">color =</span> <span class="st">"gray20"</span>)</span>
<span id="cb23-18"><a href="#cb23-18" aria-hidden="true" tabindex="-1"></a> ) <span class="sc">+</span></span>
<span id="cb23-19"><a href="#cb23-19" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(</span>
<span id="cb23-20"><a href="#cb23-20" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Each chunk split up by 100 lines"</span>,</span>
<span id="cb23-21"><a href="#cb23-21" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"AFINN method"</span></span>
<span id="cb23-22"><a href="#cb23-22" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="final-project_v2_files/figure-html/plot-AFINN-100-1.png" class="img-fluid" width="1056" alt="Bar charts of summed AFINN sentiment per 100-line chunk of dialogue, one panel per movie."></p>
</div>
</div>
<p>When splitting up the dialogue into 100-line chunks, you can see which movies have the most positive and negative sentiment. The 6th movie appears to be the most positive. One chunk in the early middle part of the movie has the highest rating of all. The most negative chunk of dialogue is at the middle of the 2nd movie. The 7th movie appears to be neutral throughout and the 8th movie is mostly negative. The first two movies end very positively, unlike any of the others.</p>
<p>Now let’s get more detailed.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb24"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a>AFINN_sentiments <span class="sc">|></span> </span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(film.name, <span class="at">index =</span> linenumber <span class="sc">%/%</span> <span class="dv">20</span>) <span class="sc">%>%</span> </span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">sentiment =</span> <span class="fu">sum</span>(value)) <span class="sc">|></span> </span>
<span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(index, sentiment, <span class="at">fill =</span> sentiment)) <span class="sc">+</span></span>
<span id="cb24-5"><a href="#cb24-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_col</span>(<span class="at">show.legend =</span> <span class="cn">FALSE</span>) <span class="sc">+</span></span>
<span id="cb24-6"><a href="#cb24-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_gradient2</span>(<span class="at">high =</span> <span class="st">"darkgreen"</span>, <span class="at">midpoint =</span> <span class="dv">0</span>,</span>
<span id="cb24-7"><a href="#cb24-7" aria-hidden="true" tabindex="-1"></a> <span class="at">low =</span> <span class="st">"red3"</span>) <span class="sc">+</span></span>
<span id="cb24-8"><a href="#cb24-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">facet_wrap</span>(<span class="sc">~</span>film.name , <span class="at">ncol =</span> <span class="dv">2</span>, <span class="at">scales =</span> <span class="st">"free_x"</span>) <span class="sc">+</span></span>
<span id="cb24-9"><a href="#cb24-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb24-10"><a href="#cb24-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(</span>
<span id="cb24-11"><a href="#cb24-11" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.grid.major.x =</span> <span class="fu">element_blank</span>(),</span>
<span id="cb24-12"><a href="#cb24-12" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.grid.minor =</span> <span class="fu">element_blank</span>(),</span>
<span id="cb24-13"><a href="#cb24-13" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.text.x =</span> <span class="fu">element_blank</span>(), </span>
<span id="cb24-14"><a href="#cb24-14" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.text =</span> <span class="fu">element_text</span>(<span class="at">face =</span> <span class="st">"bold"</span>, <span class="at">hjust =</span> <span class="dv">0</span>),</span>
<span id="cb24-15"><a href="#cb24-15" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.border =</span> <span class="fu">element_rect</span>(<span class="at">fill =</span> <span class="cn">NA</span>, <span class="at">color =</span> <span class="st">"gray20"</span>)</span>
<span id="cb24-16"><a href="#cb24-16" aria-hidden="true" tabindex="-1"></a> ) <span class="sc">+</span></span>
<span id="cb24-17"><a href="#cb24-17" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(</span>
<span id="cb24-18"><a href="#cb24-18" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Each chunk split up by 20 lines"</span>,</span>
<span id="cb24-19"><a href="#cb24-19" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"AFINN method"</span></span>
<span id="cb24-20"><a href="#cb24-20" aria-hidden="true" tabindex="-1"></a> ) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
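<p><img src="final-project_v2_files/figure-html/plot-AFINN-8-1.png" alt="Bar charts of summed AFINN sentiment for each 20-line chunk of dialogue, one panel per film, shaded green for positive and red for negative" class="img-fluid" width="1056"></p>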
</div>
</div>
<p>Now we can see more detailed changes in sentiment with 20-line chunks. The 6th movie is still positive, especially in the middle, but now we see a few negative chunks of dialogue between the positive ones. It is not <em>all</em> positive. We see that the 7th movie has both positive and negative dialogues, but they cancel each other out when grouped into large chunks. The most positive sentiment happens at the end of the 1st movie. The 3rd movie starts very positive, as we can see from the 100-line chunks, but now we see that there are some really negative dialogues in the second half of the movie. The most negative dialogues appear to be in the 3rd film.</p>
<p>Now, let’s see about grouping it by chapter:</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb25"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a>AFINN_sentiments <span class="sc">|></span> </span>
<span id="cb25-2"><a href="#cb25-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(film.name, chapter) <span class="sc">%>%</span> <span class="co"># group by chapter </span></span>
<span id="cb25-3"><a href="#cb25-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">sentiment =</span> <span class="fu">sum</span>(value), <span class="at">index =</span> <span class="fu">min</span>(linenumber)) <span class="sc">|></span></span>
<span id="cb25-4"><a href="#cb25-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(index) <span class="sc">|></span> </span>
<span id="cb25-5"><a href="#cb25-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="fu">reorder</span>(chapter, index), </span>
<span id="cb25-6"><a href="#cb25-6" aria-hidden="true" tabindex="-1"></a> sentiment, <span class="at">fill =</span> sentiment)) <span class="sc">+</span></span>
<span id="cb25-7"><a href="#cb25-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_col</span>(<span class="at">show.legend =</span> <span class="cn">FALSE</span>) <span class="sc">+</span></span>
<span id="cb25-8"><a href="#cb25-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_gradient2</span>(<span class="at">high =</span> <span class="st">"darkgreen"</span>, <span class="at">midpoint =</span> <span class="dv">0</span>,</span>
<span id="cb25-9"><a href="#cb25-9" aria-hidden="true" tabindex="-1"></a> <span class="at">low =</span> <span class="st">"red3"</span>)<span class="sc">+</span></span>
<span id="cb25-10"><a href="#cb25-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">facet_wrap</span>(<span class="sc">~</span>film.name , <span class="at">ncol =</span> <span class="dv">2</span>, <span class="at">scales =</span> <span class="st">"free_x"</span>) <span class="sc">+</span></span>
<span id="cb25-11"><a href="#cb25-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb25-12"><a href="#cb25-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(</span>
<span id="cb25-13"><a href="#cb25-13" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.grid.major.x =</span> <span class="fu">element_blank</span>(),</span>
<span id="cb25-14"><a href="#cb25-14" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.grid.minor =</span> <span class="fu">element_blank</span>(),</span>
<span id="cb25-15"><a href="#cb25-15" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.text.x =</span> <span class="fu">element_blank</span>(), </span>
<span id="cb25-16"><a href="#cb25-16" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.text =</span> <span class="fu">element_text</span>(<span class="at">face =</span> <span class="st">"bold"</span>, <span class="at">hjust =</span> <span class="dv">0</span>),</span>
<span id="cb25-17"><a href="#cb25-17" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.border =</span> <span class="fu">element_rect</span>(<span class="at">fill =</span> <span class="cn">NA</span>, <span class="at">color =</span> <span class="st">"gray20"</span>)</span>
<span id="cb25-18"><a href="#cb25-18" aria-hidden="true" tabindex="-1"></a> )<span class="sc">+</span></span>
<span id="cb25-19"><a href="#cb25-19" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(</span>
<span id="cb25-20"><a href="#cb25-20" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Each chunk split up by chapter"</span>,</span>
<span id="cb25-21"><a href="#cb25-21" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"AFINN method"</span></span>
<span id="cb25-22"><a href="#cb25-22" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
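<p><img src="final-project_v2_files/figure-html/plot-AFINN-chapter-1.png" alt="Bar charts of summed AFINN sentiment for each chapter, one panel per film, shaded green for positive and red for negative" class="img-fluid" width="1056"></p>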
</div>
</div>
<p>The most positive chapter is in the 6th movie, with a close second in the 3rd. The most negative chapter is in the 2nd. Now we can see one very positive chapter towards the end of the 8th movie. We also see that the negative detailed chunks in the middle of the 6th movie are gone: at chapter level, none of the chapters in the middle of that movie is negative.</p>
<p>The three methods of looking at sentiment analysis show that we will get different results from different methods of grouping text. The first method, large chunks of dialogue, gives an overall feeling for long portions of the movie, but hides the mood of a specific scene. The second method, small chunks of dialogue, shows you the detailed view of sentiment which is cancelled out in the first method. The last method, grouping dialogue by chapter, provides a more natural analysis of the flow of each movie. The writers and director meant for each chapter to have a certain feeling, which can be lost when cutting a movie into chunks of dialogue arbitrarily.</p>
</section>
<section id="bigrams" class="level2" data-number="2.1">
<h2 data-number="2.1" class="anchored" data-anchor-id="bigrams"><span class="header-section-number">2.1</span> Bigrams</h2>
<p>Bigrams are consecutive words in the script, which can give us more context than looking at single words. We would expect to see lots of names (like Harry Potter) and common 2-word phrases. But many 2-word phrases are not helpful for text analysis so we will take out all stopwords.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb26"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a>df_bigrams <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a> <span class="co">#we take the dialogue in df, and tokenize it to sequences of 2 words</span></span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">unnest_tokens</span>(bigram, dialog, <span class="at">token =</span> <span class="st">"ngrams"</span>, <span class="at">n =</span> <span class="dv">2</span>) <span class="sc">%>%</span></span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">drop_na</span>(bigram)</span>
<span id="cb26-5"><a href="#cb26-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-6"><a href="#cb26-6" aria-hidden="true" tabindex="-1"></a><span class="co"># all bigrams, before taking out stopwords</span></span>
<span id="cb26-7"><a href="#cb26-7" aria-hidden="true" tabindex="-1"></a>df_bigrams <span class="sc">%>%</span></span>
<span id="cb26-8"><a href="#cb26-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(bigram, <span class="at">sort =</span> <span class="cn">TRUE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 35,946 × 2
bigram n
<chr> <int>
1 in the 191
2 are you 152
3 i don't 150
4 you know 148
5 of the 145
6 do you 144
7 i think 125
8 to the 120
9 come on 118
10 to be 114
# ℹ 35,936 more rows</code></pre>
</div>
</div>
<p>Notice how the most popular bigrams are “in the” and “are you,” which do not provide valuable information about the topic or sentiment. Let’s remove the stopwords and see what we get.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb28"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a>bigrams_separated <span class="ot"><-</span> df_bigrams <span class="sc">|></span> </span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a> <span class="co"># separate each bigram in two columns, word1 and word2</span></span>
<span id="cb28-3"><a href="#cb28-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">separate</span>(bigram, <span class="fu">c</span>(<span class="st">"word1"</span>, <span class="st">"word2"</span>), <span class="at">sep =</span> <span class="st">" "</span>)</span>
<span id="cb28-4"><a href="#cb28-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-5"><a href="#cb28-5" aria-hidden="true" tabindex="-1"></a>bigrams_filtered <span class="ot"><-</span> bigrams_separated <span class="sc">%>%</span></span>
<span id="cb28-6"><a href="#cb28-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(<span class="sc">!</span>word1 <span class="sc">%in%</span> stop_words<span class="sc">$</span>word) <span class="sc">%>%</span></span>
<span id="cb28-7"><a href="#cb28-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(<span class="sc">!</span>word2 <span class="sc">%in%</span> stop_words<span class="sc">$</span>word)</span>
<span id="cb28-8"><a href="#cb28-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-9"><a href="#cb28-9" aria-hidden="true" tabindex="-1"></a>bigrams_filtered <span class="sc">%>%</span> </span>
<span id="cb28-10"><a href="#cb28-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(word1, word2, <span class="at">sort =</span> <span class="cn">TRUE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 4,525 × 3
word1 word2 n
<chr> <chr> <int>
1 harry potter 95
2 ha ha 34
3 dark lord 27
4 bloody hell 24
5 professor dumbledore 23
6 sirius black 22
7 professor snape 20
8 miss granger 18
9 harry harry 17
10 dark arts 16
# ℹ 4,515 more rows</code></pre>
</div>
<div class="sourceCode cell-code" id="cb30"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a>bigrams_united <span class="ot"><-</span> bigrams_filtered <span class="sc">%>%</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">unite</span>(bigram, word1, word2, <span class="at">sep =</span> <span class="st">" "</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>After removing all stopwords from the list of bigrams, we see a lot of names: Harry Potter, Professor Dumbledore, Sirius Black, and so on. The second most used bigram is “ha ha,” which I assume is referring to someone laughing. Also in the top ten is “bloody hell,” which is an exclamation and “dark arts,” which is part of the name of a class at Hogwarts.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb31"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a>bigrams_united <span class="sc">%>%</span> </span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(bigram, <span class="at">sort =</span> T) <span class="sc">%>%</span> </span>
<span id="cb31-3"><a href="#cb31-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice_max</span>(n, <span class="at">n=</span><span class="dv">15</span>) <span class="sc">%>%</span></span>
<span id="cb31-4"><a href="#cb31-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">y =</span> <span class="fu">reorder</span>(bigram, n), <span class="at">x =</span> n))<span class="sc">+</span></span>
<span id="cb31-5"><a href="#cb31-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">width =</span> <span class="fl">0.65</span>, <span class="at">fill =</span> <span class="st">"peru"</span>, <span class="at">alpha =</span> <span class="dv">1</span>)<span class="sc">+</span></span>
<span id="cb31-6"><a href="#cb31-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Most popular bigrams in the movies"</span>,</span>
<span id="cb31-7"><a href="#cb31-7" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Top 15, after removing stopwords"</span>,</span>
<span id="cb31-8"><a href="#cb31-8" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="cn">NULL</span>, <span class="at">x =</span> <span class="st">"Frequency"</span>)<span class="sc">+</span></span>
<span id="cb31-9"><a href="#cb31-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
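<p><img src="final-project_v2_files/figure-html/bigram-1.png" alt="Horizontal bar chart of the top 15 most frequent bigrams in the movies after removing stopwords" class="img-fluid" width="672"></p>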
</div>
</div>
<section id="negation-words" class="level3">
<h3 class="anchored" data-anchor-id="negation-words">Negation words</h3>
<p>There is a major problem with simply looking at sentiment for each word by itself: <em>context</em>. By itself, the word “funny” is very positive. The AFINN lexicon gives it a score of <strong>+4</strong>. But in the context of a dialogue, “not funny” is not positive and could be considered negative! Let’s see how bigrams that include negation words, like <em>not</em>, <em>no</em>, and <em>never</em>, affect the sentiment of the Harry Potter films.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb32"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1" aria-hidden="true" tabindex="-1"></a>negation_words <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"not"</span>, <span class="st">"no"</span>, <span class="st">"never"</span>, <span class="st">"without"</span>, <span class="st">"neither"</span>, <span class="st">"nor"</span>)</span>
<span id="cb32-2"><a href="#cb32-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-3"><a href="#cb32-3" aria-hidden="true" tabindex="-1"></a>not_words <span class="ot"><-</span> bigrams_separated <span class="sc">%>%</span></span>
<span id="cb32-4"><a href="#cb32-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(word1 <span class="sc">%in%</span> negation_words) <span class="sc">%>%</span></span>
<span id="cb32-5"><a href="#cb32-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">inner_join</span>(<span class="fu">get_sentiments</span>(<span class="st">"afinn"</span>), <span class="at">by =</span> <span class="fu">c</span>(<span class="at">word2 =</span> <span class="st">"word"</span>)) <span class="sc">%>%</span></span>
<span id="cb32-6"><a href="#cb32-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(word1, word2, value, <span class="at">sort =</span> <span class="cn">TRUE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb33"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a>not_words <span class="sc">%>%</span></span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">contribution =</span> n <span class="sc">*</span> value, <span class="co"># multiply the number of appearances with the value that AFINN assigns to the word to get its overall contribution to the movies. </span></span>
<span id="cb33-3"><a href="#cb33-3" aria-hidden="true" tabindex="-1"></a> <span class="at">sign =</span> <span class="fu">if_else</span>(value <span class="sc">></span> <span class="dv">0</span>, <span class="st">"postive"</span>, <span class="st">"negative"</span>)) <span class="sc">%>%</span></span>
<span id="cb33-4"><a href="#cb33-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(word1) <span class="sc">%>%</span> </span>
<span id="cb33-5"><a href="#cb33-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice_max</span>(<span class="fu">abs</span>(contribution), <span class="at">n=</span><span class="dv">10</span>) <span class="sc">%>%</span> <span class="co"># get the top 10 bigrams for each negation word. </span></span>
<span id="cb33-6"><a href="#cb33-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">ungroup</span>() <span class="sc">%>%</span></span>
<span id="cb33-7"><a href="#cb33-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">y =</span> <span class="fu">reorder_within</span>(word2, contribution, word1), </span>
<span id="cb33-8"><a href="#cb33-8" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> contribution, </span>
<span id="cb33-9"><a href="#cb33-9" aria-hidden="true" tabindex="-1"></a> <span class="at">fill =</span> contribution,</span>
<span id="cb33-10"><a href="#cb33-10" aria-hidden="true" tabindex="-1"></a> <span class="at">label =</span> contribution)) <span class="sc">+</span></span>
<span id="cb33-11"><a href="#cb33-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_col</span>() <span class="sc">+</span> </span>
<span id="cb33-12"><a href="#cb33-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_text</span>() <span class="sc">+</span></span>
<span id="cb33-13"><a href="#cb33-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_vline</span>(<span class="at">xintercept =</span> <span class="dv">0</span>, <span class="at">alpha =</span> .<span class="dv">3</span>) <span class="sc">+</span></span>
<span id="cb33-14"><a href="#cb33-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_y_reordered</span>() <span class="sc">+</span> </span>
<span id="cb33-15"><a href="#cb33-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">facet_wrap</span>(<span class="sc">~</span> word1, <span class="at">scales =</span> <span class="st">"free_y"</span>, <span class="at">ncol =</span> <span class="dv">2</span>) <span class="sc">+</span> </span>
<span id="cb33-16"><a href="#cb33-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">y =</span> <span class="st">'Words preceeded by a negation'</span>,</span>
<span id="cb33-17"><a href="#cb33-17" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Contribution (Sentiment value * number of mentions)"</span>,</span>
<span id="cb33-18"><a href="#cb33-18" aria-hidden="true" tabindex="-1"></a> <span class="at">title =</span> <span class="st">"Most common pos or neg words to follow negations"</span>) <span class="sc">+</span></span>
<span id="cb33-19"><a href="#cb33-19" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_gradient2</span>(<span class="at">high =</span> <span class="st">"darkgreen"</span>, </span>
<span id="cb33-20"><a href="#cb33-20" aria-hidden="true" tabindex="-1"></a> <span class="at">midpoint =</span> <span class="dv">0</span>,</span>
<span id="cb33-21"><a href="#cb33-21" aria-hidden="true" tabindex="-1"></a> <span class="at">low =</span> <span class="st">"red3"</span>)<span class="sc">+</span></span>
<span id="cb33-22"><a href="#cb33-22" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb33-23"><a href="#cb33-23" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(</span>
<span id="cb33-24"><a href="#cb33-24" aria-hidden="true" tabindex="-1"></a> <span class="at">strip.text =</span> <span class="fu">element_text</span>(<span class="at">face =</span> <span class="st">"bold"</span>, <span class="at">hjust =</span> <span class="dv">0</span>, <span class="at">size =</span> <span class="dv">12</span>),</span>
<span id="cb33-25"><a href="#cb33-25" aria-hidden="true" tabindex="-1"></a> <span class="at">panel.border =</span> <span class="fu">element_rect</span>(<span class="at">fill =</span> <span class="cn">NA</span>, <span class="at">color =</span> <span class="st">"gray20"</span>),</span>
<span id="cb33-26"><a href="#cb33-26" aria-hidden="true" tabindex="-1"></a> <span class="at">legend.position =</span> <span class="st">"none"</span></span>
<span id="cb33-27"><a href="#cb33-27" aria-hidden="true" tabindex="-1"></a> ) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
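<p><img src="final-project_v2_files/figure-html/not-word-plot-1.png" alt="Bar charts of the AFINN sentiment contribution of the words that most often follow each negation word, one panel per negation word" class="img-fluid" width="1056"></p>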
</div>
</div>
<p>Now we see a different picture of the sentiment analysis performed before with single words. The negation bigram that provided the largest absolute contribution was “no no,” but I think it could still be considered negative in a dialogue. We see that “not good” falsely contributed 21 points in the positive direction.</p>
<p>Excluding “no no,” the net contribution of the negation words is <strong>7</strong>, but it should have been in the negative direction.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb34"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb34-1"><a href="#cb34-1" aria-hidden="true" tabindex="-1"></a>not_words <span class="sc">|></span> <span class="fu">filter</span>( <span class="fu">not</span>(word2<span class="sc">==</span><span class="st">"no"</span> <span class="sc">&</span> word1<span class="sc">==</span><span class="st">"no"</span>)) <span class="sc">|></span> </span>
<span id="cb34-2"><a href="#cb34-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">sum =</span> <span class="fu">sum</span>(value <span class="sc">*</span> n, <span class="at">na.rm =</span> T))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 1
sum
<dbl>
1 7</code></pre>
</div>
</div>
</section>
</section>
<section id="trigrams" class="level2" data-number="2.2">
<h2 data-number="2.2" class="anchored" data-anchor-id="trigrams"><span class="header-section-number">2.2</span> Trigrams</h2>
<p>Yet another form of tokenization is to look at three consecutive words. This provides even more information for topic modelling, term frequency and sentiment analysis. Once again, we must take out the stopwords.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb36"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" aria-hidden="true" tabindex="-1"></a>df_trigrams <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb36-2"><a href="#cb36-2" aria-hidden="true" tabindex="-1"></a> <span class="co">#we take the dialogue in df, and tokenize it to sequences of 3 words</span></span>
<span id="cb36-3"><a href="#cb36-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">unnest_tokens</span>(trigram, dialog, <span class="at">token =</span> <span class="st">"ngrams"</span>, <span class="at">n =</span> <span class="dv">3</span>) <span class="sc">%>%</span></span>
<span id="cb36-4"><a href="#cb36-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">drop_na</span>(trigram)</span>
<span id="cb36-5"><a href="#cb36-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-6"><a href="#cb36-6" aria-hidden="true" tabindex="-1"></a>trigrams_separated <span class="ot"><-</span> df_trigrams <span class="sc">|></span> </span>
<span id="cb36-7"><a href="#cb36-7" aria-hidden="true" tabindex="-1"></a> <span class="co"># separate each trigram in three columns, word1 and word2 and word3</span></span>
<span id="cb36-8"><a href="#cb36-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">separate</span>(trigram, <span class="fu">c</span>(<span class="st">"word1"</span>, <span class="st">"word2"</span>, <span class="st">"word3"</span>), <span class="at">sep =</span> <span class="st">" "</span>)</span>
<span id="cb36-9"><a href="#cb36-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-10"><a href="#cb36-10" aria-hidden="true" tabindex="-1"></a>trigrams_filtered <span class="ot"><-</span> trigrams_separated <span class="sc">%>%</span></span>
<span id="cb36-11"><a href="#cb36-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(<span class="sc">!</span>word1 <span class="sc">%in%</span> stop_words<span class="sc">$</span>word) <span class="sc">%>%</span></span>
<span id="cb36-12"><a href="#cb36-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(<span class="sc">!</span>word2 <span class="sc">%in%</span> stop_words<span class="sc">$</span>word) <span class="sc">%>%</span></span>
<span id="cb36-13"><a href="#cb36-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(<span class="sc">!</span>word3 <span class="sc">%in%</span> stop_words<span class="sc">$</span>word)</span>
<span id="cb36-14"><a href="#cb36-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-15"><a href="#cb36-15" aria-hidden="true" tabindex="-1"></a>trigrams_filtered <span class="sc">%>%</span> </span>
<span id="cb36-16"><a href="#cb36-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(word1, word2, word3, <span class="at">sort =</span> <span class="cn">TRUE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1,338 × 4
word1 word2 word3 n
<chr> <chr> <chr> <int>
1 ha ha ha 19
2 tri wizard tournament 8
3 dark arts teacher 6
4 tom riddle's diary 6
5 half blood prince 5
6 lumos maxima lumos 4
7 maxima lumos maxima 4
8 professor dumbledore sir 4
9 educational decree twenty 3
10 happy christmas harry 3
# ℹ 1,328 more rows</code></pre>
</div>
<div class="sourceCode cell-code" id="cb38"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a>trigrams_united <span class="ot"><-</span> trigrams_filtered <span class="sc">%>%</span></span>
<span id="cb38-2"><a href="#cb38-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">unite</span>(trigram, word1, word2, word3, <span class="at">sep =</span> <span class="st">" "</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>We can see some familiar words in the most popular trigrams: “ha ha ha” and “dark arts teacher.” Not as many proper names, but we do have “Professor Dumbledore Sir” in the top ten.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb39"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb39-1"><a href="#cb39-1" aria-hidden="true" tabindex="-1"></a>trigrams_united <span class="sc">%>%</span> </span>
<span id="cb39-2"><a href="#cb39-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(trigram, <span class="at">sort =</span> T) <span class="sc">%>%</span> </span>
<span id="cb39-3"><a href="#cb39-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice_max</span>(n, <span class="at">n=</span><span class="dv">10</span>) <span class="sc">%>%</span></span>
<span id="cb39-4"><a href="#cb39-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">y =</span> <span class="fu">reorder</span>(trigram, n), <span class="at">x =</span> n))<span class="sc">+</span></span>
<span id="cb39-5"><a href="#cb39-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">width =</span> <span class="fl">0.65</span>, <span class="at">fill =</span> <span class="st">"peru"</span>, <span class="at">alpha =</span> <span class="dv">1</span>)<span class="sc">+</span></span>
<span id="cb39-6"><a href="#cb39-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Most popular trigrams in the movies"</span>,</span>
<span id="cb39-7"><a href="#cb39-7" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Top 15, after removing stopwords"</span>,</span>
<span id="cb39-8"><a href="#cb39-8" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="cn">NULL</span>, <span class="at">x =</span> <span class="st">"Frequency"</span>)<span class="sc">+</span></span>
<span id="cb39-9"><a href="#cb39-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the trigram plot above; alt text mirrors the plot's title, subtitle and axes. -->
<p><img src="final-project_v2_files/figure-html/trigram-1.png" class="img-fluid" width="672" alt="Horizontal bar chart of the most popular trigrams in the movies by frequency, after removing stopwords."></p>
</div>
</div>
<section id="using-bings-lexicon" class="level3">
<h3 class="anchored" data-anchor-id="using-bings-lexicon">Using Bing’s lexicon</h3>
<p>Let’s see how Bing’s lexicon describes the sentiment for this movie series.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb40"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" aria-hidden="true" tabindex="-1"></a>bing_sentiment <span class="sc">%>%</span> </span>
<span id="cb40-2"><a href="#cb40-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(word, sentiment) <span class="sc">%>%</span></span>
<span id="cb40-3"><a href="#cb40-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">count =</span> <span class="fu">n</span>()) <span class="sc">|></span> </span>
<span id="cb40-4"><a href="#cb40-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">ungroup</span>() <span class="sc">%>%</span></span>
<span id="cb40-5"><a href="#cb40-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(<span class="fu">desc</span>(count)) <span class="sc">%>%</span></span>
<span id="cb40-6"><a href="#cb40-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">20</span>) <span class="sc">%>%</span></span>
<span id="cb40-7"><a href="#cb40-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">y =</span> <span class="fu">reorder</span>(word, count), <span class="at">x =</span> count, <span class="at">fill =</span> sentiment))<span class="sc">+</span></span>
<span id="cb40-8"><a href="#cb40-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">width =</span> <span class="fl">0.62</span>)<span class="sc">+</span></span>
<span id="cb40-9"><a href="#cb40-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_manual</span>(<span class="at">values =</span> <span class="fu">c</span>(<span class="st">"darkred"</span>, <span class="st">"darkgreen"</span>)) <span class="sc">+</span></span>
<span id="cb40-10"><a href="#cb40-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Top 10 most popular words with assigned sentiment"</span>,</span>
<span id="cb40-11"><a href="#cb40-11" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Bing lexicon"</span>,</span>
<span id="cb40-12"><a href="#cb40-12" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="cn">NULL</span>, <span class="at">x =</span> <span class="st">"Frequency"</span>, <span class="at">fill =</span> <span class="st">"Sentiment"</span>)<span class="sc">+</span></span>
<span id="cb40-13"><a href="#cb40-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">guides</span>(<span class="at">fill =</span> <span class="fu">guide_legend</span>(<span class="at">reverse =</span> T))<span class="sc">+</span></span>
<span id="cb40-14"><a href="#cb40-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>()<span class="sc">+</span> </span>
<span id="cb40-15"><a href="#cb40-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"top"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the Bing word-frequency plot above; alt text summarises what the plot code draws. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-18-1.png" class="img-fluid" width="672" alt="Horizontal bar chart of the 20 most frequent words matched by the Bing lexicon, with bars colored by positive or negative sentiment."></p>
</div>
</div>
<p>The most common words in Bing’s dictionary that show up in the Harry Potter series are “well,” “right,” and “like.” The top four words are positive and the next three after that are negative. Just looking at this graph, it appears that the negative words are related to killing and death.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb41"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb41-1"><a href="#cb41-1" aria-hidden="true" tabindex="-1"></a>bing_sentiment <span class="sc">%>%</span> </span>
<span id="cb41-2"><a href="#cb41-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(film.name, num, sentiment) <span class="sc">%>%</span></span>
<span id="cb41-3"><a href="#cb41-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">count =</span> <span class="fu">n</span>()) <span class="sc">%>%</span></span>
<span id="cb41-4"><a href="#cb41-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">ungroup</span>() <span class="sc">|></span> </span>
<span id="cb41-5"><a href="#cb41-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">y =</span> <span class="fu">reorder</span>(film.name, <span class="sc">-</span>num ), <span class="at">x =</span> count, <span class="at">fill =</span> sentiment)) <span class="sc">+</span></span>
<span id="cb41-6"><a href="#cb41-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">position =</span> <span class="st">"fill"</span>, <span class="at">width =</span> <span class="fl">0.7</span>, <span class="at">alpha =</span> <span class="fl">0.9</span>)<span class="sc">+</span></span>
<span id="cb41-7"><a href="#cb41-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_vline</span>(<span class="at">xintercept =</span> <span class="fl">0.5</span>) <span class="sc">+</span></span>
<span id="cb41-8"><a href="#cb41-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_manual</span>(<span class="at">values =</span> <span class="fu">c</span>(<span class="st">"red4"</span>, <span class="st">"darkgreen"</span>))<span class="sc">+</span></span>
<span id="cb41-9"><a href="#cb41-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_x_continuous</span>(<span class="at">labels =</span> scales<span class="sc">::</span>percent)<span class="sc">+</span></span>
<span id="cb41-10"><a href="#cb41-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Share of words with positive and negative sentiment"</span>,</span>
<span id="cb41-11"><a href="#cb41-11" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Bing lexicon"</span>, <span class="at">fill =</span> <span class="st">"Sentiment"</span>,</span>
<span id="cb41-12"><a href="#cb41-12" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> <span class="st">"Percentage"</span>, <span class="at">y =</span> <span class="st">"Ratio"</span>)<span class="sc">+</span></span>
<span id="cb41-13"><a href="#cb41-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">guides</span>(<span class="at">fill =</span> <span class="fu">guide_legend</span>(<span class="at">reverse =</span> T))<span class="sc">+</span></span>
<span id="cb41-14"><a href="#cb41-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>()<span class="sc">+</span></span>
<span id="cb41-15"><a href="#cb41-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"top"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the per-film sentiment share plot above; alt text summarises what the plot code draws. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-19-1.png" class="img-fluid" width="672" alt="Stacked horizontal bar chart showing, for each film, the share of words with positive and negative sentiment according to the Bing lexicon, with a vertical reference line at 50%."></p>
</div>
</div>
<p>We can see that two films have a higher share of negative sentiment words: <em>Chamber of Secrets</em> and <em>Deathly Hallows Part 2</em>. Recall that <em>Chamber of Secrets</em> had the most negative chapter in all the movies, according to AFINN. The <em>Half-Blood Prince</em> has a higher share of positive words, which aligns with the AFINN lexicon. Despite some dark scenes in the Harry Potter series, most movies have a slightly higher share of positive sentiment overall.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb42"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a>bing_sentiment <span class="sc">%>%</span> </span>
<span id="cb42-2"><a href="#cb42-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">filter</span>(character <span class="sc">%in%</span> <span class="fu">c</span>(<span class="st">"Harry Potter"</span>, <span class="st">"Ron Weasley"</span>, <span class="st">"Hermione Granger"</span>, <span class="st">"Rubeus Hagrid"</span>, <span class="st">"Albus Dumbledore"</span>, <span class="st">"Remus Lupin"</span>, <span class="st">"Minerva McGonagall"</span>, <span class="st">"Draco Malfoy"</span>, <span class="st">"Severus Snape"</span>, <span class="st">"Lucius Malfoy"</span>, <span class="st">"Voldemort"</span>, <span class="st">"Tom Riddle"</span>, <span class="st">"Sirius Black"</span>, <span class="st">"Neville Longbottom"</span>)) <span class="sc">%>%</span></span>
<span id="cb42-3"><a href="#cb42-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(character, sentiment) <span class="sc">%>%</span></span>
<span id="cb42-4"><a href="#cb42-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">count =</span> <span class="fu">n</span>(), <span class="at">.groups =</span> <span class="st">'drop'</span>) <span class="sc">%>%</span></span>
<span id="cb42-5"><a href="#cb42-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">y=</span> <span class="fu">reorder</span>(character, count), <span class="at">x =</span> count, <span class="at">fill =</span> sentiment))<span class="sc">+</span></span>
<span id="cb42-6"><a href="#cb42-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>, <span class="at">position =</span> <span class="st">"fill"</span>, <span class="at">width =</span> <span class="fl">0.57</span>, <span class="at">alpha =</span> <span class="fl">0.9</span>) <span class="sc">+</span></span>
<span id="cb42-7"><a href="#cb42-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_fill_manual</span>(<span class="at">values =</span> <span class="fu">c</span>(<span class="st">"darkred"</span>, <span class="st">"darkgreen"</span>))<span class="sc">+</span></span>
<span id="cb42-8"><a href="#cb42-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_x_continuous</span>(<span class="at">labels =</span> scales<span class="sc">::</span>percent)<span class="sc">+</span></span>
<span id="cb42-9"><a href="#cb42-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_vline</span>(<span class="at">xintercept =</span> <span class="fl">0.5</span>)<span class="sc">+</span> </span>
<span id="cb42-10"><a href="#cb42-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">"Share of words with positive and negative sentiment"</span>,</span>
<span id="cb42-11"><a href="#cb42-11" aria-hidden="true" tabindex="-1"></a> <span class="at">subtitle =</span> <span class="st">"Bing lexicon, top 15 characters with the most sentences"</span>, <span class="at">fill =</span> <span class="st">"Sentiment"</span>,</span>
<span id="cb42-12"><a href="#cb42-12" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> <span class="cn">NULL</span>, <span class="at">x =</span> <span class="st">"Percentage"</span>)<span class="sc">+</span></span>
<span id="cb42-13"><a href="#cb42-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">guides</span>(<span class="at">fill =</span> <span class="fu">guide_legend</span>(<span class="at">reverse =</span> T))<span class="sc">+</span></span>
<span id="cb42-14"><a href="#cb42-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>()<span class="sc">+</span></span>
<span id="cb42-15"><a href="#cb42-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>( <span class="at">legend.position =</span> <span class="st">"top"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Rendered figure for the per-character sentiment share plot above; alt text summarises what the plot code draws. -->
<p><img src="final-project_v2_files/figure-html/unnamed-chunk-20-1.png" class="img-fluid" width="672" alt="Stacked horizontal bar chart showing, for each selected main character, the share of words with positive and negative sentiment according to the Bing lexicon, with a vertical reference line at 50%."></p>
</div>
</div>
<p>Of the characters with the most lines shown above, <code>Remus Lupin</code> and <code>Minerva McGonagall</code> have the highest share of positive sentiment. That makes sense given their inspirational impact on the plot. Not surprisingly, <code>Draco Malfoy</code> and his father <code>Lucius Malfoy</code> are the most negative characters. <code>Tom Riddle</code>, the character later known as <code>Voldemort</code>, has a higher share of positive words than his later self. This is interesting because it suggests how his personality changes as he grows up and becomes the villain.</p>
</section>
</section>
</section>
<section id="topic-modelling" class="level1" data-number="3">
<h1 data-number="3"><span class="header-section-number">3</span> TOPIC MODELLING</h1>
<p>In a saga with multiple films, such as Harry Potter, where characters, plots and motifs evolve over time, topic modelling can identify patterns and thematic shifts in the narrative. This can reveal how certain themes are introduced, how they develop over the course of the films, and which films focus more on certain aspects of the story or characters. The advantage is that, instead of looking at individual words as in TF-IDF analysis, we examine patterns of words that represent themes, which can provide a deeper understanding of the content and structure of the series’ narratives.</p>
<p>First, we prepare the dataset that we will work with throughout this section. Each document corresponds to one chapter of a movie, as defined in the script.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb43"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb43-1"><a href="#cb43-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(stringr)</span>
<span id="cb43-2"><a href="#cb43-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb43-3"><a href="#cb43-3" aria-hidden="true" tabindex="-1"></a><span class="co"># divide into documents, each representing one chapter</span></span>
<span id="cb43-4"><a href="#cb43-4" aria-hidden="true" tabindex="-1"></a>by_chapter <span class="ot"><-</span> df <span class="sc">%>%</span></span>
<span id="cb43-5"><a href="#cb43-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">unite</span>(document, movie, chapter)</span>
<span id="cb43-6"><a href="#cb43-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb43-7"><a href="#cb43-7" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenize</span></span>
<span id="cb43-8"><a href="#cb43-8" aria-hidden="true" tabindex="-1"></a>by_chapter_word <span class="ot"><-</span> by_chapter <span class="sc">%>%</span></span>
<span id="cb43-9"><a href="#cb43-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">unnest_tokens</span>(word, dialog)</span>
<span id="cb43-10"><a href="#cb43-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb43-11"><a href="#cb43-11" aria-hidden="true" tabindex="-1"></a><span class="co"># find document-word counts</span></span>
<span id="cb43-12"><a href="#cb43-12" aria-hidden="true" tabindex="-1"></a>word_counts <span class="ot"><-</span> by_chapter_word <span class="sc">%>%</span></span>
<span id="cb43-13"><a href="#cb43-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">anti_join</span>(stop_words) <span class="sc">%>%</span></span>
<span id="cb43-14"><a href="#cb43-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(document, word, <span class="at">sort =</span> <span class="cn">TRUE</span>)</span>
<span id="cb43-15"><a href="#cb43-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb43-16"><a href="#cb43-16" aria-hidden="true" tabindex="-1"></a>word_counts</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 18,314 × 3
document word n
<chr> <chr> <int>