In [6]:
from collections import Counter
from pathlib import Path

import pandas as pd
import spacy

In [3]:
# Load the small English pipeline with the named-entity recogniser disabled;
# the counting cells only read token.text, token.lemma_ and token.is_punct.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Raise the maximum text length accepted per call to 10 million characters,
# because each novel is passed to nlp() as one single string.
nlp.max_length = 10_000_000

# Septology: Jon Fosse

## Raw Word Counts

In [4]:
# Read the whole novel and collapse every run of whitespace (newlines, tabs,
# repeated spaces) into a single space, leaving one continuous line of text.
with open("./input/Septology.txt", encoding="utf-8") as f:
    septology = " ".join(f.read().split())

In [7]:
# Run the spaCy pipeline once over the full text; the resulting Doc is also
# used by the lemma counts later in the notebook.
septology_doc = nlp(septology)

# Keep the verbatim text of every non-punctuation token, then tally how often
# each distinct form occurs (the tally is case-sensitive).
septology_words = [tok.text for tok in septology_doc if not tok.is_punct]
septology_word_freq = Counter()
septology_word_freq.update(septology_words)

In [8]:
# Turn the word tally into a two-column table, most frequent first.
# Building from .items() with explicit column names also works for an empty
# tally, and kind="stable" keeps equally frequent words in the Counter's
# first-seen order so the exported CSV has a predictable tie order.
septology_df = (
    pd.DataFrame(list(septology_word_freq.items()), columns=["word", "count"])
    .sort_values("count", ascending=False, kind="stable")
    .reset_index(drop=True)
)

# Create the output directory if it is missing so the export does not fail
# on a fresh checkout.
Path("./output").mkdir(parents=True, exist_ok=True)
septology_df.to_csv("./output/septology_words.csv", index=False)

### Total Unique Words

In [26]:
# One row per distinct token text, so the row count is the vocabulary size.
len(septology_df)

6605

### Top 10 Unique Words

In [9]:
# Rows are ordered by descending count, so the first ten are the most frequent.
septology_df.iloc[:10]

Unnamed: 0,word,count
0,and,20021
1,the,11732
2,I,11148
3,he,8826
4,to,7271
5,that,6085
6,'s,5918
7,says,5698
8,it,5450
9,a,5283


### Number of Single-Use Words

In [10]:
# Number of words that occur exactly once in the text.
septology_df["count"].eq(1).sum()

2399

## Lemma Counts

In [11]:
# Same tally as the raw word counts, but keyed on each token's lemma instead
# of its verbatim text; punctuation tokens are still skipped.
septology_lemmas = [tok.lemma_ for tok in septology_doc if not tok.is_punct]
septology_lemma_freq = Counter()
septology_lemma_freq.update(septology_lemmas)

In [13]:
# Turn the lemma tally into a two-column table, most frequent first.
# Building from .items() with explicit column names also works for an empty
# tally, and kind="stable" keeps equally frequent lemmas in the Counter's
# first-seen order so the exported CSV has a predictable tie order.
septology_lemma_df = (
    pd.DataFrame(list(septology_lemma_freq.items()), columns=["lemma", "count"])
    .sort_values("count", ascending=False, kind="stable")
    .reset_index(drop=True)
)

# Create the output directory if it is missing so the export does not fail
# on a fresh checkout.
Path("./output").mkdir(parents=True, exist_ok=True)
septology_lemma_df.to_csv("./output/septology_lemmas.csv", index=False)

### Total Unique Lemmas

In [27]:
# One row per distinct lemma, so the row count is the number of unique lemmas.
len(septology_lemma_df)

4908

### Top 10 Unique Lemmas

In [14]:
# Rows are ordered by descending count, so the first ten are the most frequent.
septology_lemma_df.iloc[:10]

Unnamed: 0,lemma,count
0,and,20241
1,the,14221
2,be,13039
3,I,12121
4,he,10184
5,say,8002
6,to,7283
7,that,6174
8,it,5604
9,a,5317


### Number of Single-Use Lemmas

In [15]:
# Number of lemmas that occur exactly once in the text.
septology_lemma_df["count"].eq(1).sum()

1730

# East of Eden: John Steinbeck

## Raw Word Counts

In [16]:
# Read the whole novel and collapse every run of whitespace (newlines, tabs,
# repeated spaces) into a single space, leaving one continuous line of text.
with open("./input/East_of_Eden.txt", encoding="utf-8") as f:
    east = " ".join(f.read().split())

In [18]:
# Run the spaCy pipeline once over the full text; the resulting Doc is also
# used by the lemma counts later in the notebook.
east_doc = nlp(east)

# Keep the verbatim text of every non-punctuation token, then tally how often
# each distinct form occurs (the tally is case-sensitive).
east_words = [tok.text for tok in east_doc if not tok.is_punct]
east_word_freq = Counter()
east_word_freq.update(east_words)

In [19]:
# Turn the word tally into a two-column table, most frequent first.
# Building from .items() with explicit column names also works for an empty
# tally, and kind="stable" keeps equally frequent words in the Counter's
# first-seen order so the exported CSV has a predictable tie order.
east_df = (
    pd.DataFrame(list(east_word_freq.items()), columns=["word", "count"])
    .sort_values("count", ascending=False, kind="stable")
    .reset_index(drop=True)
)

# Create the output directory if it is missing so the export does not fail
# on a fresh checkout.
Path("./output").mkdir(parents=True, exist_ok=True)
east_df.to_csv("./output/east_words.csv", index=False)

### Total Unique Words

In [28]:
# One row per distinct token text, so the row count is the vocabulary size.
len(east_df)

12744

### Top 10 Unique Words

In [20]:
# Rows are ordered by descending count, so the first ten are the most frequent.
east_df.iloc[:10]

Unnamed: 0,word,count
0,the,9471
1,and,7601
2,to,5662
3,a,5577
4,I,5178
5,of,3929
6,you,3569
7,it,3267
8,was,3171
9,in,3158


### Number of Single-Use Words

In [21]:
# Number of words that occur exactly once in the text.
east_df["count"].eq(1).sum()

5654

## Lemma Counts

In [22]:
# Same tally as the raw word counts, but keyed on each token's lemma instead
# of its verbatim text; punctuation tokens are still skipped.
east_lemmas = [tok.lemma_ for tok in east_doc if not tok.is_punct]
east_lemma_freq = Counter()
east_lemma_freq.update(east_lemmas)

In [23]:
# Turn the lemma tally into a two-column table, most frequent first.
# Building from .items() with explicit column names also works for an empty
# tally, and kind="stable" keeps equally frequent lemmas in the Counter's
# first-seen order so the exported CSV has a predictable tie order.
east_lemma_df = (
    pd.DataFrame(list(east_lemma_freq.items()), columns=["lemma", "count"])
    .sort_values("count", ascending=False, kind="stable")
    .reset_index(drop=True)
)

# Create the output directory if it is missing so the export does not fail
# on a fresh checkout.
Path("./output").mkdir(parents=True, exist_ok=True)
east_lemma_df.to_csv("./output/east_lemmas.csv", index=False)

### Total Unique Lemmas

In [29]:
# One row per distinct lemma, so the row count is the number of unique lemmas.
len(east_lemma_df)

9312

### Top 10 Unique Lemmas

In [24]:
# Rows are ordered by descending count, so the first ten are the most frequent.
east_lemma_df.iloc[:10]

Unnamed: 0,lemma,count
0,the,10480
1,be,9205
2,and,8503
3,he,6358
4,I,6140
5,a,5796
6,to,5711
7,you,4563
8,it,4124
9,not,4002


### Number of Single-Use Lemmas

In [25]:
# Number of lemmas that occur exactly once in the text.
east_lemma_df["count"].eq(1).sum()

3956

# Ulysses: James Joyce

## Raw Word Counts

In [30]:
# Read the whole novel and collapse every run of whitespace (newlines, tabs,
# repeated spaces) into a single space, leaving one continuous line of text.
with open("./input/Ulysses.txt", encoding="utf-8") as f:
    ulysses = " ".join(f.read().split())

In [31]:
# Run the spaCy pipeline once over the full text; the resulting Doc is also
# used by the lemma counts later in the notebook.
ulysses_doc = nlp(ulysses)

# Keep the verbatim text of every non-punctuation token, then tally how often
# each distinct form occurs (the tally is case-sensitive).
ulysses_words = [tok.text for tok in ulysses_doc if not tok.is_punct]
ulysses_word_freq = Counter()
ulysses_word_freq.update(ulysses_words)

In [32]:
# Turn the word tally into a two-column table, most frequent first.
# Building from .items() with explicit column names also works for an empty
# tally, and kind="stable" keeps equally frequent words in the Counter's
# first-seen order so the exported CSV has a predictable tie order.
ulysses_df = (
    pd.DataFrame(list(ulysses_word_freq.items()), columns=["word", "count"])
    .sort_values("count", ascending=False, kind="stable")
    .reset_index(drop=True)
)

# Create the output directory if it is missing so the export does not fail
# on a fresh checkout.
Path("./output").mkdir(parents=True, exist_ok=True)
ulysses_df.to_csv("./output/ulysses_words.csv", index=False)

### Total Unique Words

In [33]:
# One row per distinct token text, so the row count is the vocabulary size.
len(ulysses_df)

33732

### Top 10 Unique Words

In [34]:
# Rows are ordered by descending count, so the first ten are the most frequent.
ulysses_df.iloc[:10]

Unnamed: 0,word,count
0,the,13524
1,of,8062
2,and,6627
3,a,5823
4,to,4798
5,in,4687
6,his,3063
7,he,3060
8,I,3002
9,’s,2678


### Number of Single-Use Words

In [35]:
# Number of words that occur exactly once in the text.
ulysses_df["count"].eq(1).sum()

19206

## Lemma Counts

In [36]:
# Same tally as the raw word counts, but keyed on each token's lemma instead
# of its verbatim text; punctuation tokens are still skipped.
ulysses_lemmas = [tok.lemma_ for tok in ulysses_doc if not tok.is_punct]
ulysses_lemma_freq = Counter()
ulysses_lemma_freq.update(ulysses_lemmas)

In [38]:
# Turn the lemma tally into a two-column table, most frequent first.
# Building from .items() with explicit column names also works for an empty
# tally, and kind="stable" keeps equally frequent lemmas in the Counter's
# first-seen order so the exported CSV has a predictable tie order.
ulysses_lemma_df = (
    pd.DataFrame(list(ulysses_lemma_freq.items()), columns=["lemma", "count"])
    .sort_values("count", ascending=False, kind="stable")
    .reset_index(drop=True)
)

# Create the output directory if it is missing so the export does not fail
# on a fresh checkout.
Path("./output").mkdir(parents=True, exist_ok=True)
ulysses_lemma_df.to_csv("./output/ulysses_lemmas.csv", index=False)

### Total Unique Lemmas

In [39]:
# One row per distinct lemma, so the row count is the number of unique lemmas.
len(ulysses_lemma_df)

26590

### Top 10 Unique Lemmas

In [40]:
# Rows are ordered by descending count, so the first ten are the most frequent.
ulysses_lemma_df.iloc[:10]

Unnamed: 0,lemma,count
0,the,14953
1,of,8144
2,and,7215
3,a,6461
4,be,6116
5,he,5839
6,to,4958
7,in,4947
8,I,3937
9,his,3330


### Number of Single-Use Lemmas

In [41]:
# Number of lemmas that occur exactly once in the text.
ulysses_lemma_df["count"].eq(1).sum()

14928