# Import packages

In [1]:
from pathlib import Path
import zipfile

from string import ascii_letters
from random import sample
from langdetect import detect

import pandas as pd

# Load data

In [None]:
! kaggle datasets download -d michaelarman/poemsdataset
! unzip -q poemsdataset.zip
! mv poemsdataset data

In [2]:
def list_poems_filenames(location):
    filenames = []
    for poems_main_category in location.glob('*'):
        for poems_category in poems_main_category.glob('*'):
            for poem_filename in poems_category.glob('*'):
                filenames.append(poem_filename)
    return filenames

In [3]:
def get_full_texts(location):
    filenames = list_poems_filenames(location)
    poems = []
    for filename in filenames:
        with filename.open(encoding="utf-8") as f:
            lines = f.readlines()
        poems.append(lines)
    return poems

In [4]:
data_location = Path.cwd().joinpath("data")

In [5]:
sample(list_poems_filenames(data_location), 8)

[WindowsPath('C:/Users/koste/Studia/10/NLP/Poem-Generation/data/forms/pantoum/PantoumPoemsPantoumPoembycynthiaRouten.txt'),
 WindowsPath('C:/Users/koste/Studia/10/NLP/Poem-Generation/data/forms/sonnet/SonnetPoemsHolySonnetIThouHastMadeMePoembyJohnDonne.txt'),
 WindowsPath('C:/Users/koste/Studia/10/NLP/Poem-Generation/data/topics/sympathy/SympathyPoemsSympathyForSurvivorsOfAPakistaniMassMurderOfChildrenPoembyBillGrace.txt'),
 WindowsPath('C:/Users/koste/Studia/10/NLP/Poem-Generation/data/topics/faith/FaithPoemsLostFaithPoembySathyaNarayana.txt'),
 WindowsPath('C:/Users/koste/Studia/10/NLP/Poem-Generation/data/topics/culture/CulturePoemsRacismAndTheCultureOfRacismRev2013PoembyTradeMartin.txt'),
 WindowsPath('C:/Users/koste/Studia/10/NLP/Poem-Generation/data/forms/quatrain/QuatrainPoemsLoveInQuatrainVersePoembyMuzahidulReza.txt'),
 WindowsPath('C:/Users/koste/Studia/10/NLP/Poem-Generation/data/topics/romance/RomancePoemsNoRomanceSoldUntoPoembyEmilyDickinson.txt'),
 WindowsPath('C:/Users/k

In [6]:
len(list_poems_filenames(data_location))

20657

In [7]:
all_poems = get_full_texts(data_location)

In [8]:
sample(all_poems, 2)

[['844\n',
  'Spring is the Period\n',
  'Express from God.\n',
  'Among the other seasons\n',
  'Himself abide,\n',
  'But during March and April\n',
  'None stir abroad\n',
  'Without a cordial interview\n',
  'With God.'],
 ["Here in the flamin' thick of thick of things,\n",
  "With Death across the way, 'n' traps\n",
  'What little Fritz the German flings\n',
  "Explodin' in yer lunch pe'aps,\n",
  "It ain't all glory for a bloke',\n",
  "It ain't all corfee 'ot and stoo,\n",
  "Nor wavin' banners in the smoke,\n",
  "Or practisin' the bay'net stroke—\n",
  'We has our little troubles, too!\n',
  "Here's Trigger Ribb bin seein' red\n",
  "'N' raisin' Cain because he had,\n",
  "Back in the caverns iv his 'ead,\n",
  "A 'oller tooth run ravin' mad.\n",
  "Pore Trigger up 'n' down the trench\n",
  "Was jiggin' like a blithered loan,\n",
  "'N' every time she give a wrench\n",
  'You orter seen the beggar blench,\n',
  "You orter 'eard him play a toon.\n",
  "The sullen shells was paw

In [9]:
len(all_poems)

20657

# Remove non-english poems

In [10]:
def remove_non_english(poems):
    english_poems = []
    for poem in poems:
        poem_string = " ".join(poem)
        try:
            language = detect(poem_string)
        except:
            language = "error"
            print("This poem throws an error:", poem_string)
        if language == "en":
            english_poems.append(poem)
    return english_poems

In [11]:
english_poems = remove_non_english(all_poems)

This poem throws an error: 
This poem throws an error: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
This poem throws an error: ଖୋଲା ଆକାଶ ତଳେ
 ଖରା, ବର୍ଷା, ଶୀତକୁଆଲିଗଂନ କରି
 ଝାଳ ବୁହାଇ ଚାଲେ
 ବଞ୍ଚିବାର କିଛି ସ୍ଵପ୍ନ ନେଇ
 କାରଣ, କିଛି ଦାନା ପେଟ ପାଇଁ- ଜୀବନର ଲକ୍ଷ୍ୟ ସେଇ
 ତଥାପି ସେ ବଞ୍ଚେ
 ହସି ହସି ଗ୍ରହଣ କରେ ପରିସ୍ଥିର ସମସ୍ତ ଦେୟ
 ନ ଥାଏ କାହା ପ୍ରତି କେବେ ହେୟ
 ଅବା କିଛି ଲୁଟି ହୋଇ ଯିବାର ଭୟ
 ଅଥବା ଅନ୍ୟକୁ ଠେଲି ଆଗକୁ ବଢି ଯିବାର ଶ୍ରେୟ
 କାରଣ, ସେ ତ ଗରିବ- ଏହା ହିଁ ତାର ପରିଚୟ
 ନିତୀ ଆସେ, ଅର୍ଥ ଆସେ
 ଗରିବିର ମୂଳ ଉପ୍ତଡ

In [12]:
len(english_poems)

20407

# Getting pairs from poems

In [13]:
def get_pairs(poems):
    pairs = []
    for poem in poems:
        lines = poem
        
        # 1) to lower, no endl in the end
        lines = [line[:-1].lower() for line in lines]
        
        # 2) only ascii small letters
        allowed = set(ascii_letters + ' ')
        lines = [''.join(l for l in line if l in allowed) for line in lines]
        
        # 3) remove leading whitespaces
        lines = [line.strip() for line in lines]
        
        # 4) take all the pairs next to each other
        for i in range(1, len(lines)):
            pairs.append([lines[i-1], lines[i]])
    return pairs

In [14]:
pairs = get_pairs(english_poems)

In [15]:
sample(pairs, 8)

[['steady thy laden head across a brook',
  'or by a ciderpress with patient look'],
 ['through the pregnant porphyry', 'dome of lapizlazuli'],
 ['i hear the dancemusic of all nations',
  'the waltz some delicious measure lapsing bathing me in'],
 ['two', 'comes'],
 ['with sinfulness of men thereby to learn',
  'true patience and to temper joy with fear'],
 ['are trying to be higher', 'tower'],
 ['however the gardener had already left quietly leaving the beautiful future for you',
  'later the meadow became a garden with beautiful flowers of future all over'],
 ['ah yes and they benumb us at our call',
  'yet still from time to time vague and forlorn']]

In [16]:
len(pairs)

538735

# Remove short verses

In [17]:
minimum_digits_per_verse = 8

In [18]:
pairs = [pair for pair in pairs if len(pair[0]) >= minimum_digits_per_verse and len(pair[1]) >= minimum_digits_per_verse]

In [19]:
len(pairs)

514956

# Remove one-word verses

In [20]:
pairs = [pair for pair in pairs if " " in pair[0] and " " in pair[1]]

In [21]:
len(pairs)

510079

# Remove long ones

In [22]:
maximum_digits_per_verse = 64

In [23]:
pairs = [pair for pair in pairs if len(pair[0]) < maximum_digits_per_verse and len(pair[1]) < maximum_digits_per_verse]

In [24]:
len(pairs)

492888

# Remove pairs with abc title

In [25]:
pairs = [pair for pair in pairs if "abc" not in pair[0]]

In [26]:
len(pairs)

492723

# Remove "copyright" lines

In [27]:
pairs = [pair for pair in pairs if "copyright" not in pair[0] and "copyright" not in pair[1]]

In [28]:
len(pairs)

491492

In [29]:
sample(pairs, 10)

[['for ampler coveting', 'it might be famine all around'],
 ['people say this world is pitiful', 'but she says it beautiful'],
 ['dipped in ink they hent from a harried cuttlefish',
  'at cape sepias the throes felt theypoor daughters of tethys'],
 ['then i realised it was passion', 'passion is strong'],
 ['to bake the glebe and bind the slipry flood',
  'this of the wintry season is the prime'],
 ['again the captain waves his glass', 'sights a beacon turns and cries'],
 ['our backs press up against', 'a corrugated steel fence'],
 ['my helen fair prepared his bed', 'we waked ere lang lifes smotherd flame'],
 ['tonight he answered gravely and was dumb',
  'but pointed out the stones that numbered miles']]

# Remove verses with rare words in poems given

In [30]:
words = [pair[0].split(" ") for pair in pairs]
words = [item for sublist in words for item in sublist]

In [31]:
words_df = pd.DataFrame(words)

In [32]:
words_value_counts = words_df.value_counts()
words_value_counts.head()

the    169809
and    109827
of      72361
to      71901
a       62139
dtype: int64

In [33]:
min_occurs = 8
common_dataset_words = list(words_value_counts[words_value_counts >= min_occurs].index)

In [34]:
common_dataset_words = [str(word)[2:-3] for word in common_dataset_words]
len(common_dataset_words)

17322

In [35]:
sample(common_dataset_words, 8)

['clerihew',
 'dwellers',
 'loo',
 'shoved',
 'claps',
 'stalked',
 'learnings',
 'rancour']

In [36]:
def filter_pairs_with_words_in(pairs, list_of_acceptable):
    pairs_filtered = []
    for pair in pairs:
        has_rare_words = False
        for i in range(2):
            sentence = pair[i].split(" ")
            for word in sentence:
                if word not in list_of_acceptable:
                    has_rare_words = True
                    break
        if not has_rare_words:
            pairs_filtered.append(pair)
    return pairs_filtered

In [37]:
pairs = filter_pairs_with_words_in(pairs, common_dataset_words)

In [38]:
len(pairs)

309503

## Remove verses with words not in TOP 20 000 common

https://github.com/first20hours/google-10000-english/

In [39]:
google_common_words = list(pd.read_csv("google-20000-english.txt")["the"])

In [40]:
pairs = filter_pairs_with_words_in(pairs, google_common_words)

In [41]:
len(pairs)

87264

In [42]:
sample(pairs, 32)

[['arab children', 'spring rain'],
 ['always knew there was a special reason for', 'your presence here'],
 ['and because i love you i hate you',
  'i hate you when you are morning sadness'],
 ['felt by whom ignored by most', 'we host truth'],
 ['yet taste every feeling', 'seeing into each'],
 ['but i must attend', 'an older friend'],
 ['to waste his whole creation or possess',
  'all as our own and drive as we were driven'],
 ['this question i pose', 'what on this earth is'],
 ['alas his spirit even there', 'where all around was bright and fair'],
 ['just friends without that trust', 'you have to work your trust with me'],
 ['and they are came up with real things', 'drugs like crap'],
 ['mere dreams mere dreams yet homer had not sung',
  'had he not found it certain beyond dreams'],
 ['with first approach of light we must be risen',
  'and at our pleasant labour to reform'],
 ['life of child is so dear', 'pain may be more but i may bear'],
 ['encouraging us in love', 'and support'],
 [

# Save to .csv

In [43]:
pd.DataFrame(pairs, columns=["verse 1", "verse 2"]).to_csv("verses_pairs.csv", index=False)