In [1]:
from copy import deepcopy
import pandas as pd
import numpy as np
from pymongo import MongoClient

In [35]:
# imported custom function to replace text keys with values in a substitution dictionary.
from my_functions import replace

In [36]:
# Read the ten century text files (one line of text per row), tag each row with
# its century number, and stack everything into a single dataframe.
# The per-century frames are collected in a list and concatenated once; growing
# `df` with pd.concat inside the loop re-copies all earlier rows on every pass.
century_frames = []
for century in range(1, 11):
    century_lines = pd.read_csv(f'./nostradamus_docs/nostradamus_century_{century}.txt', sep = '\t', header = None)[0]
    century_frames.append(pd.DataFrame({'prophecy': century_lines, 'century': century}))
# Each century keeps its own 0-based row labels here, so labels repeat across centuries.
df = pd.concat(century_frames, axis = 0, sort = False)

In [37]:
# preview the first rows of the stacked text lines and their century label.
df.head()

Unnamed: 0,prophecy,century
0,Century I,1
1,1,1
2,Sitting alone at night in secret study;,1
3,it is placed on the brass tripod.,1
4,A slight flame comes out of the emptiness and,1


In [38]:
# Drop the title row of each document, i.e. every row whose text contains 'Century'.
# The old row labels are discarded so the remaining rows are renumbered from 0.
is_body_line = ['Century' not in text for text in df['prophecy']]
df = df[is_body_line].reset_index(drop = True)

In [40]:
# Drop every fifth row (positions 0, 5, 10, ...).
# In the source text these rows hold the prophecy number, not prophecy text.
row_positions = np.arange(len(df))
df = df[row_positions % 5 != 0].reset_index(drop = True)

In [41]:
# Label the rows 1-4 by their position within each group of four consecutive rows,
# so every prophecy has four numbered lines. Values are stored as floats.
df['line_num'] = (np.asarray(df.index) % 4 + 1).astype(float)

In [43]:
# Number the prophecies within each century, restarting at 1 when a new century begins.
# All four rows of a prophecy receive the same number.
# The numbers are computed in a single pass and assigned as one float column,
# instead of writing one cell at a time through df.loc.
n = 0
century = 1
starts_prophecy = (df.index % 4 == 0).tolist()
prophecy_nums = []
for is_first_line, row_century in zip(starts_prophecy, df['century']):
    if is_first_line:
        if row_century == century:
            n += 1
        else:
            # first line of the next century: advance the century and restart the count.
            century += 1
            n = 1
    prophecy_nums.append(n)
df['prophecy_num'] = np.array(prophecy_nums, dtype = float)

In [9]:
# combined prophecy lines into individual cells so that each prophecy has one cell:
# the rows of every (century, prophecy_num) pair are joined with two spaces.
def join_lines(lines):
    """Join the text lines of one prophecy into a single string."""
    return '  '.join(lines)

lines_by_prophecy = df.groupby(['century', 'prophecy_num'])['prophecy']
df = lines_by_prophecy.apply(join_lines).reset_index()

In [10]:
# cast the century column to float.
# NOTE(review): presumably done so the values can be stored in MongoDB further down
# (numpy float64 subclasses Python float, numpy int64 is not a Python int) - confirm.
df['century'] = df['century'].astype(float)

In [11]:
# number of rows after grouping, i.e. one per (century, prophecy_num) pair.
len(df)

942

In [12]:
# preview the combined prophecies (one row per prophecy).
df.head()

Unnamed: 0,century,prophecy_num,prophecy
0,1.0,1.0,Sitting alone at night in secret study; it is...
1,1.0,2.0,The wand in the hand is placed in the middle o...
2,1.0,3.0,When the litters are overturned by the whirlwi...
3,1.0,4.0,In the world there will be made a king who wi...
4,1.0,5.0,They will be driven away for a long drawn out ...


In [13]:
# created new dataframe for nostradamus' epistle to Henry II.
# rows 0 and 1 are skipped (presumably title lines - confirm against the file);
# the remaining lines are joined with two spaces into one row under a constant group key.
h2_raw = pd.read_csv('./nostradamus_docs/nostradamus_epistle_henry_2.txt', sep = '\t', header = None)
h2_body = h2_raw.assign(group = 1).loc[2:]
h2_df = h2_body.groupby(['group'])[0].apply(lambda lines: '  '.join(lines)).reset_index()

In [14]:
# created new dataframe for nostradamus' preface to Les Propheties.
# rows 0 and 1 are skipped (presumably title lines - confirm against the file);
# the remaining lines are joined with two spaces into one row under a constant group key.
pre_raw = pd.read_csv('./nostradamus_docs/nostradamus_preface.txt', sep = '\t', header = None)
pre_body = pre_raw.assign(group = 1).loc[2:]
pre_df = pre_body.groupby(['group'])[0].apply(lambda lines: '  '.join(lines)).reset_index()

In [16]:
# created new collection for prophecies.
# MongoClient() with no arguments connects to the default local server.
client = MongoClient()
db = client.nostradamus
# create_collection raises if 'prophecy' already exists, which made this cell fail on
# every re-run. The collection is rebuilt from the text files further down, so it is
# dropped first (this deletes its current contents; a no-op when it does not exist),
# which also keeps re-runs from piling up duplicate documents.
db.drop_collection('prophecy')
db.create_collection('prophecy')

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'nostradamus'), 'prophecy')

In [17]:
# db.drop_collection('prophecy')

In [18]:
# (host, port) of the server the client is connected to.
client.address

('localhost', 27017)

In [19]:
# databases on the server; 'nostradamus' should be listed.
client.list_database_names()

['MyDatabase', 'admin', 'config', 'local', 'nostradamus']

In [20]:
# collections in the nostradamus database; 'prophecy' should be listed.
db.list_collection_names()

['prophecy']

In [21]:
# handle to the 'prophecy' collection, used for the inserts and queries below.
col = db.get_collection('prophecy')

In [22]:
# no documents in collection yet: the count should be 0 before the prophecies are inserted.
col.estimated_document_count()

0

In [23]:
# added all prophecies to the collection.
# One document per dataframe row, with the keys 'prophecy', 'century' and 'prophecy_num'.
# The documents are built positionally (no label lookups such as df['century'][i]) and
# sent in a single insert_many call instead of one insert_one round trip per prophecy.
docs = [
    {'prophecy': prophecy, 'century': century, 'prophecy_num': prophecy_num}
    for prophecy, century, prophecy_num in zip(df['prophecy'], df['century'], df['prophecy_num'])
]
# insert_many rejects an empty list, so skip the call when there is nothing to insert.
if docs:
    col.insert_many(docs)

In [24]:
# all prophecies accounted for: the count should equal the number of rows in df.
col.estimated_document_count()

942

In [25]:
# Pull every document of the prophecy collection back into a pandas dataframe
# and discard MongoDB's '_id' column.
prophecy_docs = list(col.find())
df = pd.DataFrame(prophecy_docs).drop(columns = '_id')

In [26]:
# performed substitutions on prophecies
# based on words that were not caught later on by lemmatization/stemming.
#
# Each key is the text to look for and each value is its replacement; an empty
# string value presumably removes the word entirely (depends on `replace` - confirm).
#
# NOTE(review): 'captive' appears twice in the literal below. Python keeps only the
# last value, so the effective mapping is 'captive' -> 'captured'. The second entry
# may have been meant as 'captured': 'capture' - confirm the intent.
#
# NOTE(review): the 'troubleed' key looks like a clean-up for 'troubled' after
# 'bled' -> 'bleed' has already rewritten part of it. That would mean `replace`
# works on substrings in dictionary order, so entries should not be reordered
# without checking my_functions.replace - confirm.

substitutions = {'african': 'africa', 'barbaric': 'barbarian', 'because': '', 'cause': '', 'blood': 'bleed',
                 'bloody': 'bleed', 'bled': 'bleed', 'burnt': 'burn', 'captive': 'capture',
                 'captive': 'captured', 'children': 'child', 'complaints': 'complain',
                 'dead': 'die', 'death': 'die', 'deceived': 'deceive', 'deceit': 'deceive',
                 'depth': 'deep', 'defense': 'defend', 'destruction': 'destroy',
                 'discovered': 'discover', 'discoveries': 'discover', 'discovery': 'discover',
                 'doubly': 'double', 'drawn': 'draw', 'dreamer': 'dream', 'driven': 'drive',
                 'drunken': 'drunk', 'eastern': 'east', 'eaten': 'eat', 'enclosed': 'enclose',
                 'enclosure': 'enclose', 'entry': 'enter', 'envied': 'envy', 'envious': 'envy',
                 'executioners': 'execute', 'extension': 'extend', 'falsify': 'false',
                 'failure': 'fail', 'flight': 'flies', 'florense': 'florence', 'french': 'france',
                 'fraudulent': 'fraud', 'friendship': 'friend', 'frightful': 'frighten',
                 'frost': 'freeze', 'frozen': 'freeze', 'furious': 'fury', 'gascony': 'gascon',
                 'genevans': 'geneva', 'gnashing': 'gnaw', 'governor': 'govern', 'greedy': 'greed',
                 'germany': 'german', 'golden': 'gold', 'greatest': 'great', 'greater': 'great',
                 'halfway': 'half', 'healthy': 'health', 'hairy': 'hair', 'hardships': 'hard',
                 'hatred': 'hate', 'heard': 'hear', 'higher': 'high', 'highest': 'high',
                 'household': 'house', 'hundred': '', 'hungarians': 'hungary', 'impetuosity': 'impetuous',
                 'injured': 'injure', 'italian': 'italy', 'judges': 'judge', 'judgement': 'judge',
                 'kingdom': 'king', 'larger': 'large', 'later': 'late', 'leadership': 'lead',
                 'leader': 'lead', 'leaderless': 'lead', 'leaguers': 'league', 'longer': 'long',
                 'loss': 'lose', 'lost': 'lose', 'lower': 'low', 'lowest': 'low', 'lover': 'love',
                 'lusitanian': 'lusitania', 'macedonian': 'macedonia', 'mankind': 'man',
                 'marriage': 'marry', 'maritime': 'marine', 'mightily': 'mighty', 'newly': 'new',
                 'older': 'old', 'oldest': 'old', 'oppose': 'opposite', 'pleasing': 'pleasure',
                 'philosophers': 'philosophy', 'provincial': 'province', 'pursuers': 'pursue',
                 'pursuit': 'pursue', 'robbery': 'rob', 'robber': 'rob', 'redden': 'red',
                 'redbeard': 'red', 'remainder': 'remain', 'revolutions': 'revolt',
                 'robbed': 'rob', 'shadow': 'shade', 'seventh': 'seven', 'shorter': 'short',
                 'smaller': 'small', 'soldiery': 'soldiers', 'sovereignty': 'sovereign',
                 'spacious': 'space', 'spanish': 'spain', 'spaniards': 'spain', 'strongest': 'strong',
                 'stronghold': 'strong', 'survivor': 'survive', 'swampy': 'swamps', 'sprung': 'spring',
                 'taken': 'take', 'taker': 'take', 'terrified': 'terror', 'terrible': 'terror',
                 'thirsty': 'thirst', 'thought': 'think', 'threatens': 'threat', 'toulousans': 'toulouse',
                 'thunderbolt': 'thunder', 'treacherous': 'treachery', 'trickery': 'trick',
                 'troubleed': 'trouble', 'troubled': 'trouble', 'tuscany': 'tuscan',
                 'twentieth': 'twenty', 'tyrant': 'tyranny', 'victorious': 'victor',
                 'victory': 'victor', 'violent': 'violence', 'younger': 'young', 'warlike': 'war',
                 'weak': 'weaken', 'weaker': 'weaken', 'wives': 'wife', 'wooden': 'wood',
                 'worldwide': 'world', 'worthy': 'worth', 'youth': 'young'}

# Apply the substitutions to the prophecies, the epistle and the preface using the
# custom `replace` helper imported from my_functions. Each result is deep-copied
# before it is written back into its column.
df['prophecy'] = deepcopy(replace(df['prophecy'], substitutions))
h2_df[0] = deepcopy(replace(h2_df[0], substitutions))
pre_df[0] = deepcopy(replace(pre_df[0], substitutions))

In [27]:
# preview the prophecies after the substitutions were applied.
df.head()

Unnamed: 0,century,prophecy,prophecy_num
0,1.0,Sitting alone at night in secret study; it is...,1.0
1,1.0,The wand in the hand is placed in the middle o...,2.0
2,1.0,When the litters are overturned by the whirlwi...,3.0
3,1.0,In the world there will be made a king who wi...,4.0
4,1.0,They will be drive away for a long draw out fi...,5.0


In [28]:
# Persist the three processed dataframes as pickle files under the pickle/ folder.
pickle_targets = {
    'pickle/prophecy_df.pkl': df,
    'pickle/h2_df.pkl': h2_df,
    'pickle/pre_df.pkl': pre_df,
}
for pickle_path, frame in pickle_targets.items():
    frame.to_pickle(pickle_path)