In [1]:
%reset -fs

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

In [3]:
def find_speaker(text):
    return text.split(':')[0].strip()
def find_text(text):
    return text.split(':')[1]

In [4]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

In [5]:
with open('fe7_only.txt') as f:
    lines = f.read()

In [6]:
lines



In [7]:
lines = lines.replace("\n   ", " ")
lines = lines.replace("Agung", "Mark")
lines = lines.split('\n')

In [8]:
':' not in lines[6]

False

In [9]:
for i in range(len(lines) - 1, -1, -1):
    if (':' not in lines[i]) or ("CHAPTER" in lines[i]) or ("PROLOGUE" in lines[i]) or (lines[i][-1] == ":"):
        lines.pop(i)
        

In [10]:
lines

["Lyn  : Are you awake? I found you unconscious on the plains. I am Lyn, of the Lorca tribe. You're safe now. Who are you? Can you remember your name? Your name is Mark? What an odd-sounding name... But pay me no mind. It is a good name. I see by your attire that you are a traveler. What brings you to the Sacae Plains? Would you share your story with me? Hm? What was that noise? I'll go see what's happening. Mark, wait here for me.",
 "Lyn  : Oh, no! Bandits! They must have come down from the Bern Mountains! They must be planning on raiding the local villages. I... I have to stop them! If that's all of them, I think I can handle them on my own. You'll be safe in here, Mark! What? You want to help? Well, can you use a weapon? Ah, I see... So you're a strategist by trade? An odd profession, but... Very well. We'll go together!",
 "Lyn  : Over here! If you want to help, Mark, I could use your advice. I'll protect you, so stay close to me.",
 'Lyn  : I need to be closer to the enemy.',
 'L

In [11]:
df = pd.DataFrame()
df['text'] = lines
df

Unnamed: 0,text
0,Lyn : Are you awake? I found you unconscious ...
1,"Lyn : Oh, no! Bandits! They must have come do..."
2,"Lyn : Over here! If you want to help, Mark, I..."
3,Lyn : I need to be closer to the enemy.
4,"Lyn : Yes, this should be close enough."
...,...
3642,Rebecca : Forgive me... Brother...
3643,Raven : I'm done for. Is this really... the be...
3644,Sain : Lady Lyndis... I... was... so happy...
3645,Serra : Ooh! Why me? This is SO annoying!


In [12]:
df['speaker'] = lines
df['text'] = df['text'].apply(find_text)
df['speaker'] = df['speaker'].apply(find_speaker)
df

Unnamed: 0,text,speaker
0,Are you awake? I found you unconscious on the...,Lyn
1,"Oh, no! Bandits! They must have come down fro...",Lyn
2,"Over here! If you want to help, Mark, I could...",Lyn
3,I need to be closer to the enemy.,Lyn
4,"Yes, this should be close enough.",Lyn
...,...,...
3642,Forgive me... Brother...,Rebecca
3643,I'm done for. Is this really... the best I co...,Raven
3644,Lady Lyndis... I... was... so happy...,Sain
3645,Ooh! Why me? This is SO annoying!,Serra


In [13]:
stop_words_plus = ENGLISH_STOP_WORDS.union(['ah', 'yes', 'no', 'um', 'oh', 'hm', 'uh', 've', 'huh', 'eh', 'ha', 'hey',
                                           'ok', 'sure', 'll', 'let', 'isn', 'heh', 'don', 'did', 'just', 'like', 'think', 'hee', 'fine'])

In [14]:
cv = CountVectorizer(strip_accents = 'unicode', stop_words = stop_words_plus, lowercase = True)
X = cv.fit_transform(df['text'])

In [15]:
df_cv = pd.DataFrame(X.todense(), index = df['text'], columns = cv.get_feature_names())



In [16]:
df_cv

Unnamed: 0_level_0,000,100,18,19,30,50,75th,aa,aaaaaaah,aaaaah,...,yo,yon,youdoing,young,younger,yourd,youre,youth,yu,zephiel
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Are you awake? I found you unconscious on the plains. I am Lyn, of the Lorca tribe. You're safe now. Who are you? Can you remember your name? Your name is Mark? What an odd-sounding name... But pay me no mind. It is a good name. I see by your attire that you are a traveler. What brings you to the Sacae Plains? Would you share your story with me? Hm? What was that noise? I'll go see what's happening. Mark, wait here for me.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Oh, no! Bandits! They must have come down from the Bern Mountains! They must be planning on raiding the local villages. I... I have to stop them! If that's all of them, I think I can handle them on my own. You'll be safe in here, Mark! What? You want to help? Well, can you use a weapon? Ah, I see... So you're a strategist by trade? An odd profession, but... Very well. We'll go together!",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Over here! If you want to help, Mark, I could use your advice. I'll protect you, so stay close to me.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
I need to be closer to the enemy.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Yes, this should be close enough.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Forgive me... Brother...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
I'm done for. Is this really... the best I could aspire to?,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Lady Lyndis... I... was... so happy...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ooh! Why me? This is SO annoying!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
tfidf = TfidfVectorizer(**cv.get_params())
X_tfidf = tfidf.fit_transform(df['text'])

set_components = 6
itera = 20

lda_tf = LatentDirichletAllocation(n_components = set_components, max_iter = itera, random_state = 0)
lda_tf.fit(X)

lda_tfidf = LatentDirichletAllocation(n_components = set_components, max_iter = itera, random_state = 0)
lda_tfidf.fit(X_tfidf)



LatentDirichletAllocation(max_iter=20, n_components=6, random_state=0)

In [18]:
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, X, cv)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


1. Lord Eliwood trying to save Ninian
2. Assassination plot against Marquess Caelin
3. Lyn petitioning for help to reunite with her grandfather
4. Hector or Mark lending aid
5. Nils and Ninian trapped at Dragon's Gate
6. Miscellaneous

In [19]:

pyLDAvis.sklearn.prepare(lda_tfidf, X_tfidf, tfidf)

  default_term_info = default_term_info.sort_values(


In [None]:
X