# Extracting meaningful lines from 'Alchemy of Souls' 

Loading the libraries and data

In [1]:
import pandas as pd
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

df_ep19 = pd.read_csv('https://raw.githubusercontent.com/dduyg/alchemy-of-souls/main/data/raw-data/alchemyofsouls_S01E19.csv')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing the text data by removing any unwanted characters, punctuation, and stop words

In [2]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    tokens = word_tokenize(text) # tokenize the text
    tokens = [word for word in tokens if not word in stop_words] # remove stop words
    clean_text = ' '.join(tokens)
    return clean_text

df_ep19['clean_text'] = df_ep19['text'].apply(clean_text)

Using a topic modeling technique like Latent Dirichlet Allocation (LDA) to extract themes and messages from the preprocessed text data.

In [3]:
!pip install gensim
from gensim import corpora
from gensim.models import LdaModel

# create a dictionary
dictionary = corpora.Dictionary(df_ep19['clean_text'].apply(lambda x: x.split()))

# create a corpus
corpus = [dictionary.doc2bow(text.split()) for text in df_ep19['clean_text']]

# train the LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=42)



The top 5 topics and the top words in each topic.

In [4]:
# print the topics and the top words in each topic
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

Topic: 0 
Words: 0.031*"ice" + 0.029*"stone" + 0.012*"jinyowon" + 0.012*"body" + 0.012*"like" + 0.011*"jang" + 0.010*"mage" + 0.010*"know" + 0.009*"would" + 0.009*"soul"

Topic: 1 
Words: 0.021*"jang" + 0.018*"power" + 0.015*"father" + 0.013*"master" + 0.013*"gang" + 0.011*"energy" + 0.010*"use" + 0.009*"must" + 0.009*"put" + 0.009*"us"

Topic: 2 
Words: 0.020*"master" + 0.018*"mudeok" + 0.013*"die" + 0.010*"like" + 0.010*"sorcery" + 0.010*"everyone" + 0.009*"power" + 0.009*"also" + 0.009*"get" + 0.008*"soul"

Topic: 3 
Words: 0.024*"stone" + 0.021*"ice" + 0.010*"jin" + 0.008*"need" + 0.008*"going" + 0.007*"told" + 0.007*"soul" + 0.007*"take" + 0.007*"found" + 0.007*"become"

Topic: 4 
Words: 0.017*"star" + 0.016*"kings" + 0.013*"us" + 0.013*"right" + 0.012*"let" + 0.011*"body" + 0.010*"get" + 0.009*"uk" + 0.009*"souls" + 0.009*"person"



Using the LDA model to assign topics to each line in the dataframe and then filter the lines that belong to the relevant topics.

In [5]:
def get_topic(text):
    bow = dictionary.doc2bow(text.split())
    topic = max(lda_model[bow], key=lambda x: x[1])[0]
    return topic

df_ep19['topic'] = df_ep19['clean_text'].apply(get_topic)

The meaningful, iconic lines that contain valuable messages/lessons from the relevant topics identified by the LDA model.

In [6]:
# find the meaningful, iconic lines that belong to the relevant topics
relevant_topics = [0, 2] # replace with the relevant topics identified from the LDA model
iconic_lines = df_ep19[df_ep19['topic'].isin(relevant_topics)]['text']
print(iconic_lines)

5                                  Yes, it has to be it.
9              through the great power of the ice stone.
11     It explains why he performed sorcery\nto save ...
14     a good-for-nothing man like me\nwas able to marry
16                      So I guess I should be grateful.
                             ...                        
929                           despite how lacking I was.
930                                          I know that
931      you gave up your chance\nto regain your energy.
936                             I, Mu-deok, your master,
939                                   Marry me, Mu-deok.
Name: text, Length: 427, dtype: object
