In [1]:
from IPython.display import display, HTML
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
spark = SparkSession.builder.appName("GameOfThronesAnalysis").getOrCreate()

def display_df(df, rows=1):
    display(HTML(df.toPandas().head(rows).to_html()))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/20 18:06:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# dataset = spark.read.csv("../medium_search_dataset.csv", header=True, inferSchema=True)
# print("dataset rows:", dataset.count())
# data = dataset.select('post_time', 'post_name', 'ranking', 'Searched')
data = spark.read.csv("../got_scripts_breakdown.csv", header=True, inferSchema=True)
print("data rows:", data.count())
data.describe()

df_agg = data.orderBy('Name', 'N_series').groupBy('Name').agg(F.array_join(F.collect_list("Sentence"), ' ', ' ').alias("Dialogue"))
display_df(df_agg, 1)


                                                                                

data rows: 22510


                                                                                

Unnamed: 0,Name,Dialogue
0,,"Do the Dothrakis buy their slaves? I didn’t realize there were knights of the Blackwater Lyanna Would you get out of here? We’re sparring I don’t know But he’s got giant’s blood Winter is here, Your Grace We need the King in the North in the north There is a snow blown field ARYA is riding on a horse Winterfell can be seen far in the distance"


In [3]:
import string
from nltk.stem.snowball import SnowballStemmer
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.sql.types import ArrayType, StringType


tokenizer = Tokenizer(inputCol="Dialogue", outputCol="words")
token_data = tokenizer.transform(df_agg)

stopwords = StopWordsRemover.loadDefaultStopWords('english')
stopwords.extend(['it'])
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=stopwords)
# print(stopwords_remover.getStopWords())
df_no_stopwords = stopwords_remover.transform(token_data)


def clean_stem(stemmer, token):
    """ Return stem word, removing any punctuation from end of string """
    t = stemmer.stem(token)
    return t.rstrip(string.punctuation) 
    # return t.translate(str.maketrans('', '', string.punctuation))
    
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [clean_stem(stemmer, token) for token in tokens], ArrayType(StringType()))
df_stemmed = df_no_stopwords.withColumn("words_stemmed", stemmer_udf("filtered_words"))


cv = CountVectorizer(inputCol="words_stemmed", outputCol="features")
cv_model = cv.fit(df_stemmed)
vocabulary = {i:v for i,v in enumerate(cv_model.vocabulary)}
vector_data = cv_model.transform(df_stemmed)
display_df(vector_data, 5)

                                                                                

Unnamed: 0,Name,Dialogue,words,filtered_words,words_stemmed,features
0,,"Do the Dothrakis buy their slaves? I didn’t realize there were knights of the Blackwater Lyanna Would you get out of here? We’re sparring I don’t know But he’s got giant’s blood Winter is here, Your Grace We need the King in the North in the north There is a snow blown field ARYA is riding on a horse Winterfell can be seen far in the distance","[do, the, dothrakis, buy, their, slaves?, i, didn’t, realize, there, were, knights, of, the, blackwater, lyanna, would, you, get, out, of, here?, we’re, sparring, i, don’t, know, but, he’s, got, giant’s, blood, winter, is, here,, your, grace, we, need, the, king, in, the, north, in, the, north, there, is, a, snow, blown, field, arya, is, riding, on, a, horse, winterfell, can, be, seen, far, in, the, distance]","[dothrakis, buy, slaves?, didn’t, realize, knights, blackwater, lyanna, get, here?, we’re, sparring, don’t, know, he’s, got, giant’s, blood, winter, here,, grace, need, king, north, north, snow, blown, field, arya, riding, horse, winterfell, seen, far, distance]","[dothraki, buy, slaves, didn't, realiz, knight, blackwat, lyanna, get, here, we'r, spar, don't, know, he, got, giant, blood, winter, here, grace, need, king, north, north, snow, blown, field, arya, ride, hors, winterfel, seen, far, distanc]","(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...)"
1,a voice,"It’s Maester Luwin, my lord","[it’s, maester, luwin,, my, lord]","[it’s, maester, luwin,, lord]","[it, maester, luwin, lord]","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
2,a younger melara,We shouldn’t be out here alone,"[we, shouldn’t, be, out, here, alone]","[shouldn’t, alone]","[shouldn't, alon]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
3,addam marbrand,ls it true about Stannis and Renly?,"[ls, it, true, about, stannis, and, renly?]","[ls, true, stannis, renly?]","[ls, true, stanni, renly]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
4,aemon,"And what is it that couldnt wait until morning, Tarly? That it was you? Who else but Samwell Tarly would be wasting candles to read in the middle of the night? Oh, no need I know my way around this library better than any other place in Castle Black Thousands of books and no eyes to read them Old age is a wonderful source of ironies if nothing else But you didnt answer my question, what are you reading? Ah, the Wildlings I can assure you that the closest Maester Faull ever got to a Wildling was this very library Yes Imagine the stories Wildlings tell about us! Love is the death of duty I told that to your friend Jon Snow once He didnt listen and neither did you Thats why you abandoned your watch atop the wall to come here and read about the things that may have happened to the girl you love Yes you do Yes you do! I heard it in your voice when you first brought her to me I remember how it sounds! I was in love once You can imagine all manner of horrors befalling that poor girl and her child, is it so difficult to imagine that an old person was once, more or less, like you? You know who I was before I came here? What I couldve been if Id only said the word? Of course you do I met many girls when I was Aemon Targaryen A future king always does Some of them were quite forward in their attempts to win my affections One of them succeeded I was very young Ah ha I could tell you everything about her Who she was How we met The color of her eyes and the shape of her nose I can see her right in front of me Shes more real than you are We could spend all night trading tales of lost love Nothing makes the past a sweeter place to visit than the prospect of imminent death Go to bed Tarly They came to us from White Harbor and Barrowton, from Fairmarket and Kings Landing, from north and south, from east and west They died protecting men, women, and children who will never know their names It is for us to remember them Our brothers, we shall never see their like again And now their watch is ended.Aemon passes the torch around as each member of the Nights Watch helps burn the bodies Stannis watches from above Does anyone wish to speak for candidates before we cast our tokens for the 998th Lord Commander of the Nights Watch? Is there anyone else? If theres no one else, we will begin the voting The triangular tokens count for Ser Alliser Thorne The square tokens for Ser Denys Mallister Each brother will- Samwell Tarly Go on It is time And shes alone, under siege, no family to guide her or protect her Her last relation thousands of miles away, useless, dying A Targaryen, alone in the world, is a terrible thing Lord Commander Oh, like a hundred-year-old man slowly freezing to death Half the men hate you already, Lord Commander Do it That doesnt matter You do You will find little joy in your command But with luck, you will find the strength to do what needs to be done Kill the boy, Jon Snow Winter is almost upon us Kill the boy, and let the man be born There he is That laugh Egg! Egg laughed like that One of the first things I remember about him And before that, he was a jolly fellow Like this one Get him south, Gillyflower Before it’s too late Egg! Egg! Mothers looking for you! Egg Egg! Egg I dreamed that I was old","[and, what, is, it, that, couldnt, wait, until, morning,, tarly?, that, it, was, you?, who, else, but, samwell, tarly, would, be, wasting, candles, to, read, in, the, middle, of, the, night?, oh,, no, need, i, know, my, way, around, this, library, better, than, any, other, place, in, castle, black, thousands, of, books, and, no, eyes, to, read, them, old, age, is, a, wonderful, source, of, ironies, if, nothing, else, but, you, didnt, answer, my, question,, what, are, you, reading?, ah,, the, wildlings, i, can, assure, you, that, the, closest, maester, faull, ever, got, to, a, wildling, was, this, very, library, ...]","[couldnt, wait, morning,, tarly?, you?, else, samwell, tarly, wasting, candles, read, middle, night?, oh,, need, know, way, around, library, better, place, castle, black, thousands, books, eyes, read, old, age, wonderful, source, ironies, nothing, else, didnt, answer, question,, reading?, ah,, wildlings, assure, closest, maester, faull, ever, got, wildling, library, yes, imagine, stories, wildlings, tell, us!, love, death, duty, told, friend, jon, snow, didnt, listen, neither, thats, abandoned, watch, atop, wall, come, read, things, may, happened, girl, love, yes, yes, do!, heard, voice, first, brought, remember, sounds!, love, imagine, manner, horrors, befalling, poor, girl, child,, difficult, imagine, old, person, once,, less,, like, ...]","[couldnt, wait, morning, tarly, you, els, samwel, tar, wast, candl, read, middl, night, oh, need, know, way, around, librari, better, place, castl, black, thousand, book, eye, read, old, age, wonder, sourc, ironi, noth, els, didnt, answer, question, reading, ah, wildl, assur, closest, maester, faull, ever, got, wildl, librari, yes, imagin, stori, wildl, tell, us, love, death, duti, told, friend, jon, snow, didnt, listen, neither, that, abandon, watch, atop, wall, come, read, thing, may, happen, girl, love, yes, yes, do, heard, voic, first, brought, rememb, sounds, love, imagin, manner, horror, befal, poor, girl, child, difficult, imagin, old, person, once, less, like, ...]","(3.0, 3.0, 0.0, 4.0, 2.0, 0.0, 2.0, 5.0, 0.0, 1.0, 4.0, 3.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0, 2.0, 1.0, 1.0, 1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 2.0, 3.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 4.0, 4.0, 3.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 2.0, 5.0, 2.0, 1.0, 1.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 2.0, 2.0, 1.0, 0.0, 0.0, 1.0, ...)"


In [None]:
from pyspark.ml.clustering import LDA

lda = LDA(k=15, maxIter=30, featuresCol="features")
lda_model = lda.fit(vector_data)

topics = lda_model.describeTopics(3)


[Stage 121:>                                                        (0 + 1) / 1]

In [None]:
import pyspark.sql.functions as F

map_array = F.udf(lambda a: [vocabulary[k] for k in a])
df = topics.withColumn('terms', map_array(topics['termIndices']))
display_df(df, 50)