In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pandas as pd

In [2]:
# Build (or reuse) the Spark session for this notebook.
# Going through the builder's getOrCreate() keeps this cell safe to re-run:
# constructing a SparkContext directly raises an error while another context
# is still active, and an app name supplied after the context already exists
# is not applied to it.
conf = SparkConf()
spark = (
    SparkSession.builder
    .appName("lyric Clustering")
    .config(conf=conf)
    .getOrCreate()
)
# Keep exposing the underlying context under the name `sc`.
sc = spark.sparkContext

In [15]:
# Read the header-less raw lyrics file, drop every row that has any missing
# field, then replace the commas inside each lyric with spaces.
df = (
    pd.read_csv(
        './lyric_ko_big.csv',
        header=None,
        names=["title", "artist", "lyric"],
    )
    .dropna(axis=0, how='any')
    .assign(
        lyric=lambda frame: frame["lyric"].apply(
            lambda text: text.replace(",", " ")
        )
    )
)

In [16]:
# Persist the cleaned pandas frame with a header row and without the index.
# NOTE(review): the Spark load cell in this notebook reads "lyric_ko.csv",
# not "lyric_ko_pre.csv" — confirm which file is meant to feed the pipeline.
df.to_csv("./lyric_ko_pre.csv", header=True, index=False)

In [5]:
# Load the lyrics CSV as a Spark DataFrame: the first row provides the column
# names and the column types are inferred from the data.
# Note that this rebinds `df`, which held a pandas frame in the cells above.
df = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true")
    .load("lyric_ko.csv")
)

In [6]:
# Preview the loaded Spark DataFrame as a text table (title, artist, lyric).
df.show()

+--------------------+-----------+--------------------+
|               title|     artist|               lyric|
+--------------------+-----------+--------------------+
|             그때 또 다시|         유노|돌이켜보면 너같은 사람 나에게 ...|
|      Love Love Love|선비(SunBee..|Are you there my ...|
|Butterfly (Prod. ...|  LambC(램씨)|There is a time t...|
|            GOOD BAD|       입술세개|24 군대 막 다녀와서 널 만났...|
|             너무 원했기에|        천소아|아직 이해가 안 돼소나기처럼 내...|
|               이별중이야|데이스타(Days..|욕심이라고 몇 번을 말해야 받아...|
|                Time|      V.O.S|감추려 숨기려 지우려고 아무리 ...|
|우유부단 (Prod. by K ...|조현아(어반자카파..|두려워 지금 너의 눈빛이물론 짐...|
|  기억이 난다 (Feat. 강슬기)|장한종과 J. F..|아련한 안개길 가운데 서니앞에 ...|
|                  그냥|    노래안하는사람|알았지만 애써 외면했어인정하면 ...|
|                  이유|   구오 (GUO)|그대가 날 떠난다고 생각한 적 ...|
|               그대이기에|더 원(The O..|눈부시게 빛나는 그대 얼굴이표정...|
|                Dawn|곽키(Quak-E..|저 차가운 밤을깨운 그 다음묶여...|
|party (SHUT DOWN)...|Sik-K(식케이..|Groovy Everywhere...|
|               그렇게 또|비볼드(Bebol..|넌 그렇게 또장난처럼 내게

In [7]:
# Number of rows in the loaded Spark DataFrame.
df.count()

19637

In [8]:
from customTransformer import KonlpTokenizer

In [9]:
# KonlpTokenizer comes from the local customTransformer module; it is
# configured to read the "lyric" column and write its result to "tokens".
tokenizer = KonlpTokenizer(inputCol="lyric", outputCol="tokens")

In [10]:
# Apply the tokenizer to the loaded lyrics; the result gains a "tokens" column.
tokensDF = tokenizer.transform(df)

In [11]:
# Inspect the tokenizer output.
# NOTE(review): the stored output shows an empty token list for an
# English-only lyric — presumably the tokenizer keeps only Korean terms; confirm.
tokensDF.show()

+--------------------+-----------+--------------------+--------------------+
|               title|     artist|               lyric|              tokens|
+--------------------+-----------+--------------------+--------------------+
|             그때 또 다시|         유노|돌이켜보면 너같은 사람 나에게 ...|[돌이, 보면, 같은, 사람, ...|
|      Love Love Love|선비(SunBee..|Are you there my ...|[대체, 하느, 늦는, 건지, ...|
|Butterfly (Prod. ...|  LambC(램씨)|There is a time t...|                  []|
|            GOOD BAD|       입술세개|24 군대 막 다녀와서 널 만났...|[군대, 다녀와, 만났, 작은,...|
|             너무 원했기에|        천소아|아직 이해가 안 돼소나기처럼 내...|[아직, 이해, 소나기, 내게,...|
|               이별중이야|데이스타(Days..|욕심이라고 몇 번을 말해야 받아...|[욕심, 말해야, 받아들일, 생...|
|                Time|      V.O.S|감추려 숨기려 지우려고 아무리 ...|[감추려, 숨기, 지우려, 애써...|
|우유부단 (Prod. by K ...|조현아(어반자카파..|두려워 지금 너의 눈빛이물론 짐...|[두려워, 지금, 눈빛, 물론,...|
|  기억이 난다 (Feat. 강슬기)|장한종과 J. F..|아련한 안개길 가운데 서니앞에 ...|[아련한, 개길, 가운데, 서니...|
|                  그냥|    노래안하는사람|알았지만 애써 외면했어인정하면 ...|[애써, 외면했, 인정하면, 끝...|

In [13]:
# from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [14]:
# hashingTF = HashingTF(inputCol="tokens", outputCol="tf")
# tfLyric = hashingTF.transform(tokensDF)

In [15]:
# idf = IDF(inputCol="tf", outputCol="tfidf")
# idfModel = idf.fit(tfLyric)
# tfidfLyric = idfModel.transform(tfLyric)

In [16]:
# tfidfLyric.select("title", "tfidf").show(25)

In [17]:
from pyspark.ml.feature import CountVectorizer
# Learn the vocabulary from the token lists; the fitted model will write term
# counts to the "tf" column. The cell's displayed value is the vocabulary size.
count_estimator = CountVectorizer(inputCol="tokens", outputCol="tf")
vectorizer = count_estimator.fit(tokensDF)
voca = vectorizer.vocabulary
len(voca)

49525

In [19]:
# Convert each row's token list into a term-count vector with the fitted
# CountVectorizer model.
vetorizedDF = vectorizer.transform(tokensDF)

In [21]:
# IDF has to be imported in this cell: the only other import of it in this
# notebook is commented out, so a fresh kernel would raise NameError here.
from pyspark.ml.feature import IDF

# Fit inverse-document-frequency weights on the "tf" term-count column and
# append the reweighted vectors as "tfidf".
idf = IDF(inputCol="tf", outputCol="tfidf")
idfModel = idf.fit(vetorizedDF)
tfidfLyric = idfModel.transform(vetorizedDF)

In [None]:
# from pyspark.sql.types import ArrayType, StringType
# from pyspark.sql.functions import udf

# def indices_to_terms(vocabulary):
#     def indices_to_terms(xs):
#         return [vocabulary[int(x)] for x in xs]
#     return udf(indices_to_terms, ArrayType(StringType()))

# ldaModel = ldaModel.withColumn(
#     "topics_words", indices_to_terms(voca)("termIndices")).cache()

In [22]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

# Normalize each "tfidf" vector (Normalizer left at its default settings) and
# store the result in "features", the column name used for clustering below.
normalizer = Normalizer(inputCol="tfidf", outputCol="features")
normLyrics = normalizer.transform(tfidfLyric)

# from pyspark.ml.clustering import KMeans

# kmeans = KMeans().setK(15).setMaxIter(20)
# km_model = kmeans.fit(normLyrics)

# clustersTable = km_model.transform(normLyrics)

In [None]:
from pyspark.ml.clustering import KMeans

# Elbow search: fit K-Means once per candidate k and record the cost of each
# fitted model on the same data, so the costs can be compared across k.
K_CANDIDATES = range(2, 30)
KMEANS_MAX_ITER = 20
# Fixed seed so the cost curve is reproducible between runs; the value matches
# the seed used by the LDA cell in this notebook.
KMEANS_SEED = 1

cost = []

for k in K_CANDIDATES:
    kmeans = KMeans().setK(k).setMaxIter(KMEANS_MAX_ITER).setSeed(KMEANS_SEED)
    km_model = kmeans.fit(normLyrics)
    # NOTE: computeCost is deprecated since Spark 2.4 and removed in Spark 3.0;
    # on newer Spark, switch to km_model.summary.trainingCost or ClusteringEvaluator.
    cost.append(km_model.computeCost(normLyrics))

In [16]:
# Number of rows assigned to each predicted cluster id.
# NOTE(review): clustersTable is only assigned in the last cell of this
# notebook (the assignment in the earlier normalizer cell is commented out),
# so this cell fails on a fresh top-to-bottom run — confirm the intended order.
clustersTable.groupBy("prediction").count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|        12|    2|
|         1|    3|
|        13|  749|
|         6|    4|
|         3| 2268|
|         5|    2|
|         9| 2861|
|         4|    2|
|         8|    2|
|         7|    1|
|        10|    2|
|        11|13736|
|        14|    2|
|         2|    1|
|         0|    2|
+----------+-----+



In [26]:
# Print the "tfidf" column on its own, ordered by that column.
clustersTable.select("tfidf").orderBy("tfidf").show()

+--------------------+
|               tfidf|
+--------------------+
|(30000,[1,101,167...|
|(30000,[1,101,167...|
|(30000,[1,167,265...|
|(30000,[2,76,307,...|
|(30000,[2,167,400...|
|(30000,[3,15,126,...|
|(30000,[3,15,167,...|
|(30000,[3,25,167,...|
|(30000,[3,36,110,...|
|(30000,[3,54,73,1...|
|(30000,[3,73,167,...|
|(30000,[3,101,133...|
|(30000,[3,126,167...|
|(30000,[3,144,167...|
|(30000,[3,167,180...|
|(30000,[3,423,465...|
|(30000,[4,69,110,...|
|(30000,[4,69,110,...|
|(30000,[4,84,167,...|
|(30000,[4,110,118...|
+--------------------+
only showing top 20 rows



In [12]:
from pyspark.ml.feature import CountVectorizer

In [13]:
# Refit the CountVectorizer, this time writing the term counts to "features".
# This rebinds `vectorizer`, which an earlier cell fitted with outputCol="tf".
vectorizer = CountVectorizer(inputCol="tokens", outputCol="features").fit(tokensDF)
# vectorizer.vocabulary

In [14]:
# Size of the vocabulary learned by the fitted CountVectorizer model.
len(vectorizer.vocabulary)

49525

In [15]:
# Term-count vectors for every lyric, produced by the refitted vectorizer.
# This rebinds `vetorizedDF` from the earlier "tf" variant.
vetorizedDF = vectorizer.transform(tokensDF)

In [16]:
# Inspect the vectorized DataFrame, including the new "features" column.
vetorizedDF.show()

+--------------------+-----------+--------------------+--------------------+--------------------+
|               title|     artist|               lyric|              tokens|            features|
+--------------------+-----------+--------------------+--------------------+--------------------+
|             그때 또 다시|         유노|돌이켜보면 너같은 사람 나에게 ...|[돌이, 보면, 같은, 사람, ...|(49525,[0,3,8,10,...|
|      Love Love Love|선비(SunBee..|Are you there my ...|[대체, 하느, 늦는, 건지, ...|(49525,[5,8,12,15...|
|Butterfly (Prod. ...|  LambC(램씨)|There is a time t...|                  []|       (49525,[],[])|
|            GOOD BAD|       입술세개|24 군대 막 다녀와서 널 만났...|[군대, 다녀와, 만났, 작은,...|(49525,[4,6,9,10,...|
|             너무 원했기에|        천소아|아직 이해가 안 돼소나기처럼 내...|[아직, 이해, 소나기, 내게,...|(49525,[0,3,4,12,...|
|               이별중이야|데이스타(Days..|욕심이라고 몇 번을 말해야 받아...|[욕심, 말해야, 받아들일, 생...|(49525,[1,2,4,6,1...|
|                Time|      V.O.S|감추려 숨기려 지우려고 아무리 ...|[감추려, 숨기, 지우려, 애써...|(49525,[4,5,9,10,...|
|우유부단 (Prod. by K ..

In [17]:
from pyspark.ml.clustering import LDA

In [36]:
# LDA estimator: 5 topics, fixed seed for repeatable results, EM optimizer.
lda = LDA(k=5, seed=1, optimizer="em")

In [37]:
# Fit LDA on the vectorized lyrics. Despite the "DF" suffix, the result is a
# fitted model: later cells call isDistributed(), vocabSize() and
# describeTopics() on it.
ldaDF = lda.fit(vetorizedDF)

In [38]:
# Query whether the fitted LDA model is a distributed model.
ldaDF.isDistributed()

True

In [39]:
# Vocabulary size as seen by the fitted LDA model.
ldaDF.vocabSize()

49525

In [40]:
# Keep the vocabulary list so term indices can be mapped back to words.
voca = vectorizer.vocabulary

In [41]:
# Per-topic summary, cached for reuse. Despite its name, `ldaModel` holds the
# DataFrame returned by describeTopics() (columns shown in the stored output:
# topic, termIndices, termWeights), not a model.
ldaModel = ldaDF.describeTopics().cache()

In [42]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf

def indices_to_terms(vocabulary):
    """Build a Spark UDF that translates term indices into vocabulary words.

    Args:
        vocabulary: Indexable collection of terms; the entry at position i is
            returned for term index i.

    Returns:
        A UDF declared to return an array of strings, to be applied to a
        column holding sequences of term indices.
    """
    def indices_to_terms(xs):
        # Look up each index in the captured vocabulary, coercing it to int first.
        words = []
        for position in xs:
            words.append(vocabulary[int(position)])
        return words

    string_array_type = ArrayType(StringType())
    return udf(indices_to_terms, string_array_type)

In [43]:
# Add a "topics_words" column that maps every topic's termIndices to the
# corresponding vocabulary words, and cache the extended DataFrame.
ldaModel = ldaModel.withColumn(
    "topics_words", indices_to_terms(voca)("termIndices")).cache()

In [44]:
# Show up to 30 topics, sorted in descending order of their word lists.
ldaModel.orderBy(ldaModel.topics_words.desc()).show(30)

+-----+--------------------+--------------------+--------------------+
|topic|         termIndices|         termWeights|        topics_words|
+-----+--------------------+--------------------+--------------------+
|    1|[0, 2, 6, 7, 3, 5...|[0.00824126830050...|[사랑, 우리, 오늘, 없는, ...|
|    4|[0, 2, 1, 3, 6, 5...|[0.01390785978933...|[사랑, 우리, 그대, 마음, ...|
|    0|[1, 0, 2, 3, 5, 1...|[0.01218235255123...|[그대, 사랑, 우리, 마음, ...|
|    2|[1, 0, 4, 2, 3, 7...|[0.01109427689030...|[그대, 사랑, 시간, 우리, ...|
|    3|[1, 0, 3, 2, 4, 1...|[0.01278534852259...|[그대, 사랑, 마음, 우리, ...|
+-----+--------------------+--------------------+--------------------+



In [46]:
# IDF has to be imported in this cell: the only other import of it in this
# notebook is commented out, so a fresh kernel would raise NameError here.
from pyspark.ml.feature import IDF

# Fit inverse-document-frequency weights on the "features" term-count column
# and append the reweighted vectors as "tfidf".
idf = IDF(inputCol="features", outputCol="tfidf")
idfModel = idf.fit(vetorizedDF)
tfidfLyric = idfModel.transform(vetorizedDF)

In [50]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

# Normalize each "tfidf" vector and store the result in the "norm" column.
normalizer = Normalizer(inputCol="tfidf", outputCol="norm")
normLyrics = normalizer.transform(tfidfLyric)

from pyspark.ml.clustering import KMeans

# Trains a KMeans model on the normalized TF-IDF vectors.
# featuresCol must be set explicitly: KMeans reads the "features" column by
# default, and in this DataFrame that column holds the raw CountVectorizer
# term counts, so without it the "norm" column computed above would be ignored.
# The seed is fixed so the cluster assignment is reproducible between runs.
kmeans = (
    KMeans()
    .setK(10)
    .setMaxIter(20)
    .setFeaturesCol("norm")
    .setSeed(1)
)
km_model = kmeans.fit(normLyrics)

# Append the predicted cluster id of every lyric.
clustersTable = km_model.transform(normLyrics)

KeyboardInterrupt: 