In [9]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [10]:
# Start (or reuse) the Spark session for this notebook under the app name 'lyrics'.
spark = (
    SparkSession.builder
    .appName('lyrics')
    .getOrCreate()
)

In [11]:
# Load lyrics.csv through Spark's CSV reader, using the first row as column names.
# NOTE(review): some rows in the preview below begin with a literal double quote
# (e.g. `"[Chorus]`), which may mean quoted fields are not parsed as intended --
# confirm whether the reader's quote/escape options need to be set for this file.
dataframe = (
    spark.read
    .format('csv')
    .option("header", "true")
    .load("lyrics.csv")
)

# Preview the loaded rows (show() prints the top 20 by default).
dataframe.show()

+--------------------+
|                Text|
+--------------------+
|Would you be my G...|
|Bye bye bye   I'm...|
|Baby, you're not ...|
|Thank God that it...|
|It's gonna--be--m...|
|It's gonna--be--m...|
|There's a thousan...|
|I was hanging wit...|
|Oh, oh   When the...|
|Sick and tired of...|
|My my   At Waterl...|
|I wasn't jealous ...|
|I wasn't jealous ...|
|"[Chorus]   Mamma...|
|"[Chorus]   Mamma...|
|If you change you...|
|If you change you...|
|Can you hear the ...|
|You're so hot, te...|
|You can dance, yo...|
+--------------------+
only showing top 20 rows



In [13]:
# Build the tokenizer stage: it reads the "Text" column and writes tokens to "Words".
# Despite its name, `lyrics_data` holds the Tokenizer itself, not a DataFrame.
# NOTE(review): the transformed preview shows empty-string tokens and punctuation
# still attached to words (e.g. `baby,`); if that is unintended, a regex-based
# tokenizer may be a better fit -- confirm against the analysis goals.
lyrics_data = Tokenizer(
    inputCol="Text",
    outputCol="Words",
)

In [14]:
# Apply the tokenizer to the loaded lyrics; the result keeps "Text" and adds "Words".
lyrics_text = lyrics_data.transform(dataframe)

# Preview the tokenized rows.
lyrics_text.show()

+--------------------+--------------------+
|                Text|               Words|
+--------------------+--------------------+
|Would you be my G...|[would, you, be, ...|
|Bye bye bye   I'm...|[bye, bye, bye, ,...|
|Baby, you're not ...|[baby,, you're, n...|
|Thank God that it...|[thank, god, that...|
|It's gonna--be--m...|[it's, gonna--be-...|
|It's gonna--be--m...|[it's, gonna--be-...|
|There's a thousan...|[there's, a, thou...|
|I was hanging wit...|[i, was, hanging,...|
|Oh, oh   When the...|[oh,, oh, , , whe...|
|Sick and tired of...|[sick, and, tired...|
|My my   At Waterl...|[my, my, , , at, ...|
|I wasn't jealous ...|[i, wasn't, jealo...|
|I wasn't jealous ...|[i, wasn't, jealo...|
|"[Chorus]   Mamma...|["[chorus], , , m...|
|"[Chorus]   Mamma...|["[chorus], , , m...|
|If you change you...|[if, you, change,...|
|If you change you...|[if, you, change,...|
|Can you hear the ...|[can, you, hear, ...|
|You're so hot, te...|[you're, so, hot,...|
|You can dance, yo...|[you, can,

In [15]:
# Build the stop-word removal stage: it reads "Words" and writes the result to "Filtered".
# This only constructs the stage; it is applied to the data in the next cell.
remover = StopWordsRemover(
    inputCol="Words",
    outputCol="Filtered",
)

In [16]:
# Apply stop-word removal to the tokenized lyrics; the result adds a "Filtered" column.
newFrame = remover.transform(lyrics_text)

# Preview the rows with Text, Words and Filtered side by side.
newFrame.show()

+--------------------+--------------------+--------------------+
|                Text|               Words|            Filtered|
+--------------------+--------------------+--------------------+
|Would you be my G...|[would, you, be, ...|[girlfriend?, [re...|
|Bye bye bye   I'm...|[bye, bye, bye, ,...|[bye, bye, bye, ,...|
|Baby, you're not ...|[baby,, you're, n...|[baby,, one, , , ...|
|Thank God that it...|[thank, god, that...|[thank, god, frid...|
|It's gonna--be--m...|[it's, gonna--be-...|[gonna--be--me, ,...|
|It's gonna--be--m...|[it's, gonna--be-...|[gonna--be--me, ,...|
|There's a thousan...|[there's, a, thou...|[thousand, words,...|
|I was hanging wit...|[i, was, hanging,...|[hanging, fellas,...|
|Oh, oh   When the...|[oh,, oh, , , whe...|[oh,, oh, , , vis...|
|Sick and tired of...|[sick, and, tired...|[sick, tired, hea...|
|My my   At Waterl...|[my, my, , , at, ...|[, , waterloo, na...|
|I wasn't jealous ...|[i, wasn't, jealo...|[jealous, met, , ...|
|I wasn't jealous ...|[i,

In [22]:
import pandas as pd

# Collect the filtered Spark DataFrame into a local pandas DataFrame.
# This must happen while the Spark session is still alive: toPandas() runs a
# Spark job, so calling it after spark.stop() fails on a fresh Restart & Run All.
lyrics_final_df = newFrame.toPandas()

# Everything below works on the in-memory pandas copy, so the Spark session
# can be released now.
spark.stop()

In [20]:
# Display the first five rows of the collected pandas frame as the cell output.
lyrics_final_df.head(5)

Unnamed: 0,Text,Words,Filtered
0,Would you be my Girlfriend? [Repeat: x3] I l...,"[would, you, be, my, girlfriend?, [repeat:, x3...","[girlfriend?, [repeat:, x3], , , like, you,, r..."
1,Bye bye bye I'm doing this tonight You're ...,"[bye, bye, bye, , , i'm, doing, this, tonight,...","[bye, bye, bye, , , tonight, , , probably, gon..."
2,"Baby, you're not the only one I see the thin...","[baby,, you're, not, the, only, one, , , i, se...","[baby,, one, , , see, things, , , pain, puts, ..."
3,Thank God that it's Friday night and I Just-...,"[thank, god, that, it's, friday, night, and, i...","[thank, god, friday, night, , , just-just-just..."
4,"It's gonna--be--me Oh, yeah You might b...","[it's, gonna--be--me, , , oh,, yeah, , , , , ,...","[gonna--be--me, , , oh,, yeah, , , , , , might..."


In [21]:
# Write the collected lyrics frame to lyrics_filtered.csv as UTF-8, omitting the
# pandas index column.
# DataFrame.to_csv returns None when given a file path, so the result is not
# bound to a name (the previous `lyrics_filtered_df` was always None).
lyrics_final_df.to_csv("lyrics_filtered.csv", encoding="utf-8", index=False)