### Reading text data

In [3]:
from pyspark.sql import SparkSession  # Spark SQL 작업을 위한 SparkSession 임포트
from pyspark.sql.functions import  # Spark SQL 함수들 임포트 *

spark = (SparkSession.builder  # SparkSession 빌더 패턴 시작
         .appName("text-processing")  # 애플리케이션 이름 설정
         .master("spark://spark-master:7077")  # Spark 마스터 URL 설정
         .config("spark.executor.memory", "512m")  # Spark 설정 옵션
         .getOrCreate()  # SparkSession 생성 또는 기존 세션 반환)

spark.sparkContext.setLogLevel("ERROR")  # 로그 레벨을 ERROR로 설정

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/19 12:11:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/19 12:11:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
df = (spark.read.format("csv")  # CSV 형식으로 데이터 읽기
      .option("header",True)
      .load(  # 파일 로드"../data/Reviews.csv"))

                                                                                

In [5]:
df.printSchema()  # DataFrame 스키마 구조 출력

root
 |-- Id: string (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessNumerator: string (nullable = true)
 |-- HelpfulnessDenominator: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



In [6]:
df.show(10, truncate=False)

+------+----------+--------------+----------------------------------+--------------------+----------------------+-----+----------+---------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# 적용 regular expression to remove all non-alphabetic characters
df_clean = (df
            .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"Text", regexp_replace("Text", "[^a-zA-Z ]", ""))
            .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"Text", regexp_replace("Text", "  +", " ")))

df_clean.show()  # DataFrame 내용 출력

[Stage 2:>                                                          (0 + 1) / 1]

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
|250093|B0029NII3C|A3P8CU9874SRK5|        C. christine|                   0|                     0|    2|1316649600|Unwanted Ingredients|I was interested ...|
|250115|B0013MEB40| A99TG4Q2ZPW7S|"Blu-estLight ""M...|                   0|                     0|    4|1310083200|one of my favorit...|These cookies are...|
|250132|B005UBH8WC| AY12DBB0U420B|       Gary Peterson|                   0|                     0|    5|1336348800|A Favorite of My ...|Ive been using Ic...|
|250137|B001EQ57QG| A9X62UCTFNQBE|            

                                                                                

In [8]:
df_with_words = (df_clean.withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"words", split(df_clean.Text, "\\s+")))
df_with_words.show()  # DataFrame 내용 출력

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|               words|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+
|250093|B0029NII3C|A3P8CU9874SRK5|        C. christine|                   0|                     0|    2|1316649600|Unwanted Ingredients|I was interested ...|[I, was, interest...|
|250115|B0013MEB40| A99TG4Q2ZPW7S|"Blu-estLight ""M...|                   0|                     0|    4|1310083200|one of my favorit...|These cookies are...|[These, cookies, ...|
|250132|B005UBH8WC| AY12DBB0U420B|       Gary Peterson|                   0|                     0| 

In [9]:
from pyspark.ml.feature import Tokenizer

# Tokenize the text data
tokenizer = Tokenizer(inputCol='Text', outputCol='words')
df_with_words = tokenizer.transform(  # 배열 변환df_clean)
df_with_words.show()  # DataFrame 내용 출력

[Stage 4:>                                                          (0 + 1) / 1]

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|               words|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+
|250093|B0029NII3C|A3P8CU9874SRK5|        C. christine|                   0|                     0|    2|1316649600|Unwanted Ingredients|I was interested ...|[i, was, interest...|
|250115|B0013MEB40| A99TG4Q2ZPW7S|"Blu-estLight ""M...|                   0|                     0|    4|1310083200|one of my favorit...|These cookies are...|[these, cookies, ...|
|250132|B005UBH8WC| AY12DBB0U420B|       Gary Peterson|                   0|                     0| 

                                                                                

In [10]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df_stop_words_removed = remover.transform(  # 배열 변환df_with_words)

df_stop_words_removed.show()  # DataFrame 내용 출력

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|               words|      filtered_words|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|250093|B0029NII3C|A3P8CU9874SRK5|        C. christine|                   0|                     0|    2|1316649600|Unwanted Ingredients|I was interested ...|[i, was, interest...|[interested, food...|
|250115|B0013MEB40| A99TG4Q2ZPW7S|"Blu-estLight ""M...|                   0|                     0|    4|1310083200|one of my favorit...|These cookies are...|[these, cookies, ...|[cookies, yummy, 

In [11]:
df_exploded = (df_stop_words_removed
               .select(  # 컬럼 선택explode(df_stop_words_removed.filtered_words).alias("word")))
word_count = (df_exploded
              .groupBy(  # 그룹화"word")
              .count()  # 행 개수 계산
              .orderBy(  # 정렬"count", ascending=False))
word_count.show(n=100)

[Stage 8:>                                                          (0 + 1) / 1]

+---------+-----+
|     word|count|
+---------+-----+
|     like|11014|
|       br|10705|
|     good| 8717|
|    great| 7475|
|      one| 7471|
|    taste| 7435|
|   coffee| 7257|
|  product| 6593|
|   flavor| 6468|
|     love| 5968|
|      tea| 5947|
|     food| 5514|
|      get| 4675|
|   really| 4488|
|     much| 4109|
|     dont| 4054|
|     also| 3757|
|     time| 3716|
|   little| 3710|
|      use| 3695|
|   amazon| 3626|
|    tried| 3483|
|     best| 3434|
|      buy| 3381|
|     find| 3338|
|    price| 3324|
|      ive| 3317|
|       im| 3267|
|     even| 3191|
|     make| 3163|
|     well| 3113|
|      try| 2991|
|      dog| 2990|
|   better| 2985|
|      eat| 2899|
|    first| 2662|
|chocolate| 2522|
|    water| 2495|
|    found| 2492|
|     used| 2437|
|      bag| 2399|
|   bought| 2352|
|    sweet| 2282|
|      cup| 2236|
|    drink| 2212|
|     made| 2197|
|    sugar| 2155|
|      two| 2152|
|      box| 2139|
|    think| 2077|
|   tastes| 2053|
|      way| 2047|
|    since

                                                                                

In [12]:
from pyspark.ml.feature import CountVectorizer

# Convert the text data into numerical features
vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
vectorized_data = vectorizer.fit(df_stop_words_removed).transform(  # 배열 변환df_stop_words_removed)
vectorized_data.show(10, truncate=False)

                                                                                

+------+----------+--------------+----------------------------------+--------------------+----------------------+-----+----------+---------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
(vectorized_data.repartition(  # 파티션 재분배1)
 .write.mode("overwrite")  # 기존 데이터 덮어쓰기
 .json("../data/data_lake/reviews_vectorized.json"))

                                                                                

### Using the `regexp_extract()` function

In [14]:
from pyspark.sql.functions import  # Spark SQL 함수들 임포트 regexp_extract

# Extract all words starting with "q"
df_q_words = (vectorized_data
              .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"q_words", regexp_extract("text", "\\\\bq\\\\w*", 0)))
df_q_words.show()  # DataFrame 내용 출력

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|               words|      filtered_words|            features|q_words|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+
|250093|B0029NII3C|A3P8CU9874SRK5|        C. christine|                   0|                     0|    2|1316649600|Unwanted Ingredients|I was interested ...|[i, was, interest...|[interested, food...|(40697,[1,2,11,12...|       |
|250115|B0013MEB40| A99TG4Q2ZPW7S|"Blu-estLight ""M...|                   0|    

### Using the `rlike()` function

In [15]:
# Check if text data contains the word "good"
df_good_word = (vectorized_data
              .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"contains_qood", expr("text rlike 'quick'")))
df_good_word.show()  # DataFrame 내용 출력

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|               words|      filtered_words|            features|contains_qood|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|250093|B0029NII3C|A3P8CU9874SRK5|        C. christine|                   0|                     0|    2|1316649600|Unwanted Ingredients|I was interested ...|[i, was, interest...|[interested, food...|(40697,[1,2,11,12...|        false|
|250115|B0013MEB40| A99TG4Q2ZPW7S|"Blu-estLight ""M...| 

### Customizing Stop Words

In [16]:
custom_stopwords = ["/><br", "-", "/>I","/>The"]
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=custom_stopwords)

df_stop_words_removed = stopwords_remover.transform(  # 배열 변환df_with_words)

df_stop_words_removed.show()  # DataFrame 내용 출력

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|               words|      filtered_words|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|250093|B0029NII3C|A3P8CU9874SRK5|        C. christine|                   0|                     0|    2|1316649600|Unwanted Ingredients|I was interested ...|[i, was, interest...|[i, was, interest...|
|250115|B0013MEB40| A99TG4Q2ZPW7S|"Blu-estLight ""M...|                   0|                     0|    4|1310083200|one of my favorit...|These cookies are...|[these, cookies, ...|[these, cookies, 

In [17]:
custom_stopwords = ["/><br", "-", "/>I","/>The"]

stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=custom_stopwords)
stopwords_remover.setStopWords(custom_stopwords)

df_stop_words_removed = stopwords_remover.transform(  # 배열 변환df_with_words)

df_stop_words_removed.show()  # DataFrame 내용 출력

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|               words|      filtered_words|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|250093|B0029NII3C|A3P8CU9874SRK5|        C. christine|                   0|                     0|    2|1316649600|Unwanted Ingredients|I was interested ...|[i, was, interest...|[i, was, interest...|
|250115|B0013MEB40| A99TG4Q2ZPW7S|"Blu-estLight ""M...|                   0|                     0|    4|1310083200|one of my favorit...|These cookies are...|[these, cookies, ...|[these, cookies, 

In [18]:
spark.stop()  # Spark 세션 종료 - 리소스 정리