In [1]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below: getOrCreate() returns the
# already-active session if there is one, otherwise it builds a new one.
spark = SparkSession.builder.getOrCreate()

In [14]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql import Window

In [82]:
def clear_review(df):
    """Clean up negation tokens in the ``Review`` column and tokenise it.

    The substitutions are applied strictly in the order listed, each one
    operating on the result of the previous one. Afterwards the text is split
    on single spaces, so ``Review`` becomes an array-of-strings column.

    Args:
        df: DataFrame with a string column named ``Review``.

    Returns:
        A DataFrame with the same columns, where ``Review`` holds the list of
        tokens of the cleaned text. Consecutive spaces yield empty tokens.
    """
    # Order matters. The first three rules glue a detached "n't" back onto its
    # verb; the last rule then removes every "n't" that is left in the text.
    # NOTE(review): the last rule also strips the "n't" that was just glued on
    # ("did n't" -> "didn't" -> "did", "wo n't" -> "wo") - confirm intended.
    substitutions = (
        ("did n't", "didn't"),
        ("wo n't", "won't"),
        ("ca n't", "can't"),
        ("n't", ""),
    )
    for pattern, replacement in substitutions:
        df = df.withColumn('Review', F.regexp_replace('Review', pattern, replacement))

    return df.withColumn('Review', F.split('Review', ' '))

def count_term_freq(df: DataFrame):
    """Count how often each word occurs in each review.

    Every input row is treated as one document and is tagged with a generated
    ``doc_id``. The token array in ``Review`` is then flattened to one row per
    token, empty tokens are discarded, and occurrences are counted.

    Args:
        df: DataFrame whose ``Review`` column is an array of tokens.

    Returns:
        A DataFrame with columns ``doc_id``, ``word`` and ``tf``, one row per
        distinct (document, word) pair; ``tf`` is the number of occurrences.
    """
    # Generated ids are unique per row but not guaranteed to be consecutive.
    with_ids = df.withColumn('doc_id', F.monotonically_increasing_id())

    # One output row per token of every review.
    tokens = with_ids.select('doc_id', F.explode('Review').alias('word'))

    # Drop zero-length tokens.
    non_empty = tokens.filter(F.length('word') > 0)

    return non_empty.groupBy('doc_id', 'word').agg(F.count('word').alias('tf'))

def count_doc_freq(df: DataFrame, top_n: int = 100):
    """Compute per-word document frequency and keep the most frequent words.

    Args:
        df: DataFrame with ``doc_id`` and ``word`` columns. It is expected to
            contain one row per distinct (doc_id, word) pair, which is what
            ``count_term_freq`` produces; under that condition the row count
            per word equals the number of documents containing the word.
        top_n: How many of the highest-frequency words to keep. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        A DataFrame with columns ``word`` and ``df`` holding at most ``top_n``
        rows, ordered by ``df`` descending and then by ``word`` ascending.
    """
    doc_freq = df.groupBy('word').agg(F.count('doc_id').alias('df'))

    # The secondary sort key makes the cut-off reproducible: without it, which
    # of several words tied on 'df' survive the limit is unspecified.
    ranked = doc_freq.orderBy(F.col('df').desc(), F.col('word').asc())

    return ranked.limit(top_n)
    
def count_tfidf(df: DataFrame):
    """Build a document-by-word weight matrix for the most frequent words.

    Steps:
      1. ``count_term_freq`` yields (doc_id, word, tf).
      2. ``count_doc_freq`` yields (word, df) for the retained top words.
      3. The two are joined on ``word``; words absent from the document
         frequency table are dropped by the join.
      4. Each remaining pair is weighted as ``tf / df``.
      5. The result is pivoted to one row per ``doc_id`` and one column per
         word.

    Note that the weight uses ``1 / df`` as the inverse document frequency;
    no logarithm or total document count is involved.

    Args:
        df: DataFrame whose ``Review`` column is an array of tokens.

    Returns:
        A wide DataFrame with a ``doc_id`` column plus one column per retained
        word. A cell is null when the document has no row for that word.
    """
    tf_df = count_term_freq(df)
    top_words_df = count_doc_freq(tf_df)

    scored = (
        tf_df
        .join(top_words_df, 'word')
        .withColumn('tfidf', F.col('tf') / F.col('df'))
    )

    return scored.groupBy('doc_id').pivot('word').sum('tfidf')
    

In [83]:
# Flip to True to run the whole pipeline on only the first 10 reviews.
TEST_MOD = False

# Read the reviews CSV, treating its first line as the header row.
review_df = (
    spark.read
    .option('header', 'true')
    .csv('tripadvisor_hotel_reviews.csv')
)
if TEST_MOD:
    review_df = review_df.limit(10)

# Tokenise the reviews, then turn them into the per-document weight matrix.
review_df = count_tfidf(clear_review(review_df))

In [84]:
# The pivoted table has one column per word, so print each record vertically
# (one "column | value" line per field) instead of as a very wide row.
review_df.show(vertical=True)

-RECORD 0---------------------------
 doc_id      | 474                  
 2           | null                 
 3           | null                 
 4           | null                 
 5           | null                 
 area        | null                 
 arrived     | null                 
 away        | null                 
 bar         | null                 
 bathroom    | null                 
 beach       | null                 
 beautiful   | null                 
 bed         | null                 
 beds        | null                 
 best        | null                 
 better      | null                 
 big         | null                 
 bit         | null                 
 booked      | 3.266906239790918E-4 
 breakfast   | 1.508523155830442E-4 
 buffet      | null                 
 check       | null                 
 city        | null                 
 clean       | 1.558603491271820... 
 close       | null                 
 comfortable | 3.388681802778719... 
 