In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, lower, regexp_replace

# cleaned_df = example_df.replace({'User Info Error': None}, subset=['Status'])

class PostsDataCleaning():
    """Cleaning steps for the posts ("pin") Spark DataFrame.

    Each public method takes a DataFrame and returns the DataFrame produced
    by the transformation; the steps are meant to be chained by the caller.
    """

    # Column name -> lower-cased placeholder text that marks a missing value
    # in that column.
    _PLACEHOLDERS = {
        'description': 'no description available story format',
        'title': 'no title data available',
        'tag_list': 'n,o, ,t,a,g,s, ,a,v,a,i,l,a,b,l,e',
        'follower_count': 'user info error',
        'poster_name': 'user info error',
        'image_src': 'image src error.',
    }

    # Column order produced by reorder_columns().
    _COLUMN_ORDER = (
        "ind",
        "unique_id",
        "title",
        "description",
        "follower_count",
        "poster_name",
        "tag_list",
        "is_image_or_video",
        "image_src",
        "save_location",
        "category",
    )

    def __init__(self):
        print("PostData Cleaning init")

    def clean_pin_data_replace_empty_with_none(self, df):
        """Lower-case the placeholder-bearing columns and null out placeholders.

        For every column listed in ``_PLACEHOLDERS`` the value is converted
        to lower case and, when it then equals that column's placeholder
        text, replaced with None.

        NOTE(review): the lower-casing is kept from the original
        implementation and also rewrites real values (titles, poster names,
        ``image_src``) -- confirm that this is intended.

        Args:
            df: DataFrame containing every column named in ``_PLACEHOLDERS``.

        Returns:
            The DataFrame with those columns cleaned.
        """
        for column_name, placeholder in self._PLACEHOLDERS.items():
            lowered = lower(df[column_name])
            df = df.withColumn(
                column_name,
                when(lowered == placeholder, None).otherwise(lowered),
            )
        return df

    def transform_foller_count(self, df):
        """Expand k/m suffixes in ``follower_count`` and cast it to int.

        Every "k" is replaced by "000" and every "m" by "000000" (either
        letter case, so the method does not rely on the column having been
        lower-cased first), then the column is cast to ``int``.

        NOTE(review): this is a purely textual substitution, so a value with
        a decimal part such as "1.5k" becomes "1.5000" and is not scaled
        correctly -- confirm the input never contains decimals.

        Args:
            df: DataFrame with a ``follower_count`` column.

        Returns:
            The DataFrame with ``follower_count`` as an int column.
        """
        expanded = regexp_replace("follower_count", "[kK]", "000")
        expanded = regexp_replace(expanded, "[mM]", "000000")
        return df.withColumn("follower_count", expanded.cast("int"))

    # Correctly spelt alias; the original (misspelt) name is kept so existing
    # callers continue to work.
    transform_follower_count = transform_foller_count

    def number_col_type_to_int(self, df):
        """Cast the ``downloaded`` and ``index`` columns to int.

        Args:
            df: DataFrame with ``downloaded`` and ``index`` columns.

        Returns:
            The DataFrame with both columns cast to ``int``.
        """
        for column_name in ("downloaded", "index"):
            df = df.withColumn(column_name, df[column_name].cast("int"))
        return df

    def clean_save_path(self, df):
        """Remove the "Local save in " text from ``save_location``.

        Args:
            df: DataFrame with a ``save_location`` column.

        Returns:
            The DataFrame with the text removed from ``save_location``.
        """
        return df.withColumn(
            "save_location",
            regexp_replace("save_location", "Local save in ", ""),
        )

    def rename_index_to_ind(self, df):
        """Rename the ``index`` column to ``ind``."""
        return df.withColumnRenamed("index", "ind")

    def reorder_columns(self, df):
        """Select the columns listed in ``_COLUMN_ORDER``, in that order.

        Columns not listed there are not part of the returned DataFrame.
        """
        return df.select(*self._COLUMN_ORDER)


# Replace empty entries and entries with no relevant data in each column with None.
data_cleaning_instance = PostsDataCleaning()
cleaned_data = data_cleaning_instance.clean_pin_data_replace_empty_with_none(pin_df)
# Transform follower_count so that every entry is a number and the column type is int.
cleaned_data_appended_follower_count = data_cleaning_instance.transform_foller_count(cleaned_data)
# Ensure that each column containing numeric data has a numeric data type.
# (Previously this step called transform_foller_count a second time, so the
# downloaded and index columns were never cast.)
cleaned_data_cols_to_int = data_cleaning_instance.number_col_type_to_int(cleaned_data_appended_follower_count)
# Clean the data in the save_location column to include only the save location path.
cleaned_data_append_save_path = data_cleaning_instance.clean_save_path(cleaned_data_cols_to_int)
# Rename the index column to ind.
cleaned_data_renamed_index = data_cleaning_instance.rename_index_to_ind(cleaned_data_append_save_path)
# Reorder the DataFrame columns to: ind, unique_id, title, description, follower_count,
# poster_name, tag_list, is_image_or_video, image_src, save_location, category.
cleaned_data_rordered_columns = data_cleaning_instance.reorder_columns(cleaned_data_renamed_index)


cleaned_pin_data = cleaned_data_rordered_columns

display(cleaned_pin_data)
# display(cleaned_data_cols_to_int.dtypes)
