---
title: "Data Cleaning"
format:
    html: 
        code-fold: false
---

<!-- After digesting the instructions, you can delete this cell; these are assignment instructions and do not need to be included in your final submission. -->

{{< include cleaning.qmd >}} 

# Code

In [None]:
# NOTE: Run this cell in the same .ipynb notebook as the data-collection code; it relies on `json` and `pd` (pandas) already being imported there.

# Read the json data
def convert_reddit_data(json_input_path, csv_output_path):
    """Flatten scraped Reddit submissions and their comments into one CSV table.

    The JSON file must contain a list of submission objects. Each submission
    is read through the keys ``subreddit``, ``post_id``, ``sort_type``,
    ``score``, ``created_utc``, ``title``, ``comments`` and (optionally)
    ``text``; each comment through ``comment_id``, ``depth``, ``score``,
    ``created_utc`` and ``body``.

    Args:
        json_input_path: Path of the raw JSON file to read (UTF-8).
        csv_output_path: Path of the CSV file to write.

    Returns:
        pandas.DataFrame with one row per submission or comment and the
        columns ``subreddit``, ``id``, ``type``, ``depth``, ``score``,
        ``time`` and ``text``. The same table is written to
        ``csv_output_path``.
    """
    columns = ['subreddit', 'id', 'type', 'depth', 'score', 'time', 'text']

    with open(json_input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # One dict per output row: submissions first, each followed by its comments
    processed_data = []

    for submission in data:
        # A submission's text is its title plus the self-text, when there is any.
        # .get() tolerates records that carry no 'text' key at all.
        submission_text = submission['title']
        self_text = submission.get('text')
        if self_text:
            submission_text += " " + self_text

        processed_data.append({
            'subreddit': submission['subreddit'],
            'id': submission['post_id'],
            'type': f"submission_{submission['sort_type']}",  # submission_hot or submission_con
            'depth': 0,  # submissions are depth 0
            'score': submission['score'],
            'time': submission['created_utc'],
            'text': submission_text
        })

        for comment in submission['comments']:
            processed_data.append({
                'subreddit': submission['subreddit'],
                'id': comment['comment_id'],
                'type': 'comment',
                # shift comment depths by 1 so they never collide with depth 0 (submissions)
                'depth': comment['depth'] + 1,
                'score': comment['score'],
                'time': comment['created_utc'],
                'text': comment['body']
            })

    # Naming the columns explicitly keeps the schema stable even when the
    # input list is empty (otherwise df['type'] below would raise KeyError).
    df = pd.DataFrame(processed_data, columns=columns)

    df.to_csv(csv_output_path, index=False)

    # Report where the data actually went, plus a few sanity statistics
    print(f"\nFile convert complete. Results saved to {csv_output_path}.")
    print(f"Total rows: {len(df)}")
    print("\nDistribution by type:")
    print(df['type'].value_counts())
    print("\nDepth distribution:")
    print(df['depth'].value_counts().sort_index())

    return df

# Run the conversion: raw scrape (JSON) in, flattened table (CSV) out
file_path_text_raw, file_path_text_initial = (
    "data/raw-data/text_raw.json",
    "data/processed-data/text_initial.csv",
)

df_text_initial = convert_reddit_data(
    json_input_path=file_path_text_raw,
    csv_output_path=file_path_text_initial,
)
df_text_initial.head(6)

In [None]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

# Fetch the NLTK resources used below: the stop-word lists and WordNet
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)

# Shared by process_text(): the lemmatizer and the English stop-word lookup set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    """Normalize one raw text into lowercase, symbol-free prose.

    Steps, in order: lowercase; remove links, @mentions, #hashtags and
    emojis; strip every character that is not a word character, whitespace,
    comma or period (hyphens are dropped, so "well-known" becomes
    "wellknown"); collapse whitespace runs and trim the ends.

    Example: ``"Check @bob #aiart https://x.com now!"`` -> ``"check now"``.

    Args:
        text: Raw title/body string. Non-string values (e.g. None or NaN)
            are treated as empty text.

    Returns:
        The cleaned string; may be empty.
    """
    if not isinstance(text, str):
        return ''

    # Lowercase up front so the link pattern also catches "HTTP://..." / "WWW..."
    text = text.lower()

    # Links, mentions and hashtags are recognised by ':', '/', '@' and '#'.
    # They must be removed BEFORE the symbol filter below, which would erase
    # those marker characters and leave the mention/hashtag words in the text.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove links
    text = re.sub(r'@\S+', '', text)  # Remove @username mentions
    text = re.sub(r'#[\w]+', '', text)  # Remove all hashtags (e.g., #aiart)

    # Run emoji removal before the symbol filter so it sees the original characters
    text = emoji.replace_emoji(text, replace='')

    # Keep word characters, whitespace, commas and periods; everything else
    # (quotes, brackets, hyphens, remaining symbols) is deleted outright
    text = re.sub(r'[^\w\s,.]', '', text)

    # Normalise whitespace last: the removals above leave gaps and edge spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def remove_non_english_words(text):
    """Keep only the tokens made up purely of ASCII letters.

    A run of letters that touches a digit, an underscore or a non-ASCII word
    character is discarded; the surviving tokens are joined with single spaces.
    """
    ascii_word = re.compile(r'\b[a-zA-Z]+\b')
    kept_tokens = [match.group(0) for match in ascii_word.finditer(text)]
    return ' '.join(kept_tokens)

def process_text(text):
    """Drop stop words from *text* and lemmatize the remaining tokens.

    The text is split on whitespace; tokens found in the module-level
    ``stop_words`` set are skipped, the rest are passed through
    ``lemmatizer.lemmatize`` and re-joined with single spaces.
    """
    content_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(map(lemmatizer.lemmatize, content_tokens))

# Run the three text passes over the 'text' column, in order:
# basic cleaning -> ASCII-letter tokens only -> stop-word removal + lemmatization
df_text_initial['text'] = (
    df_text_initial['text']
    .apply(clean_text)
    .apply(remove_non_english_words)
    .apply(process_text)
)

# Rows whose cleaned text is identical are collapsed into one
df_text_clean = df_text_initial.drop_duplicates(subset=['text'])

# Persist the cleaned table and show a preview
file_path_text_clean = "data/processed-data/text_clean.csv"
df_text_clean.to_csv(file_path_text_clean, index=False, encoding='utf-8')
print(f"Data cleaning complete. Results saved to {file_path_text_clean}")
df_text_clean.head(6)
