In [9]:
import numpy as np
import pandas as pd
import json
import os

In [None]:
# Load every submission record from the NDJSON export (one JSON document per line).
# Takes ~30 seconds on the full file, so avoid re-running this cell needlessly.
#
# The encoding is named explicitly: JSON interchange text is UTF-8, whereas
# open() without `encoding` falls back to a platform-dependent default.
with open('../data/extracted_data/submissions_data.ndjson', 'r', encoding='utf-8') as file:
    # Blank lines (e.g. a trailing newline at end of file) are skipped.
    json_objects = [json.loads(line) for line in file if line.strip()]


In [4]:
# Use one dataset for now: keep only the first 1234 loaded records.
data_subset = json_objects[:1234]
data_subset

In [None]:
# Display data_subset again; as the last expression of the cell, the whole
# object is rendered as the cell output.
data_subset

In [14]:
def validate_json_object(json_obj, required_columns):
    """
    Validate one parsed JSON record against a column/type schema.

    Every column named in ``required_columns`` must be present in ``json_obj``
    and its value must be convertible to the expected type. Conversion rules:

    * ``int``   - ``int(value)``: floats are truncated, numeric strings parsed.
                  NaN / infinity cannot be converted and make the record invalid.
    * ``float`` - ``float(value)``.
    * ``str``   - strings pass through, other values go through ``str()``.
                  ``None`` is rejected rather than stored as the text 'None'.
    * ``bool``  - booleans pass through, numbers use ``bool(value)``, strings
                  are True only for 'true', '1' or 'yes' (case-insensitive);
                  any other value type is rejected.

    Args:
        json_obj (dict): The JSON object to validate. Anything that is not a
            dict is reported as invalid instead of raising.
        required_columns (dict): Dictionary of column names and their expected types

    Returns:
        tuple: (bool, dict) - (is_valid, processed_data). ``processed_data``
        holds only the schema columns, converted; it is ``None`` whenever
        ``is_valid`` is False.
    """
    # A syntactically valid NDJSON line may decode to a list, string or number.
    # Such a record has no named columns, so treat it as invalid up front
    # (indexing it by column name would otherwise raise TypeError).
    if not isinstance(json_obj, dict):
        return False, None

    processed_data = {}

    for column, expected_type in required_columns.items():
        if column not in json_obj:
            return False, None

        value = json_obj[column]

        if expected_type == int:
            try:
                processed_data[column] = int(value)
            # OverflowError: int(float('inf')); ValueError also covers int(nan).
            except (ValueError, TypeError, OverflowError):
                return False, None

        elif expected_type == float:
            try:
                processed_data[column] = float(value)
            # OverflowError: an integer too large to be represented as a float.
            except (ValueError, TypeError, OverflowError):
                return False, None

        elif expected_type == str:
            if value is None:
                # A null is a missing value, not the four-letter string 'None';
                # the int, float and bool branches already reject None as well.
                return False, None
            if isinstance(value, str):
                processed_data[column] = value
            else:
                try:
                    processed_data[column] = str(value)
                except (ValueError, TypeError):
                    return False, None

        elif expected_type == bool:
            if isinstance(value, bool):
                processed_data[column] = value
            elif isinstance(value, (int, float)):
                processed_data[column] = bool(value)
            elif isinstance(value, str):
                processed_data[column] = value.lower() in ('true', '1', 'yes')
            else:
                return False, None

        # NOTE: any other expected type is not checked; the column is left out
        # of processed_data and does not affect validity.

    return True, processed_data

def process_json_data(json_objects, required_columns=None):
    """
    Process a list of JSON objects and validate them for PostgreSQL insertion.

    Each object is passed to ``validate_json_object``; objects that validate
    contribute their processed dict, the others contribute their position.

    Args:
        json_objects (list): List of JSON objects to process
        required_columns (dict, optional): Mapping of column name to expected
            type. When omitted, the submission schema is used: 'subreddit',
            'title', 'author' and 'id', all ``str``.

    Returns:
        tuple: (valid_data, invalid_indices) - the list of processed records
        and the list of indices (into ``json_objects``) that failed validation.
    """
    if required_columns is None:
        # Default schema, built per call so callers can never mutate a shared dict.
        required_columns = {
            'subreddit': str,
            'title': str,
            'author': str,
            'id': str,
        }

    valid_data = []
    invalid_indices = []

    for idx, json_obj in enumerate(json_objects):
        is_valid, processed_obj = validate_json_object(json_obj, required_columns)

        if is_valid:
            valid_data.append(processed_obj)
        else:
            invalid_indices.append(idx)

    return valid_data, invalid_indices



In [15]:
# Run the validation pass over every loaded submission record.
valid_data, invalid_indices = process_json_data(json_objects)

# Report the totals in a single call; one line per figure.
print(
    f"Total objects processed: {len(json_objects)}",
    f"Valid objects: {len(valid_data)}",
    f"Invalid objects: {len(invalid_indices)}",
    sep="\n",
)

# Show at most the first ten failing positions.
if len(invalid_indices) > 0:
    print("\nInvalid objects found at indices:", invalid_indices[:10], "...")

# Show one processed record as a sample of the output shape.
if len(valid_data) > 0:
    print("\nExample of processed valid object:", valid_data[0], sep="\n")

Total objects processed: 839181
Valid objects: 839181
Invalid objects: 0

Example of processed valid object:
{'subreddit': 'AcademicPsychology', 'title': 'Sister subreddit /r/psychscience', 'author': 'ilikebluepens', 'id': 'iemkn'}


In [11]:
# Load every comment record from the NDJSON export (one JSON document per line).
#
# The encoding is named explicitly: JSON interchange text is UTF-8, whereas
# open() without `encoding` falls back to a platform-dependent default.
with open('../data/extracted_data/comments_data.ndjson', 'r', encoding='utf-8') as file:
    # Parsed comments must land in comment_objects. Appending them to
    # json_objects (the submissions list) would leave comment_objects empty
    # and mix comments into the submissions data.
    comment_objects = [json.loads(line) for line in file if line.strip()]


In [None]:
# Display the first element of comment_objects as a quick sanity check
# (raises IndexError if the list is empty).
comment_objects[0]

In [None]:
def validate_comment_object(json_obj, required_columns):
    """
    Validate one parsed comment record against a column/type schema.

    Comments are validated by exactly the same rules as submissions, so this
    is a thin wrapper around ``validate_json_object`` (defined earlier in this
    notebook) rather than a second copy of that logic. The name is kept so
    existing callers continue to work.

    Args:
        json_obj (dict): The JSON object to validate
        required_columns (dict): Dictionary of column names and their expected types

    Returns:
        tuple: (bool, dict) - (is_valid, processed_data), exactly as returned
        by ``validate_json_object``; a record missing a required column yields
        ``(False, None)``.
    """
    return validate_json_object(json_obj, required_columns)

def process_comment_data(json_objects, required_columns=None):
    """
    Process a list of comment JSON objects and validate them for PostgreSQL insertion.

    Each object is passed to ``validate_comment_object``; objects that validate
    contribute their processed dict, the others contribute their position.

    Args:
        json_objects (list): List of JSON objects to process
        required_columns (dict, optional): Mapping of column name to expected
            type. When omitted, the default below is used: 'subreddit',
            'title', 'author' and 'id', all ``str``.

    Returns:
        tuple: (valid_data, invalid_indices) - the list of processed records
        and the list of indices (into ``json_objects``) that failed validation.
    """
    if required_columns is None:
        # NOTE(review): this default is identical to the submissions schema.
        # Confirm that comment records really carry a 'title' field (Reddit
        # comments usually expose their text as 'body'); if they do not, every
        # comment will be reported invalid. Pass `required_columns` to override.
        required_columns = {
            'subreddit': str,
            'title': str,
            'author': str,
            'id': str,
        }

    valid_data = []
    invalid_indices = []

    for idx, json_obj in enumerate(json_objects):
        is_valid, processed_obj = validate_comment_object(json_obj, required_columns)

        if is_valid:
            valid_data.append(processed_obj)
        else:
            invalid_indices.append(idx)

    return valid_data, invalid_indices

In [None]:
# Validate the loaded comment records.
# NOTE: this rebinds valid_data / invalid_indices, replacing the submission
# results produced by the earlier process_json_data cell.
valid_data, invalid_indices = process_comment_data(comment_objects)

# Print summary
# The total must count the list that was actually validated (comment_objects),
# not the submissions list, so that valid + invalid adds up to it.
print(f"Total objects processed: {len(comment_objects)}")
print(f"Valid objects: {len(valid_data)}")
print(f"Invalid objects: {len(invalid_indices)}")

# Show at most the first ten failing positions.
if invalid_indices:
    print("\nInvalid objects found at indices:", invalid_indices[:10], "...")

# Example of accessing validated data
if valid_data:
    print("\nExample of processed valid object:")
    print(valid_data[0])