In [11]:
import pandas as pd
from collections import defaultdict

def read_conll(file_path):
    """Read CONLL format file and return structured data"""
    data = []
    current_sent = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # Sentence boundary
                if current_sent:
                    data.append(current_sent)
                    current_sent = []
                continue
                
            if line.startswith('#'):  # Skip comments
                continue
                
            parts = line.split('\t')
            if len(parts) >= 10:  # Standard CONLL-U has 10 columns
                token_data = {
                    'id': parts[0],
                    'form': parts[1],
                    'lemma': parts[2],
                    'upos': parts[3],
                    'xpos': parts[4],
                    'feats': parts[5],
                    'head': parts[6],
                    'deprel': parts[7],
                    'deps': parts[8],
                    'misc': parts[9]
                }
                current_sent.append(token_data)
    
    if current_sent:  # Add last sentence if file doesn't end with newline
        data.append(current_sent)
        
    return data

def combine_data(csv_data, conll_data):
    """
    Combine Telegram message data with linguistic annotations
    
    Args:
        csv_data: DataFrame from scraped_data.csv
        conll_data: List of sentences from CONLL file
        
    Returns:
        Combined DataFrame with messages and their linguistic annotations
    """
    # Process CONLL data to group by message
    # Assuming each message in CONLL is separated by empty lines
    # and corresponds to messages in CSV in order
    
    # Create a list to hold combined records
    combined = []
    
    # Check we have matching counts
    if len(csv_data) != len(conll_data):
        print(f"Warning: Mismatched data lengths - CSV: {len(csv_data)}, CONLL: {len(conll_data)}")
        # We'll proceed with the smaller count
        min_len = min(len(csv_data), len(conll_data))
    else:
        min_len = len(csv_data)
    
    for i in range(min_len):
        # Get CSV record
        csv_row = csv_data.iloc[i]
        
        # Get CONLL annotation
        conll_sent = conll_data[i]
        
        # Create combined record
        combined_record = {
            'channel_name': csv_row['channel_name'],
            'message_id': csv_row['message_id'],
            'timestamp': csv_row['timestamp'],
            'views': csv_row['views'],
            'has_image': csv_row['has_image'],
            'text_original': csv_row['text_original'],
            'tokens': conll_sent  # List of token dictionaries
        }
        
        combined.append(combined_record)
    
    return pd.DataFrame(combined)

# Main execution
if __name__ == "__main__":
    # Load CSV data
    csv_data = pd.read_csv('../data/scraped_data.csv')
    
    # Load CONLL data
    conll_data = read_conll('../data/clean_processed.conll')
    
    # Combine the datasets
    combined_df = combine_data(csv_data, conll_data)
    
    # Save combined data
    combined_df.to_json('../data/combined_data.json', orient='records', lines=True, force_ascii=False)
    print("Combined data saved to combined_data.json")
    
    # Alternatively save as CSV (though nested data will be stringified)
    combined_df.to_csv('../data/combined_data.csv', index=False, encoding='utf-8')
    print("Combined data saved to combined_data.csv")

Combined data saved to combined_data.json
Combined data saved to combined_data.csv


In [16]:
df.combine_data

NameError: name 'df' is not defined