In [20]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global random number generator so that any code drawing
# from np.random is repeatable across runs.
# Note: the DataFrame.sample call further down passes its own random_state.
np.random.seed(37)

def load_data(dem_train_path, rep_train_path, dem_test_path, rep_test_path):
    """
    Read the per-party parquet splits and stack them into one train and one test frame.

    Each path is handed to ``pd.read_parquet``. The two training frames are
    concatenated (democratic rows first) with a fresh integer index, and the
    two test frames are combined the same way. The combined shapes and the
    per-``label`` row counts are printed as a quick sanity check.

    Returns a ``(train, test)`` tuple in which each frame holds only the
    ``text`` and ``label`` columns.
    """
    keep = ['text', 'label']

    # Read all four files up front, in argument order
    train_parts = [pd.read_parquet(dem_train_path), pd.read_parquet(rep_train_path)]
    test_parts = [pd.read_parquet(dem_test_path), pd.read_parquet(rep_test_path)]

    # Stack each split; ignore_index discards the per-file row labels
    train_data = pd.concat(train_parts, ignore_index=True)
    test_data = pd.concat(test_parts, ignore_index=True)

    # Report the combined sizes (all columns still present at this point)
    print(f"Training data shape: {train_data.shape}")
    print(f"Test data shape: {test_data.shape}")

    # Report how many rows each label contributes to each split
    print("\nClass distribution in training data:")
    print(train_data['label'].value_counts())

    print("\nClass distribution in test data:")
    print(test_data['label'].value_counts())

    return train_data[keep], test_data[keep]

# Load both splits; keyword arguments make it explicit which file feeds which slot
train, test = load_data(
    dem_train_path='democratic_only.train.en.parquet',
    rep_train_path='republican_only.train.en.parquet',
    dem_test_path='democratic_only.test.en.parquet',
    rep_test_path='republican_only.test.en.parquet',
)



Training data shape: (537922, 6)
Test data shape: (56000, 6)

Class distribution in training data:
label
democratic    268961
republican    268961
Name: count, dtype: int64

Class distribution in test data:
label
democratic    28000
republican    28000
Name: count, dtype: int64


In [21]:
# Target row count: one third of the training rows, rounded down
sample_size = train.shape[0] // 3

# Draw that many rows at random; the fixed random_state keeps the draw repeatable
sampled_df = train.sample(n=sample_size, random_state=37)

# Print before/after sizes and the resulting fraction as a sanity check
print(
    f"Original dataset size: {len(train)} rows",
    f"Sampled dataset size: {len(sampled_df)} rows",
    f"Proportion: {len(sampled_df)/len(train):.2f}",
    sep="\n",
)

Original dataset size: 537922 rows
Sampled dataset size: 179307 rows
Proportion: 0.33


In [22]:
# Write the down-sampled training rows; index=False keeps the row index out of the file
sampled_df.to_csv(path_or_buf="train.csv", index=False)

# Write the test split in the same layout
test.to_csv(path_or_buf="test.csv", index=False)