In [None]:
import pandas as pd
import numpy as np

# Set the random seed for reproducibility.
# This seeds NumPy's legacy global random state; the sampling step further
# down additionally passes its own random_state=37 explicitly.
np.random.seed(37)

def load_data(dem_train_path, rep_train_path, dem_test_path, rep_test_path):
    """Read the four parquet files and return combined train/test frames.

    The two training files are stacked into one frame and the two test files
    into another (row index renumbered from zero). The shape of each combined
    frame and the per-class counts of its ``label`` column are printed.

    Parameters
    ----------
    dem_train_path, rep_train_path : path-like
        Parquet files that make up the training set.
    dem_test_path, rep_test_path : path-like
        Parquet files that make up the test set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, each restricted to the ``text`` and ``label``
        columns.
    """
    wanted_columns = ['text', 'label']

    # Files are read in the same order as the arguments are given.
    train_parts = [pd.read_parquet(path) for path in (dem_train_path, rep_train_path)]
    test_parts = [pd.read_parquet(path) for path in (dem_test_path, rep_test_path)]

    train_data = pd.concat(train_parts, ignore_index=True)
    test_data = pd.concat(test_parts, ignore_index=True)

    # Report sizes first, then the label balance of each split.
    print(f"Training data shape: {train_data.shape}")
    print(f"Test data shape: {test_data.shape}")

    train_counts = train_data['label'].value_counts()
    print("\nClass distribution in training data:")
    print(train_counts)

    test_counts = test_data['label'].value_counts()
    print("\nClass distribution in test data:")
    print(test_counts)

    return train_data[wanted_columns], test_data[wanted_columns]

# Load both parties' splits; arguments are (dem train, rep train, dem test, rep test).
train, test = load_data(
    'democratic_only.train.en.parquet',
    'republican_only.train.en.parquet',
    'democratic_only.test.en.parquet',
    'republican_only.test.en.parquet',
)

Training data shape: (537922, 6)
Test data shape: (56000, 6)

Class distribution in training data:
label
democratic    268961
republican    268961
Name: count, dtype: int64

Class distribution in test data:
label
democratic    28000
republican    28000
Name: count, dtype: int64


In [None]:
# Target size: one eighth of the training rows (integer division rounds down).
sample_size = len(train) // 8

# Draw that many rows at random without replacement; the fixed random_state
# makes the draw repeatable.
sampled_df = train.sample(n=sample_size, random_state=37)

# Report the original size, the sampled size and their ratio.
print(
    f"Original dataset size: {len(train)} rows",
    f"Sampled dataset size: {len(sampled_df)} rows",
    f"Proportion: {len(sampled_df)/len(train):.2f}",
    sep="\n",
)

Original dataset size: 537922 rows
Sampled dataset size: 67240 rows
Proportion: 0.12


In [None]:
# Save the files: the one-eighth training sample goes to train.csv and the
# full test set to test.csv, both in the current working directory.
# index=False keeps the DataFrame row index out of the CSV files.
sampled_df.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

In [None]:
# Clean and preprocess the text data
def preprocess_text(df, text_column='text', stop_words=None):
    """Return a copy of ``df`` with an added ``clean_text`` column.

    Every value of ``text_column`` is lower-cased, each ASCII punctuation
    character (``string.punctuation``, including the backslash) is replaced
    by a space, runs of whitespace are collapsed to single spaces, and stop
    words are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified in place.
    text_column : str, default 'text'
        Name of the column that holds the raw text.
    stop_words : iterable of str, optional
        Words to drop; they are compared against the lower-cased tokens.
        When omitted, NLTK's English stop-word list is used, which requires
        the NLTK ``stopwords`` corpus to be available locally.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra string column ``clean_text``. Missing
        values (``None``/NaN) become the empty string; other non-string
        values are converted with ``str()`` before cleaning.
    """
    # Imported locally so the function also works on a fresh kernel whose
    # import cell does not provide this module.
    import string

    if stop_words is None:
        # Same word list the notebook has always used; imported here because
        # no earlier cell is guaranteed to have brought it into scope.
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
    # Build the lookup set once instead of once per row.
    stop_words = frozenset(stop_words)

    # A translation table avoids regex character-class pitfalls: inside
    # '[...]' the sequence '\]' is read as an escaped ']', so a pattern built
    # from the raw punctuation string never matches a literal backslash.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def clean_text(text):
        if not isinstance(text, str):
            # Empty CSV cells come back as NaN; treat missing text as empty.
            if pd.isna(text):
                return ''
            text = str(text)

        text = text.lower().translate(punct_to_space)

        # split() with no arguments already discards leading, trailing and
        # repeated whitespace, so no separate whitespace pass is needed.
        return ' '.join(word for word in text.split() if word not in stop_words)

    df = df.copy()
    df['clean_text'] = df[text_column].apply(clean_text)

    return df

In [None]:
# Reload the CSV exports written earlier (train first, then test).
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

# Preprocess data: each frame gains a cleaned-text column.
print("\nPreprocessing text data...")
train_processed, test_processed = map(preprocess_text, (train_data, test_data))

# Persist the cleaned frames next to the raw exports, without the row index.
train_processed.to_csv("clean_train.csv", index=False)
test_processed.to_csv("clean_test.csv", index=False)