# Natural Tweet Data Generator

Generates synthetic tweet data with **natural, realistic patterns** for the Southeast US region.

**Natural patterns:**
- Normal daily activity cycles (people tweet more during waking hours)
- Geographic distribution following population density
- Entity mentions reflect real place names in the region
- No artificial hurricane-specific behaviors

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from shapely.geometry import Point
import os

## Simple Configuration

In [None]:
# ==============================================================================
# SETTINGS - the only values that normally need editing
# ==============================================================================

# Number of synthetic tweets to create
NUM_TWEETS = 50000

# Base name of the output dataset
DATASET_NAME = 'natural_test'

# Simulated time window: starting instant and length in whole days
START_DATE = datetime(2024, 9, 26, 0, 0, 0)
NUM_DAYS = 3

# Echo the active settings, one per line
print(
    "Configuration:",
    f"  Generating {NUM_TWEETS:,} tweets",
    f"  Time range: {START_DATE} + {NUM_DAYS} days",
    "  Region: Southeast US (FL, GA, NC, SC, AL, TN)",
    sep="\n",
)

## Load Reference Data

In [None]:
# The real Helene dataset is read only to harvest entity names from it.
# Paths are resolved relative to the parent of the current working directory.
local_path = os.path.dirname(os.getcwd())
helene_path = os.path.join(local_path, 'data', 'geojson', 'helene.geojson')

real_gdf = gpd.read_file(helene_path)

# Gather every distinct place name found in the GPE column
all_entities = set()
for gpe in real_gdf['GPE'].dropna():
    # Map ',' and ';' onto '|' so a single split covers all three separators
    parts = str(gpe).replace(',', '|').replace(';', '|').split('|')
    # Blank fragments (and therefore wholly blank cells) contribute nothing
    all_entities.update(part.strip() for part in parts if part.strip())

# Distinct facility names, skipping empty strings and missing values
all_facilities = list(real_gdf.loc[real_gdf['FAC'] != '', 'FAC'].dropna().unique())

# Sorted so the list order does not depend on set iteration order
entity_list = sorted(all_entities)

print(f"Extracted {len(entity_list)} unique place names from real data")
print(f"Extracted {len(all_facilities)} facility names")

## Natural Pattern Functions

In [None]:
# Reference cities as (name, latitude, longitude, population) tuples.
# The population column drives the sampling weights computed below.
CITIES_WITH_POPULATION = [
    ('Tampa', 27.95, -82.46, 400000),
    ('Atlanta', 33.75, -84.39, 500000),
    ('Charlotte', 35.23, -80.84, 900000),
    ('Jacksonville', 30.33, -81.66, 950000),
    ('Tallahassee', 30.44, -84.28, 200000),
    ('Asheville', 35.60, -82.55, 95000),
    ('Gainesville', 29.65, -82.32, 140000),
    ('Savannah', 32.08, -81.09, 150000),
    ('Birmingham', 33.52, -86.80, 200000),
    ('Nashville', 36.16, -86.78, 700000),
]

# Sampling probability of each city: its share of the summed population
city_weights = np.array([city[3] for city in CITIES_WITH_POPULATION], dtype=float)
city_weights /= city_weights.sum()

# Relative tweet volume for each hour of the day (index 0 = the hour starting
# at midnight). Normalised once here so every call reuses the same
# probability vector instead of rebuilding it for each generated tweet.
_HOUR_PROBABILITIES = np.array([
    0.2, 0.1, 0.05, 0.05, 0.1, 0.3,  # 12AM-6AM (sleeping)
    0.7, 1.0, 1.2, 1.3, 1.2, 1.1,    # 6AM-12PM (morning/lunch)
    1.0, 1.0, 1.0, 1.1, 1.2, 1.4,    # 12PM-6PM (afternoon)
    1.5, 1.4, 1.2, 1.0, 0.7, 0.4     # 6PM-12AM (evening)
])
_HOUR_PROBABILITIES = _HOUR_PROBABILITIES / _HOUR_PROBABILITIES.sum()


def natural_timestamp(start_date, num_days):
    """
    Draw a random timestamp that follows a natural daily activity cycle.

    The day is chosen uniformly from the ``num_days`` days beginning at
    ``start_date``; the hour is drawn from a fixed 24-entry weighting with
    low weights for 12 AM - 6 AM and the highest weights in the evening;
    minute and second are uniform.

    Args:
        start_date: ``datetime`` the offsets are added to.
        num_days: Number of days to spread timestamps over. Values below 1
            are rejected by ``np.random.randint`` (empty range).

    Returns:
        ``start_date`` plus the sampled day/hour/minute/second offset.
    """
    # Draw order (day, hour, minute, second) is kept fixed so a seeded
    # generator reproduces the same sequence of timestamps.
    day_offset = np.random.randint(0, num_days)
    hour = np.random.choice(24, p=_HOUR_PROBABILITIES)
    minute = np.random.randint(0, 60)
    second = np.random.randint(0, 60)

    # int() keeps timedelta working even if NumPy hands back one of its own
    # integer scalar types rather than a built-in int.
    return start_date + timedelta(
        days=int(day_offset),
        hours=int(hour),
        minutes=int(minute),
        seconds=int(second),
    )


def natural_location():
    """
    Draw a random coordinate clustered around one of the reference cities.

    A city is picked from CITIES_WITH_POPULATION with probability
    city_weights, so more populous cities are selected more often. The
    city's centre is then perturbed independently in latitude and
    longitude with Gaussian noise.

    Returns:
        Tuple ``(lat, lon)`` of the perturbed coordinate.
    """
    chosen = np.random.choice(len(CITIES_WITH_POPULATION), p=city_weights)
    _, center_lat, center_lon, population = CITIES_WITH_POPULATION[chosen]

    # Standard deviation of the scatter grows linearly with population:
    # 0.2 at zero population, +0.3 per million inhabitants.
    sigma = 0.2 + (population / 1000000) * 0.3

    # Latitude noise is drawn before longitude noise
    jittered_lat = center_lat + np.random.normal(0, sigma)
    jittered_lon = center_lon + np.random.normal(0, sigma)

    return jittered_lat, jittered_lon


def natural_gpe(entity_list):
    """
    Build the GPE (place mention) string for one synthetic tweet.

    Distribution of outcomes:
    - 15% of tweets mention no place (empty string)
    - otherwise 1 place (70%), 2 places (25%) or 3 places (5%),
      capped at the number of names available

    Args:
        entity_list: Sequence of candidate place names.

    Returns:
        The chosen names joined with ', ' (no name repeated), or '' when
        no place is mentioned or ``entity_list`` is empty.
    """
    # Nothing to sample from: answer directly instead of leaving the outcome
    # to np.random.choice, which rejects an empty population in some NumPy
    # releases. len() is used so array-like inputs work as well as lists.
    if len(entity_list) == 0:
        return ''

    if np.random.random() < 0.15:
        # 15% no location mention
        return ''

    # How many entities?
    num_entities = np.random.choice([1, 2, 3], p=[0.70, 0.25, 0.05])

    # replace=False prevents the same place appearing twice in one tweet;
    # min() keeps the request within the size of a short list.
    entities = np.random.choice(
        entity_list,
        size=min(num_entities, len(entity_list)),
        replace=False,
    )
    return ', '.join(entities)


def natural_fac(facilities):
    """
    Pick the facility mention for one synthetic tweet.

    Returns '' when no facilities are supplied. Otherwise a facility is
    mentioned with 5% probability, chosen uniformly from ``facilities``;
    the remaining tweets get ''.
    """
    # No candidates: return without consuming a random number
    if not facilities:
        return ''

    # 95% of tweets carry no facility mention
    if np.random.random() >= 0.05:
        return ''

    return np.random.choice(facilities)


# Status line printed once the helper definitions in this cell have executed
print("✓ Natural pattern functions defined")

## Generate Natural Dataset

In [None]:
print(
    "\nGenerating natural tweet data...",
    "Patterns: Normal daily cycles + population-based geography",
    "",
    sep="\n",
)

# One dict per tweet; the keys become the GeoDataFrame columns
synthetic_data = []

for tweet_number in range(1, NUM_TWEETS + 1):
    # Location is drawn first, then facility, place and timestamp
    lat, lon = natural_location()

    synthetic_data.append({
        'FAC': natural_fac(all_facilities),
        'LOC': '',
        'GPE': natural_gpe(entity_list),
        'time': natural_timestamp(START_DATE, NUM_DAYS),
        'Latitude': lat,
        'Longitude': lon,
        'make_polygon': 1,
        # Point takes (x, y), i.e. longitude before latitude
        'geometry': Point(lon, lat)
    })

    # Progress report every 10,000 tweets
    if tweet_number % 10000 == 0:
        print(f"  Generated {tweet_number:,} / {NUM_TWEETS:,} tweets")

# Assemble the records into a GeoDataFrame tagged as EPSG:4326
synthetic_gdf = gpd.GeoDataFrame(synthetic_data, crs='EPSG:4326')

print(f"\n✓ Generated {len(synthetic_gdf):,} tweets with natural patterns")

## Review Generated Data

In [None]:
# Values that appear in more than one line of the report
total_tweets = len(synthetic_gdf)
first_time = synthetic_gdf['time'].min()
last_time = synthetic_gdf['time'].max()

print("\nGenerated Dataset Statistics:")
print("=" * 60)

print(f"\nTotal tweets: {total_tweets:,}")

# Temporal coverage of the generated timestamps
print("\nTime range:")
print(f"  Start: {first_time}")
print(f"  End: {last_time}")
print(f"  Duration: {(last_time - first_time).total_seconds() / 3600:.1f} hours")

# Bounding box of the generated coordinates
print("\nGeographic extent:")
print(f"  Latitude: {synthetic_gdf['Latitude'].min():.2f} to {synthetic_gdf['Latitude'].max():.2f}")
print(f"  Longitude: {synthetic_gdf['Longitude'].min():.2f} to {synthetic_gdf['Longitude'].max():.2f}")

# How many tweets carry a non-empty place / facility mention
print("\nEntity mentions:")
gpe_count = (synthetic_gdf['GPE'] != '').sum()
fac_count = (synthetic_gdf['FAC'] != '').sum()
print(f"  GPE (places): {gpe_count:,} ({gpe_count/total_tweets*100:.1f}%)")
print(f"  FAC (facilities): {fac_count:,} ({fac_count/total_tweets*100:.1f}%)")

# First ten rows, geometry column left out
print("\nSample tweets:")
preview_columns = ['GPE', 'FAC', 'time', 'Latitude', 'Longitude']
print(synthetic_gdf[preview_columns].head(10).to_string())

## Save Dataset

In [None]:
# Write the dataset as GeoJSON into <local_path>/data/geojson/
output_path = os.path.join(local_path, 'data', 'geojson', f'{DATASET_NAME}.geojson')
synthetic_gdf.to_file(output_path, driver='GeoJSON')

# Size of the written file, converted from bytes to MB
file_size_mb = os.path.getsize(output_path) / 1024 / 1024

divider = "=" * 60
print(f"\n{divider}")
print("SAVED")
print(divider)
print(f"\nOutput: {output_path}")
print(f"Size: {file_size_mb:.2f} MB")
print("\n✓ Ready to use in processing pipeline!")

## Visualize Patterns

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: hourly activity (left) and geography (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Temporal pattern (hourly): count tweets per hour of day.
# Note: this adds an 'hour' column to synthetic_gdf.
synthetic_gdf['hour'] = synthetic_gdf['time'].dt.hour
hourly_counts = synthetic_gdf['hour'].value_counts().sort_index()

axes[0].bar(hourly_counts.index, hourly_counts.values, color='steelblue', edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Tweet Count')
axes[0].set_title('Natural Daily Activity Pattern')
axes[0].set_xticks(range(0, 24, 2))
axes[0].grid(alpha=0.3, axis='y')
# Bars are centred on their integer hour, so the shading runs from -0.5 to
# 5.5: it fully covers the bars for hours 0-5 (12 AM - 6 AM) and stops
# before the 6 AM bar instead of cutting through the first and last bars.
axes[0].axvspan(-0.5, 5.5, alpha=0.2, color='gray', label='Sleep hours')
axes[0].legend()

# Geographic distribution: plot at most 5000 randomly sampled tweets
sample_size = min(5000, len(synthetic_gdf))
sample = synthetic_gdf.sample(sample_size)

axes[1].scatter(sample['Longitude'], sample['Latitude'], s=1, alpha=0.4, color='darkgreen')
axes[1].set_xlabel('Longitude')
axes[1].set_ylabel('Latitude')
axes[1].set_title(f'Geographic Distribution (n={sample_size:,})')
axes[1].grid(alpha=0.3)

# Mark major cities with a red star and a name label
for city_name, lat, lon, _ in CITIES_WITH_POPULATION:
    axes[1].plot(lon, lat, 'r*', markersize=8, alpha=0.7)
    axes[1].text(lon, lat, f'  {city_name}', fontsize=7, alpha=0.7)

# Save the figure under local_path before showing it
plt.tight_layout()
plt.savefig(os.path.join(local_path, 'natural_data_patterns.png'), dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Pattern visualization saved: natural_data_patterns.png")