In [1]:
!pip install "numpy<2"



In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Function to load and process the migration data
def load_migration_data(file_path):
    """Load and process the California to Nashville migration data"""
    migration_df = pd.read_csv(file_path)
    
    # Aggregate migration data by year (if available) or by dataset source
    # For now, we'll just get the total
    total_migration = migration_df['inflow_count'].sum()
    
    # Group by subgeography to see top source areas
    top_sources = migration_df.groupby('subgeography')['inflow_count'].sum().sort_values(ascending=False)
    
    return migration_df, total_migration, top_sources

# Function to load and process the rental price data
def load_rental_data(file_path):
    """Load and process the rental price data, filtering for Nashville"""
    rental_df = pd.read_csv(file_path)
    
    # Filter for Nashville
    nashville_rental = rental_df[rental_df['RegionName'] == 'Nashville, TN']
    
    if nashville_rental.empty:
        # Try partial matching if exact match not found
        nashville_rental = rental_df[rental_df['RegionName'].str.contains('Nashville', case=False)]
    
    # Convert rental data to time series format
    if not nashville_rental.empty:
        # Get all date columns
        date_columns = [col for col in nashville_rental.columns if '-' in col]
        
        # Get the first row (Nashville data)
        nashville_row = nashville_rental.iloc[0]
        
        # Create a time series DataFrame
        nashville_ts = pd.DataFrame({
            'date': date_columns,
            'rent': [nashville_row[col] for col in date_columns]
        })
        
        # Convert date strings to datetime
        nashville_ts['date'] = pd.to_datetime(nashville_ts['date'])
        
        # Set date as index
        nashville_ts.set_index('date', inplace=True)
        
        return nashville_rental, nashville_ts
    
    return None, None

# Function to load and process the housing price data
def load_housing_data(file_path):
    """Load and process the housing price data, filtering for Nashville"""
    housing_df = pd.read_csv(file_path)
    
    # Filter for Nashville
    nashville_housing = housing_df[housing_df['RegionName'] == 'Nashville, TN']
    
    if nashville_housing.empty:
        # Try partial matching if exact match not found
        nashville_housing = housing_df[housing_df['RegionName'].str.contains('Nashville', case=False)]
    
    # Convert housing data to time series format
    if not nashville_housing.empty:
        # Get all date columns
        date_columns = [col for col in nashville_housing.columns if '-' in col]
        
        # Get the first row (Nashville data)
        nashville_row = nashville_housing.iloc[0]
        
        # Create a time series DataFrame
        nashville_ts = pd.DataFrame({
            'date': date_columns,
            'median_price': [nashville_row[col] for col in date_columns]
        })
        
        # Convert date strings to datetime
        nashville_ts['date'] = pd.to_datetime(nashville_ts['date'])
        
        # Set date as index
        nashville_ts.set_index('date', inplace=True)
        
        return nashville_housing, nashville_ts
    
    return None, None

# Function to calculate year-over-year growth rates
def calculate_yoy_growth(time_series):
    """Calculate year-over-year growth for a time series"""
    # Calculate YoY percentage change
    yoy_change = time_series.pct_change(periods=12) * 100
    return yoy_change

# Function to visualize the data
def visualize_trends(migration_data, rental_ts, housing_ts):
    """Create visualizations to explore relationships between migration and prices"""
    # Create figure and axes
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
    
    # Plot rental prices
    if rental_ts is not None:
        ax1.plot(rental_ts.index, rental_ts['rent'], 'b-', label='Monthly Rent')
        ax1.set_ylabel('Average Rent ($)')
        ax1.set_title('Nashville Rental Prices vs. California Migration')
        ax1.legend(loc='upper left')
    
    # Plot housing prices
    if housing_ts is not None:
        ax2.plot(housing_ts.index, housing_ts['median_price'], 'g-', label='Median Sale Price')
        ax2.set_ylabel('Median Sale Price ($)')
        ax2.set_xlabel('Date')
        ax2.set_title('Nashville Housing Prices vs. California Migration')
        ax2.legend(loc='upper left')
    
    # Adjust layout
    plt.tight_layout()
    plt.savefig('nashville_price_trends.png')
    plt.close()
    
    # Create YoY change chart
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
    
    # Plot rental price YoY change
    if rental_ts is not None:
        rental_yoy = calculate_yoy_growth(rental_ts['rent'])
        ax1.plot(rental_yoy.index, rental_yoy, 'b-', label='YoY Rent Change (%)')
        ax1.axhline(y=0, color='r', linestyle='-', alpha=0.3)
        ax1.set_ylabel('YoY Change (%)')
        ax1.set_title('Nashville Rental Price YoY Change')
        ax1.legend(loc='upper left')
    
    # Plot housing price YoY change
    if housing_ts is not None:
        housing_yoy = calculate_yoy_growth(housing_ts['median_price'])
        ax2.plot(housing_yoy.index, housing_yoy, 'g-', label='YoY Price Change (%)')
        ax2.axhline(y=0, color='r', linestyle='-', alpha=0.3)
        ax2.set_ylabel('YoY Change (%)')
        ax2.set_xlabel('Date')
        ax2.set_title('Nashville Housing Price YoY Change')
        ax2.legend(loc='upper left')
    
    # Adjust layout
    plt.tight_layout()
    plt.savefig('nashville_price_yoy_changes.png')
    plt.close()

# Main analysis function
def analyze_migration_price_relationship():
    """Analyze the relationship between California migration and Nashville prices"""
    # Load migration data
    migration_df, total_migration, top_sources = load_migration_data('california_nashville_migration_combined.csv')
    print(f"Total California to Nashville migration: {total_migration}")
    print("\nTop 5 source areas in California:")
    print(top_sources.head(5))
    
    # Load rental data
    nashville_rental, rental_ts = load_rental_data('Metro_zori_uc_mfr_sm_month.csv')
    if rental_ts is None:
        print("Could not find Nashville rental data")
    
    # Load housing data
    nashville_housing, housing_ts = load_housing_data('Metro_median_sale_price_uc_sfrcondo_sm_sa_month.csv')
    if housing_ts is None:
        print("Could not find Nashville housing data")
    
    # Visualize trends
    visualize_trends(migration_df, rental_ts, housing_ts)
    



In [17]:
analyze_migration_price_relationship()

Total California to Nashville migration: 15366

Top 5 source areas in California:
subgeography
State                               7895
Los Angeles-Long Beach-Anaheim      2030
Los Angeles                          853
San Francisco-Oakland-Berkeley       676
Riverside-San Bernardino-Ontario     592
Name: inflow_count, dtype: int64


  yoy_change = time_series.pct_change(periods=12) * 100
