In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [17]:
# Read the player-movement records. lines=True tells pandas the file is
# newline-delimited JSON (one record per line) rather than a single document.
player_movements_data = pd.read_json(
    path_or_buf='player_movements.json',
    lines=True,
)

In [18]:
# Directory that receives every saved chart.
# NOTE(review): the leading '/' makes this an absolute path at the filesystem
# root (or the current drive root on Windows) -- confirm that is intended
# rather than a folder relative to the notebook.
output_dir = '/Trend Outputs'

# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without the check-then-create race of os.path.exists(); it
# also fails immediately if the path exists but is not a directory.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Function to score a row's previous-state tax tier
def calculate_average_tax_tier(row):
    """Return the higher of the row's two previous-state tax tier ranks.

    Despite the name, no arithmetic mean is taken: each of
    ``row['prev_state_tier_year1']`` and ``row['prev_state_tier_year2']`` is
    mapped to a rank ('No Tax' -> 0, 'Low' -> 1, 'Medium' -> 2, 'High' -> 3)
    and the larger rank is returned. A value that is not one of those four
    names ranks as -1.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the two
            tier column names.

    Returns:
        int: The larger of the two ranks, in the range -1..3.
    """
    tier_rank = {'No Tax': 0, 'Low': 1, 'Medium': 2, 'High': 3}
    ranks = [
        tier_rank.get(row[tier_key], -1)
        for tier_key in ('prev_state_tier_year1', 'prev_state_tier_year2')
    ]
    return max(ranks)

In [5]:
# Score every row with calculate_average_tax_tier (applied row-wise) and store
# the result in the 'Average Tax Tier' column.
player_movements_data['Average Tax Tier'] = player_movements_data.apply(
    calculate_average_tax_tier,
    axis='columns',  # same as axis=1: the function receives one row at a time
)


In [6]:
# Ensure the PCI columns are converted to floats.
# The values may arrive as strings with thousands separators (e.g. "12,345"),
# so commas are stripped before casting. The string clean-up is skipped when a
# column is already numeric, which keeps this cell safe to re-run (the .str
# accessor raises on numeric columns) and tolerates columns that were parsed
# as numbers at load time.
pci_columns = [
    'new_city_pci_year1',
    'prev_city_pci_year1',
    'new_city_pci_year2',
    'prev_city_pci_year2',
]
for pci_column in pci_columns:
    pci_values = player_movements_data[pci_column]
    if not pd.api.types.is_numeric_dtype(pci_values):
        # Literal (non-regex) removal of the thousands separator.
        pci_values = pci_values.str.replace(",", "", regex=False)
    player_movements_data[pci_column] = pci_values.astype(float)

In [7]:
# Mean of the year-1 and year-2 PCI values, for the new city and for the
# previous city. A missing value in either year makes the mean missing too.
player_movements_data['new_city_avg_pci'] = (
    player_movements_data['new_city_pci_year1']
    .add(player_movements_data['new_city_pci_year2'])
    .div(2)
)
player_movements_data['prev_city_avg_pci'] = (
    player_movements_data['prev_city_pci_year1']
    .add(player_movements_data['prev_city_pci_year2'])
    .div(2)
)

In [8]:
# PCI change associated with the move: new-city average minus previous-city
# average (positive means the new city has the higher average PCI).
player_movements_data['PCI Difference'] = (
    player_movements_data['new_city_avg_pci']
    .sub(player_movements_data['prev_city_avg_pci'])
)

In [9]:
# Cast each of the four average-temperature columns to float so the
# arithmetic in the following cells operates on numeric data.
for temp_column in (
    'new_city_year1_avg_temp',
    'prev_city_year1_avg_temp',
    'new_city_year2_avg_temp',
    'prev_city_year2_avg_temp',
):
    player_movements_data[temp_column] = player_movements_data[temp_column].astype(float)

In [10]:
# Mean of the year-1 and year-2 average temperatures, for the new city and for
# the previous city. A missing value in either year makes the mean missing too.
player_movements_data['new_city_avg_temp'] = (
    player_movements_data['new_city_year1_avg_temp']
    .add(player_movements_data['new_city_year2_avg_temp'])
    .div(2)
)
player_movements_data['prev_city_avg_temp'] = (
    player_movements_data['prev_city_year1_avg_temp']
    .add(player_movements_data['prev_city_year2_avg_temp'])
    .div(2)
)

In [11]:
# Temperature change associated with the move: new-city average minus
# previous-city average (positive means the new city is warmer on average).
player_movements_data['Temperature Difference'] = (
    player_movements_data['new_city_avg_temp']
    .sub(player_movements_data['prev_city_avg_temp'])
)


In [12]:
# Year boundaries used as bin edges, and the display label for each interval
# between consecutive edges (so there is one fewer label than there are edges).
bins = [
    1940,
    1980,
    1999,
    2015,
    2025,
]
labels = [
    '1940-1980',
    '1981-1999',
    '2000-2015',
    '2016-2025',
]

In [13]:
# Categorize data into bins.
# Intervals are closed on the right so that every year lands in the bin its
# label names: (1940, 1980] -> '1940-1980', (1980, 1999] -> '1981-1999',
# (1999, 2015] -> '2000-2015', (2015, 2025] -> '2016-2025'. With right=False
# the boundary years 1980, 1999 and 2015 fell into the *next* label and 2025
# was dropped entirely. include_lowest=True keeps the first edge (1940) inside
# the first bin. Years outside 1940-2025 become NaN.
# NOTE(review): assumes 'Year 1' holds whole calendar years -- confirm.
player_movements_data['year_bin'] = pd.cut(
    player_movements_data['Year 1'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

In [14]:
# Function to plot and save frequency bar charts
def plot_frequency_bar(data, column, title, xlabel, filename, bins=4, out_dir=None):
    """Save a histogram of ``data[column]`` as an image file.

    The chart is drawn on a dedicated figure (never on whatever axes happen to
    be current), and that figure is always closed afterwards, even when
    plotting or saving raises, so repeated calls do not accumulate open
    figures.

    Args:
        data: DataFrame (or other mapping of column name to a pandas Series)
            holding the values to plot.
        column: Name of the column whose distribution is drawn.
        title: Chart title.
        xlabel: Label for the x axis; the y axis is always 'Frequency'.
        filename: File name of the image, joined onto the output directory.
        bins: Number of histogram bins. Defaults to 4.
        out_dir: Directory to write into. Defaults to the notebook-level
            ``output_dir``.

    Returns:
        None. The chart is written to disk as a side effect.
    """
    target_dir = output_dir if out_dir is None else out_dir
    fig, ax = plt.subplots()
    try:
        # rwidth=0.8 leaves a gap between bars so adjacent bins are distinguishable.
        data[column].plot(kind='hist', bins=bins, rwidth=0.8, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        fig.savefig(os.path.join(target_dir, filename))
    finally:
        plt.close(fig)

In [15]:
# Plot frequency bar charts for each bin.
# Each spec is (column, title/file description, x-axis label, file-name stem);
# the year-bin label is appended to both the title and the file name.
chart_specs = [
    # PCI value plots
    ('PCI Difference', 'PCI Difference', 'PCI Difference', 'pci_difference'),
    ('new_city_avg_pci', 'New City Average PCI', 'Average PCI', 'new_city_avg_pci'),
    ('prev_city_avg_pci', 'Previous City Average PCI', 'Average PCI', 'prev_city_avg_pci'),
    # Temperature value plots
    ('Temperature Difference', 'Temperature Difference', 'Temperature Difference (deg C)', 'temperature_difference'),
    ('new_city_avg_temp', 'New City Average Temperature', 'Average Temperature (deg C)', 'new_city_avg_temp'),
    ('prev_city_avg_temp', 'Previous City Average Temperature', 'Average Temperature (deg C)', 'prev_city_avg_temp'),
    # Tax tier plot
    ('Average Tax Tier', 'Average Tax Tier', 'Average Tax Tier', 'average_tax_tier'),
]

for label in labels:
    # Rows whose 'year_bin' category equals this label.
    in_bin = player_movements_data['year_bin'] == label
    bin_data = player_movements_data[in_bin]

    for spec_column, spec_title, spec_xlabel, spec_stem in chart_specs:
        plot_frequency_bar(
            bin_data,
            spec_column,
            f'{spec_title} Frequency ({label})',
            spec_xlabel,
            f'{spec_stem}_{label}.png',
        )