In [24]:
import pandas as pd
import json
from scipy.stats import ttest_rel

In [25]:
def load_json_lines(filename):
    """Read a JSON Lines file and return its records as a list.

    Every line of ``filename`` is parsed on its own with ``json.loads``.
    Whitespace-only lines are skipped silently (they are separators, not
    malformed records). Lines that fail to parse are reported on stdout and
    left out of the result, so one bad record does not abort the load.

    Args:
        filename: Path of the file to read; it is decoded as UTF-8.

    Returns:
        list: The decoded objects, in file order.
    """
    records = []
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # A blank line is not a decoding error; do not report it.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {filename}: {e}")
    return records

In [26]:
# Read each JSON-lines export into its own list of records (same order as the names on the left)
player_movements_data, pci_data, tax_data, temp_data = [
    load_json_lines(source_file)
    for source_file in (
        'player_movements.json',
        'player_movements_pci.json',
        'player_movements_tax.json',
        'player_movements_temp.json',
    )
]

In [27]:
# Flatten the loaded records into tables, one DataFrame per source
player_movements_df, pci_df, tax_df, temp_df = map(
    pd.json_normalize,
    (player_movements_data, pci_data, tax_data, temp_data),
)

In [28]:
# Join the PCI, tax and temperature tables onto the player movements, one after
# another, matching rows on the shared '_id.$oid' column. Columns present on both
# sides of a join receive pandas' default _x (left) / _y (right) suffixes.
join_key = '_id.$oid'
merged_df = player_movements_df
for extra_df in (pci_df, tax_df, temp_df):
    merged_df = merged_df.merge(extra_df, left_on=join_key, right_on=join_key)

In [39]:
# List the merged column labels to confirm the suffixed (_x/_y) names used by later cells
merged_columns = merged_df.columns
print("Columns in merged DataFrame:", merged_columns)

Columns in merged DataFrame: Index(['Name', 'Former City', 'New City', 'Year 1', 'Year 2',
       'prev_city_latitude', 'prev_city_longitude', 'new_city_latitude',
       'new_city_longitude', 'prev_city_year1_avg_temp_x',
       'prev_city_year2_avg_temp_x', 'new_city_year1_avg_temp_x',
       'new_city_year2_avg_temp_x', 'new_city_state', 'prev_city_state',
       'new_city_pci_year1_x', 'new_city_pci_year2_x', 'prev_city_pci_year1_x',
       'prev_city_pci_year2_x', 'new_state_tier_year1_x',
       'new_state_tier_year2_x', 'prev_state_tier_year1_x',
       'prev_state_tier_year2_x', '_id.$oid', 'new_city_pci_year1_y',
       'new_city_pci_year2_y', 'prev_city_pci_year1_y',
       'prev_city_pci_year2_y', 'new_state_tier_year1_y',
       'new_state_tier_year2_y', 'prev_state_tier_year1_y',
       'prev_state_tier_year2_y', 'prev_city_year1_avg_temp_y',
       'prev_city_year2_avg_temp_y', 'new_city_year1_avg_temp_y',
       'new_city_year2_avg_temp_y', 'temp_diff_year1', 'temp_diff_

In [41]:
# Calculate temperature, PCI, and tax differences (new city/state minus previous
# city/state) for each of the two years, using the right-hand (_y) merge columns.
# The cell is safe to re-run: the PCI columns it overwrites are converted in a
# way that also accepts values that are already numeric.


def _as_number(series):
    """Return ``series`` as numbers, removing thousands separators first.

    Values are stringified before the comma removal so that numeric entries
    (a column already converted by an earlier run, or a column mixing numbers
    and strings) are parsed instead of raising or being lost. Anything that
    cannot be parsed becomes NaN.
    """
    without_commas = series.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(without_commas, errors='coerce')


def _tier_to_number(series, mapping):
    """Map tax-tier labels to their numeric codes, ignoring case and padding.

    Labels missing from ``mapping`` (including missing values) become NaN.
    """
    return series.astype(str).str.strip().str.lower().map(mapping)


years = ('year1', 'year2')

# Temperature differences
for year in years:
    merged_df[f'temp_diff_{year}'] = (
        merged_df[f'new_city_{year}_avg_temp_y'] - merged_df[f'prev_city_{year}_avg_temp_y']
    )

# Handle missing or invalid PCI values (converted in place, as later cells may read them)
for pci_column in (
    'new_city_pci_year1_y',
    'prev_city_pci_year1_y',
    'new_city_pci_year2_y',
    'prev_city_pci_year2_y',
):
    merged_df[pci_column] = _as_number(merged_df[pci_column])

for year in years:
    merged_df[f'pci_diff_{year}'] = (
        merged_df[f'new_city_pci_{year}_y'] - merged_df[f'prev_city_pci_{year}_y']
    )

# Convert tax tier categorical values to numerical values
tax_tier_mapping = {'no tax': 0, 'low': 1, 'moderate': 2, 'high': 3}
# NOTE(review): labels are matched case-insensitively; confirm the source files
# use these four tier names, since any other label turns into NaN.
for year in years:
    for side in ('new', 'prev'):
        merged_df[f'{side}_state_tier_{year}_num'] = _tier_to_number(
            merged_df[f'{side}_state_tier_{year}_y'], tax_tier_mapping
        )

for year in years:
    merged_df[f'tax_diff_{year}'] = (
        merged_df[f'new_state_tier_{year}_num'] - merged_df[f'prev_state_tier_{year}_num']
    )

In [47]:
# Ensure sufficient sample sizes: a paired t-test is only run on a difference
# dataset whose len() is at least this value; otherwise the test is skipped.
min_sample_size = 10

In [48]:
# Paired t-tests comparing the year-1 differences with the year-2 differences,
# one per factor. A factor is tested only when its dataset reaches
# min_sample_size; otherwise a notice is printed and its *_t_stat / *_p_value
# names are left unassigned.

# Temperature
if len(temp_diff) < min_sample_size:
    print('Not enough data for temperature significance test.')
else:
    temp_year1, temp_year2 = temp_diff['temp_diff_year1'], temp_diff['temp_diff_year2']
    temp_t_stat, temp_p_value = ttest_rel(temp_year1, temp_year2)
    print(f'Temperature differences - t-statistic: {temp_t_stat}, p-value: {temp_p_value}')

# Per-capita income (PCI)
if len(pci_diff) < min_sample_size:
    print('Not enough data for PCI significance test.')
else:
    pci_year1, pci_year2 = pci_diff['pci_diff_year1'], pci_diff['pci_diff_year2']
    pci_t_stat, pci_p_value = ttest_rel(pci_year1, pci_year2)
    print(f'PCI differences - t-statistic: {pci_t_stat}, p-value: {pci_p_value}')

# Tax tier
if len(tax_diff) < min_sample_size:
    print('Not enough data for tax significance test.')
else:
    tax_year1, tax_year2 = tax_diff['tax_diff_year1'], tax_diff['tax_diff_year2']
    tax_t_stat, tax_p_value = ttest_rel(tax_year1, tax_year2)
    print(f'Tax differences - t-statistic: {tax_t_stat}, p-value: {tax_p_value}')


Temperature differences - t-statistic: -0.69962809136618, p-value: 0.48483154381081073
PCI differences - t-statistic: -0.8894896300814626, p-value: 0.3746653616580382
Not enough data for tax significance test.


In [50]:
# Pick the factor with the smallest p-value.
# NOTE(review): assumes `results` maps a factor name to its scalar p-value; confirm
# against the cell that builds it.
# Entries without a usable p-value (None or NaN) are ignored: NaN compares False
# against everything, so leaving it in would make min() depend on dictionary
# order, and None would raise a TypeError.
# The smallest p-value only ranks the factors; it does not by itself mean the
# difference is statistically significant.
valid_results = {factor: p_value for factor, p_value in results.items() if pd.notna(p_value)}
if valid_results:
    most_significant = min(valid_results, key=valid_results.get)
    print(f'The most significant difference is in: {most_significant}')
else:
    # Keep the name defined so later cells do not fail with a NameError.
    most_significant = None
    print('No valid p-values available to compare.')

The most significant difference is in: PCI
