In [None]:
! pip install seaborn
! pip install jenkspy
import jenkspy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the SAT CSV, then order the rows by the 'Total' column (lowest first).
SAT_Scores = pd.read_csv('SAT_states2024.csv')
Sorted_SAT_Scores = SAT_Scores.sort_values('Total')

In [None]:
# Jenks natural breaks: split 'Total' into four classes and label each row.
Breaks_Jenks = jenkspy.jenks_breaks(Sorted_SAT_Scores['Total'], n_classes=4)
jenks_labels = ['Very Low', 'Low', 'Middle', 'High']
Sorted_SAT_Scores['Group'] = pd.cut(
    Sorted_SAT_Scores['Total'],
    bins=Breaks_Jenks,
    labels=jenks_labels,
    include_lowest=True,  # closed left edge so the minimum value is classified too
)

# One bar per state, coloured by its Jenks class.
plt.figure(figsize=(14, 8))
sns.barplot(data=Sorted_SAT_Scores, x='state', y='Total',
            hue='Group', palette='plasma')
plt.title('SAT Scores by State')
plt.ylim(0, 1300)
plt.xticks(rotation=90)
plt.ylabel('Total SAT Scores')
plt.xlabel('State')

In [None]:
# Quartile classification: bin edges are min, 25th percentile, median,
# 75th percentile and max of the 'Total' column.
quant_totals = Sorted_SAT_Scores['Total']
breaks_quantiles = [
    np.min(quant_totals),
    np.percentile(quant_totals, 25),
    np.median(quant_totals),
    np.percentile(quant_totals, 75),
    np.max(quant_totals),
]
quant_labels = ['Very Low', 'Low', 'High', 'Very High']
Sorted_SAT_Scores['Group_quant'] = pd.cut(
    quant_totals,
    bins=breaks_quantiles,
    labels=quant_labels,
    include_lowest=True,  # closed left edge so the minimum value is classified too
)
# Bare expression: not rendered, because it is not the last statement of the cell.
breaks_quantiles

# One bar per state, coloured by its quartile class.
plt.figure(figsize=(14, 8))
sns.barplot(data=Sorted_SAT_Scores, x='state', y='Total',
            hue='Group_quant', palette='plasma')
plt.title('SAT Scores by State')
plt.ylim(0, 1300)
plt.xticks(rotation=90)
plt.ylabel('Total SAT Scores')
plt.xlabel('State')

In [None]:
# Mean / standard-deviation classification: bin edges are
# min, mean - std, mean, mean + std and max of the 'Total' column.
std_totals = Sorted_SAT_Scores['Total']
std_value = np.std(std_totals)
mean_value = np.mean(std_totals)
# NOTE(review): pd.cut needs strictly increasing edges, so this assumes
# min < mean - std and mean + std < max for the data -- confirm.
breaks_std = [
    np.min(std_totals),
    mean_value - std_value,
    mean_value,
    mean_value + std_value,
    np.max(std_totals),
]
std_labels = ['Very Low', 'Low', 'High', 'Very High']
Sorted_SAT_Scores['Group_std'] = pd.cut(
    std_totals,
    bins=breaks_std,
    labels=std_labels,
    include_lowest=True,  # closed left edge so the minimum value is classified too
)
# Bare expression: not rendered, because it is not the last statement of the cell.
breaks_std

# One bar per state, coloured by its mean/std class.
plt.figure(figsize=(14, 8))
sns.barplot(data=Sorted_SAT_Scores, x='state', y='Total',
            hue='Group_std', palette='plasma')
plt.title('SAT Scores by State')
plt.ylim(0, 1300)
plt.xticks(rotation=90)
plt.ylabel('Total SAT Scores')
plt.xlabel('State')

In [None]:
def equal_breaks(data, n_classes = 4):
    """Return equal-width bin edges covering the range of ``data``.

    The range ``[min(data), max(data)]`` is divided into ``n_classes``
    intervals of identical width. The width is printed (rounded to three
    decimals) as a side effect.

    Parameters
    ----------
    data : array-like of numbers
        Values whose minimum and maximum define the outer edges.
    n_classes : int, default 4
        Number of intervals to create.

    Returns
    -------
    list
        ``n_classes + 1`` edges, from the minimum to the maximum of ``data``.
    """
    lowest, highest = np.min(data), np.max(data)
    interval = (highest - lowest) / n_classes
    # Interior edges only; the exact min and max are used for the outer edges
    # so they are not affected by floating-point accumulation.
    inner_edges = [lowest + (interval * step) for step in range(1, n_classes)]
    print('interval:', np.round(interval, 3))
    return [lowest, *inner_edges, highest]

# Equal-interval classification: four equal-width bins over the range of 'Total'.
breaks_equal = equal_breaks(Sorted_SAT_Scores['Total'], n_classes=4)
equal_labels = ['Very Low', 'Low', 'High', 'Very High']
Sorted_SAT_Scores['Group_equal'] = pd.cut(
    Sorted_SAT_Scores['Total'],
    bins=breaks_equal,
    labels=equal_labels,
    include_lowest=True,  # closed left edge so the minimum value is classified too
)
# Bare expression: not rendered, because it is not the last statement of the cell.
breaks_equal

# Bar chart of total SAT score per state, coloured by equal-interval class.
plt.figure(figsize = (14, 8))
# Fix: colour by 'Group_equal' (the column created just above in this cell).
# The previous hue, 'Group_std', was a copy-paste leftover that re-drew the
# mean/std classification instead of the equal-interval one.
sns.barplot(y = 'Total', x = 'state', hue = 'Group_equal',
            data = Sorted_SAT_Scores, palette = 'plasma')
plt.ylim(0 , 1300)
plt.xticks(rotation = 90)
plt.title('SAT Scores by State')
plt.ylabel('Total SAT Scores')
plt.xlabel('State')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the SUSB centroid table; later cells read its LONG, LAT, Firms and Year columns.
SUSB = pd.read_csv('SUSB_centroid.csv')

In [None]:
# Show the column names of the SUSB table.
SUSB.columns


In [None]:
# Preview the first 10 rows of the SUSB table.
SUSB.head(10)

In [None]:
# Mean center: unweighted average of the LONG and LAT columns.
mean_x, mean_y = (np.mean(SUSB[coord]) for coord in ('LONG', 'LAT'))
mean_center = (mean_x, mean_y)
print('Mean center:', mean_center)

In [None]:
# Weighted mean center: average of LONG and LAT, each row weighted by 'Firms'.
susb_mean_weights = SUSB['Firms']
weighted_mean_x, weighted_mean_y = (
    np.average(SUSB[coord], weights=susb_mean_weights)
    for coord in ('LONG', 'LAT')
)
weighted_mean_center = (weighted_mean_x, weighted_mean_y)
print('Weighted mean center:', weighted_mean_center)

In [None]:
def weighted_median(data, weights):
    """Return the weighted median of ``data``.

    The values are sorted, their weights accumulated in that order, and the
    first value whose cumulative weight reaches half of the total weight is
    returned (so the result is always one of the input values).

    Parameters
    ----------
    data : array-like, 1-D
        Values to take the median of. Lists, NumPy arrays and pandas Series
        are accepted; inputs are converted with ``np.asarray`` so indexing is
        always positional, regardless of any pandas index.
    weights : array-like, 1-D
        One weight per element of ``data``.

    Returns
    -------
    scalar
        The element of ``data`` at the weighted median position.

    Raises
    ------
    ValueError
        If the inputs are not 1-D, differ in length, or are empty.
    """
    values = np.asarray(data)
    value_weights = np.asarray(weights)
    if values.ndim != 1 or values.shape != value_weights.shape:
        raise ValueError('data and weights must be 1-D sequences of equal length')
    if values.size == 0:
        raise ValueError('weighted_median requires at least one value')

    sorted_indices = np.argsort(values)
    cumulative_weight = np.cumsum(value_weights[sorted_indices])
    # First sorted position whose cumulative weight is >= half the total weight.
    median_index = np.searchsorted(cumulative_weight, cumulative_weight[-1] / 2)
    return values[sorted_indices][median_index]
# Weighted median center: per-axis weighted median of LONG and LAT,
# each weighted by the 'Firms' column.
susb_median_weights = SUSB['Firms'].values
weighted_median_x = weighted_median(SUSB['LONG'].values, susb_median_weights)
weighted_median_y = weighted_median(SUSB['LAT'].values, susb_median_weights)
wmc = (weighted_median_x, weighted_median_y)
print('Weighted Median Center:', wmc)

In [None]:
# For every row, compute the sum over all rows of
# (Euclidean distance in LONG/LAT coordinate units) * Firms,
# then report the row with the smallest total.
susb_long = SUSB['LONG']
susb_lat = SUSB['LAT']
susb_firms = SUSB['Firms']

sum_weighted_distances = []
for i in range(len(SUSB)):
    # `i` is a position, so use .iloc; plain `series[i]` is a *label* lookup
    # and only works while SUSB still has its default 0..n-1 index.
    x_i = susb_long.iloc[i]
    y_i = susb_lat.iloc[i]
    distance_to_others = np.sqrt((susb_long - x_i) ** 2
                                 + (susb_lat - y_i) ** 2)
    sum_weighted_distances.append(np.sum(distance_to_others * susb_firms))

# Adds (or overwrites) a column on SUSB in place.
SUSB['SumWeightedDistance'] = sum_weighted_distances
min_distance_point = SUSB.loc[SUSB['SumWeightedDistance'].idxmin()]
min_distance_point

In [None]:
# NOTE: this re-defines the `weighted_median` already defined in an earlier
# cell (the later definition wins); it is kept so this cell runs on its own.
def weighted_median(data, weights):
    """Return the weighted median of ``data``.

    The values are sorted, their weights accumulated in that order, and the
    first value whose cumulative weight reaches half of the total weight is
    returned (so the result is always one of the input values).

    Parameters
    ----------
    data : array-like, 1-D
        Values to take the median of. Lists, NumPy arrays and pandas Series
        are accepted; inputs are converted with ``np.asarray`` so indexing is
        always positional, regardless of any pandas index.
    weights : array-like, 1-D
        One weight per element of ``data``.

    Returns
    -------
    scalar
        The element of ``data`` at the weighted median position.

    Raises
    ------
    ValueError
        If the inputs are not 1-D, differ in length, or are empty.
    """
    values = np.asarray(data)
    value_weights = np.asarray(weights)
    if values.ndim != 1 or values.shape != value_weights.shape:
        raise ValueError('data and weights must be 1-D sequences of equal length')
    if values.size == 0:
        raise ValueError('weighted_median requires at least one value')

    sorted_indices = np.argsort(values)
    cumulative_weight = np.cumsum(value_weights[sorted_indices])
    # First sorted position whose cumulative weight is >= half the total weight.
    median_index = np.searchsorted(cumulative_weight, cumulative_weight[-1] / 2)
    return values[sorted_indices][median_index]

# Repeat the weighted-median-center and minimum-weighted-distance analysis
# separately for each selected year.
years = [2007, 2009, 2019, 2021]
SUSB_filtered = SUSB[SUSB['Year'].isin(years)]
results = {}

for year, group in SUSB_filtered.groupby('Year'):
    coords = group[['LONG', 'LAT']].values
    # Pairwise Euclidean distances between all rows of this year (n x n matrix).
    distance_matrix = np.sqrt(np.sum((coords[:, None, :] - coords[None, :, :]) ** 2, axis=2))

    # Row i: sum over j of distance(i, j) * Firms[j].
    weighted_distances = distance_matrix @ group['Firms'].values
    # `group` is a sub-frame of SUSB_filtered; build a new frame with .assign
    # instead of setting a column on it (avoids SettingWithCopyWarning and any
    # ambiguity about writing through to the parent frame).
    group = group.assign(SumWeightedDistance=weighted_distances)
    min_distance_point = group.loc[group['SumWeightedDistance'].idxmin()]

    weighted_median_x = weighted_median(group['LONG'].values, group['Firms'].values)
    weighted_median_y = weighted_median(group['LAT'].values, group['Firms'].values)
    wmc = (weighted_median_x, weighted_median_y)

    results[year] = {
        'Weighted Median Center': wmc,
        'Min Distance Point': min_distance_point
    }

for year, result in results.items():
    print(f"Year: {year}")
    print(f"Weighted Median Center: {result['Weighted Median Center']}")
    print(f"Min Distance Point: {result['Min Distance Point']}")
    print()