In [2]:
# Relative paths of the two input CSV files read by the loading cell below.
pop_csv_file = "population data_transposed.csv"  # per-state population totals
edu_csv_file = "education data new_transposed.csv"  # per-state attainment rates

In [3]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# read in the data
edu_data = pd.read_csv(edu_csv_file)
pop_data = pd.read_csv(pop_csv_file)

# Join the two tables on the shared "State" key instead of by row position.
# A positional concat silently pairs the wrong rows whenever the two files are
# ordered differently; a keyed merge cannot. validate="one_to_one" makes pandas
# raise if a state appears more than once in either file.
# NOTE(review): assumes "State" is the only column the two files share -- confirm.
data = pd.merge(edu_data, pop_data, on="State", how="inner", validate="one_to_one")

# An inner join drops any state that is missing from (or spelled differently in)
# one of the files. Fail loudly rather than analyse a silently truncated table.
if not (len(data) == len(edu_data) == len(pop_data)):
    raise ValueError(
        "State keys do not match between the education and population files: "
        f"{len(edu_data)} education rows, {len(pop_data)} population rows, "
        f"{len(data)} rows after joining on 'State'"
    )

# print the first 5 rows of the data
print(data.head())

        State High school graduate or higher Bachelor's degree or higher  \
0     Alabama                         88.80%                      28.80%   
1      Alaska                         93.30%                      30.60%   
2     Arizona                         89.20%                      33.00%   
3    Arkansas                         89.10%                      25.40%   
4  California                         84.70%                      37.00%   

        Total  
0   5,024,279  
1     733,391  
2   7,151,502  
3   3,011,524  
4  39,538,223  


In [5]:
# State - High school graduate or higher - Bachelor's degree or higher - Total

# perform t-test
from scipy.stats import ttest_ind

# Three independent, initially empty accumulators (one list object each) that
# the conversion loop further down fills with one numeric value per row.
high_school_data, bachelors_data, total_pop_data = [], [], []

def convert_percentage_to_number(x):
    """Convert a percentage string such as ``"88.80%"`` to a fraction (0.888).

    Leading/trailing whitespace is ignored and the trailing percent sign is
    optional, so ``"88.8"`` and ``" 88.8% "`` both yield the same value as
    ``"88.8%"``.

    Args:
        x: Text holding a percentage, with or without a trailing ``%``.

    Returns:
        The percentage divided by 100, as a float.

    Raises:
        ValueError: If the text (minus whitespace and ``%``) is not a valid
            float literal.
    """
    text = x.strip()
    # Remove the percent sign only when it is actually there. Unconditionally
    # slicing off the last character would silently drop a digit from inputs
    # that carry no '%' (e.g. "88.8" -> "88." -> 0.88).
    if text.endswith('%'):
        text = text[:-1]
    return float(text) / 100

def convert_number(x):
    """Convert a count such as ``"5,024,279"`` to an ``int``.

    Thousands separators (commas) are removed before parsing. Values that are
    already integers are accepted too: the input is passed through ``str()``
    first, so a column that was parsed as numbers rather than as text does not
    fail on a missing ``.replace`` method.

    Args:
        x: A string with optional comma separators, or an integer.

    Returns:
        The value as a Python ``int``.

    Raises:
        ValueError: If the text left after removing commas is not a valid
            integer literal.
    """
    return int(str(x).replace(',', ''))

# Walk the three raw text columns in lockstep, one row at a time, and append
# the parsed numeric value of each to its accumulator list.
raw_rows = zip(
    data['High school graduate or higher'],
    data["Bachelor's degree or higher"],
    data['Total'],
)
for hs_text, bachelors_text, total_text in raw_rows:
    high_school_data.append(convert_percentage_to_number(hs_text))
    bachelors_data.append(convert_percentage_to_number(bachelors_text))
    total_pop_data.append(convert_number(total_text))

# Independent two-sample t-tests (scipy.stats.ttest_ind) for each pair of series.
# NOTE(review): total_pop_data holds raw head counts while the other two lists
# hold fractions, so the two population comparisons test means on completely
# different scales (their recorded statistics are almost identical) -- confirm
# that this is the intended analysis.
ttest = ttest_ind(high_school_data, bachelors_data)
ttest_hs = ttest_ind(total_pop_data, high_school_data)
ttest_bachelors = ttest_ind(total_pop_data, bachelors_data)

# Report the results in the same order they were computed.
for ttest_label, ttest_result in (
    ("T-test between high school graduate or higher and bachelors degree or higher: ", ttest),
    ("T-test between population and high school graduate or higher: ", ttest_hs),
    ("T-test between population and bachelors degree or higher: ", ttest_bachelors),
):
    print(ttest_label, ttest_result)

T-test between high school graduate or higher and bachelors degree or higher:  TtestResult(statistic=54.3467054051472, pvalue=4.0063535901463424e-77, df=102.0)
T-test between population and high school graduate or higher:  TtestResult(statistic=6.316806047334502, pvalue=7.065823672498182e-09, df=102.0)
T-test between population and bachelors degree or higher:  TtestResult(statistic=6.316806594991512, pvalue=7.065805688846119e-09, df=102.0)


In [6]:
# perform ANOVA
from scipy.stats import f_oneway

# One-way ANOVA (scipy.stats.f_oneway) on the same three pairs of series.
# NOTE(review): each call compares exactly two groups; the recorded F statistics
# equal the squares of the earlier t statistics and the p-values coincide, so
# this cell appears to restate the t-test results -- confirm it is needed.
anova = f_oneway(high_school_data, bachelors_data)
anova_hs = f_oneway(total_pop_data, high_school_data)
anova_bachelors = f_oneway(total_pop_data, bachelors_data)

# Report the results in the same order they were computed.
for anova_label, anova_result in (
    ("ANOVA between high school graduate or higher and bachelors degree or higher: ", anova),
    ("ANOVA between population and high school graduate or higher: ", anova_hs),
    ("ANOVA between population and bachelors degree or higher: ", anova_bachelors),
):
    print(anova_label, anova_result)

ANOVA between high school graduate or higher and bachelors degree or higher:  F_onewayResult(statistic=2953.5643883938615, pvalue=4.0063535901459075e-77)
ANOVA between population and high school graduate or higher:  F_onewayResult(statistic=39.90203863964173, pvalue=7.065823672498275e-09)
ANOVA between population and bachelors degree or higher:  F_onewayResult(statistic=39.902045558528265, pvalue=7.065805688846224e-09)


In [10]:
# Rebind two of the series as NumPy arrays. bachelors_data is not converted
# here; it is handed to np.corrcoef as-is.
high_school_data = np.array(high_school_data)
total_pop_data = np.array(total_pop_data)

# np.corrcoef returns the 2x2 correlation matrix of its two inputs; the
# off-diagonal entry [0, 1] is the coefficient between them.
pop_hs_matrix = np.corrcoef(total_pop_data, high_school_data)
correlation_hs = pop_hs_matrix[0, 1]
print("Correlation coefficient between population and high school graduate or higher: ", correlation_hs)

pop_bachelors_matrix = np.corrcoef(total_pop_data, bachelors_data)
correlation_bachelors = pop_bachelors_matrix[0, 1]
print("Correlation coefficient between population and bachelors degree or higher: ", correlation_bachelors)

Correlation coefficient between population and high school graduate or higher:  -0.47788784868743467
Correlation coefficient between population and bachelors degree or higher:  0.05724716936392127
