In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import requests

from epitools import get_data

In [2]:
year = 2017
if year >= 2016:
    URL = f"https://www2.census.gov/programs-surveys/demo/tables/geographic-mobility/{year}/state-to-state-migration/State_to_State_Migrations_Table_{year}.xls"
else:
    URL = f"https://www2.census.gov/programs-surveys/demo/tables/geographic-mobility/{year}/state-to-state-migration/state_to_state_migrations_table_{year}.xls"

df = pd.read_excel(
    URL,
    header=6
)
df = df.rename(columns={"Unnamed: 0": "state_target"}) #, "Unnamed: 1": "population"
df["year"] = year

columns = list(filter(lambda x: "Unnamed" not in x and "Total" not in x, list(df)))
df = df[columns].dropna(subset=["state_target"])
df = df[~df["state_target"].isin(["United States1", "United States2", "Current residence in --"])].head(52).replace("N/A2", np.nan)

df.head()

Unnamed: 0,state_target,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Puerto Rico,U.S. Island Area,Foreign Country4,year
4,Alabama,,424.0,1513.0,517.0,2328.0,1974,336,22,378,...,0,4277,1289,1070,1121,104,476,0,13191,2017
5,Alaska,942.0,,1387.0,203.0,3556.0,1487,0,0,0,...,0,360,3653,0,2500,6,0,965,5738,2017
6,Arizona,1337.0,2255.0,,1763.0,59233.0,12810,730,810,362,...,96,3593,14392,318,3897,997,438,91,44793,2017
7,Arkansas,574.0,241.0,1060.0,,5334.0,2547,0,0,0,...,0,1087,1539,0,958,12,573,0,8385,2017
8,California,3715.0,2606.0,26907.0,3471.0,,25038,5935,491,6818,...,895,21162,33143,789,9348,1395,1864,842,316046,2017


In [3]:
dd = df.melt(id_vars=["state_target", "year"], var_name="state_origin", value_name="migration").dropna()
dd

Unnamed: 0,state_target,year,state_origin,migration
1,Alaska,2017,Alabama,942
2,Arizona,2017,Alabama,1337
3,Arkansas,2017,Alabama,574
4,California,2017,Alabama,3715
5,Colorado,2017,Alabama,3120
...,...,...,...,...
2803,Washington,2017,Foreign Country4,61405
2804,West Virginia,2017,Foreign Country4,2988
2805,Wisconsin,2017,Foreign Country4,20492
2806,Wyoming,2017,Foreign Country4,2403


In [4]:
aa = dd.pivot(index="state_origin", columns="state_target", values="migration")
aa = aa.loc[list(aa.columns)]
aa.head()

state_target,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
state_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,,942.0,1337.0,574.0,3715.0,3120,206,506,494,9389,...,144,9523,10235,644,0,2341,1605,239,757,264
Alaska,424.0,,2255.0,241.0,2606.0,2030,540,0,0,1400,...,183,417,3543,660,203,736,7532,121,389,152
Arizona,1513.0,1387.0,,1060.0,26907.0,6765,644,76,549,5577,...,402,2522,14875,8144,132,2587,11733,13,1702,220
Arkansas,517.0,203.0,1763.0,,3471.0,1356,160,0,105,3028,...,83,5843,14222,177,0,1184,1074,0,601,722
California,2328.0,3556.0,59233.0,5334.0,,27014,2807,580,3763,30919,...,3599,7879,63174,17020,1246,14509,52484,563,4741,2344


In [5]:
test = (aa > aa.T).reset_index().melt(id_vars=["state_origin"], value_name="in_out")
test["in_out"] = test["in_out"].astype(int)
test = test[test["state_origin"] != test["state_target"]]
test["year"] = year - 1
for suffix in ["origin", "target"]:
    test[f"state_{suffix}"] = test[f"state_{suffix}"].str.upper()

test

Unnamed: 0,state_origin,state_target,in_out,year
1,ALASKA,ALABAMA,0,2016
2,ARIZONA,ALABAMA,1,2016
3,ARKANSAS,ALABAMA,0,2016
4,CALIFORNIA,ALABAMA,0,2016
5,COLORADO,ALABAMA,0,2016
...,...,...,...,...
2698,VERMONT,WYOMING,0,2016
2699,VIRGINIA,WYOMING,1,2016
2700,WASHINGTON,WYOMING,1,2016
2701,WEST VIRGINIA,WYOMING,0,2016


In [6]:
output = []
method = "nv"

for year in [2000, 2004, 2008, 2012, 2016, 2020]:
    df_tmp = get_data(
        country="United States",
        year=year,
        aggregation="state",
        election="first_round",
        method=method
    )
    df_tmp["year"] = year
    
    output.append(df_tmp)

df_dv = pd.concat(output, ignore_index=True)
df_dv["state"] = df_dv["state"].str.upper()
df_dv["year"] = df_dv["year"].astype(int)
df_dv = df_dv.rename(columns={"epi": "value"})
df_dv.head()

Unnamed: 0,state,value,epi_between,epi_within,year
0,ALABAMA,0.954538,0.812398,0.14214,2000
1,ALASKA,0.791327,0.648163,0.143164,2000
2,ARIZONA,0.969194,0.886976,0.082218,2000
3,ARKANSAS,0.99221,0.842943,0.149267,2000
4,CALIFORNIA,0.997502,0.782541,0.214961,2000


In [12]:
df_gini = pd.read_csv("data_curated/United States/Income_Inequality.csv")
df_gini.head()

Unnamed: 0,year,state,gini,moe
0,1917,UNITED STATES,0.507681,
1,1917,ALABAMA,0.399792,
2,1917,ARIZONA,0.399658,
3,1917,ARKANSAS,0.376483,
4,1917,CALIFORNIA,0.449363,


In [14]:
def clean_bea_data(
    PATH,
    value_name="value"
):
    tmp = pd.read_csv(PATH, header=3).dropna(subset=["GeoName"])
    tmp = tmp.melt(id_vars=["GeoFips", "GeoName"], var_name="year", value_name=value_name)
    tmp = tmp.rename(columns={"GeoFips": "state_id", "GeoName": "state"})
    tmp = tmp[~(tmp["state_id"].str[0] == "9")]
    tmp = tmp[tmp[value_name] != "(NA)"]
    tmp = tmp[tmp["state_id"] != "00000"]
    tmp["state"] = tmp["state"].replace({" *": ""}).str.upper()
    tmp = tmp.drop(columns=["state_id"])
    tmp[value_name] = tmp[value_name].astype(float)
    tmp["year"] = tmp["year"].astype(int)

    return tmp


df_personal_income = clean_bea_data(
    PATH="data_external/United States/BEA_Personal_income.csv",
    value_name="personal_income"
)
df_personal_income.head()

Unnamed: 0,state,year,personal_income
1,ALABAMA,1929,319.0
3,ARIZONA,1929,598.0
4,ARKANSAS,1929,303.0
5,CALIFORNIA,1929,994.0
6,COLORADO,1929,631.0


In [15]:
cc = pd.merge(test, df_dv, left_on=["state_origin", "year"], right_on=["state", "year"])
cc = pd.merge(cc, df_dv, left_on=["state_target", "year"], right_on=["state", "year"])
cc = pd.merge(cc, df_gini, left_on=["state_origin", "year"], right_on=["state", "year"])
cc = pd.merge(cc, df_gini, left_on=["state_target", "year"], right_on=["state", "year"])
cc = pd.merge(cc, df_personal_income, left_on=["state_origin", "year"], right_on=["state", "year"])
cc = pd.merge(cc, df_personal_income, left_on=["state_target", "year"], right_on=["state", "year"])

cc["epi_in_out"] = cc["value_x"] > cc["value_y"]
cc["epi_diff"] = cc["value_x"] - cc["value_y"]
cc["epi_diff_between"] = cc["epi_between_x"] - cc["epi_between_y"]
cc["epi_diff_within"] = cc["epi_within_x"] - cc["epi_within_y"]

cc.to_csv("test.csv")
cc

  cc = pd.merge(cc, df_gini, left_on=["state_target", "year"], right_on=["state", "year"])
  cc = pd.merge(cc, df_personal_income, left_on=["state_target", "year"], right_on=["state", "year"])


Unnamed: 0,state_origin,state_target,in_out,year,state_x,value_x,epi_between_x,epi_within_x,state_y,value_y,...,gini_y,moe_y,state_x.1,personal_income_x,state_y.1,personal_income_y,epi_in_out,epi_diff,epi_diff_between,epi_diff_within
0,ARIZONA,ALABAMA,1,2016,ARIZONA,0.987250,0.891965,0.095286,ALABAMA,0.935563,...,0.4769,±0.0019,ARIZONA,41473.0,ALABAMA,39014.0,True,0.051687,0.238536,-0.186849
1,ARKANSAS,ALABAMA,0,2016,ARKANSAS,0.888906,0.650742,0.238164,ALABAMA,0.935563,...,0.4769,±0.0019,ARKANSAS,40873.0,ALABAMA,39014.0,False,-0.046657,-0.002686,-0.043971
2,CALIFORNIA,ALABAMA,0,2016,CALIFORNIA,0.899948,0.669357,0.230591,ALABAMA,0.935563,...,0.4769,±0.0019,CALIFORNIA,56560.0,ALABAMA,39014.0,False,-0.035615,0.015929,-0.051544
3,COLORADO,ALABAMA,0,2016,COLORADO,0.988376,0.743760,0.244617,ALABAMA,0.935563,...,0.4769,±0.0019,COLORADO,52390.0,ALABAMA,39014.0,True,0.052813,0.090331,-0.037518
4,CONNECTICUT,ALABAMA,1,2016,CONNECTICUT,0.916482,0.835912,0.080569,ALABAMA,0.935563,...,0.4769,±0.0019,CONNECTICUT,68680.0,ALABAMA,39014.0,False,-0.019082,0.182484,-0.201565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2251,WASHINGTON,ARIZONA,1,2016,WASHINGTON,0.975183,0.741111,0.234072,ARIZONA,0.987250,...,0.4682,±0.0018,WASHINGTON,54918.0,ARIZONA,41473.0,False,-0.012067,-0.150854,0.138787
2252,WEST VIRGINIA,ARIZONA,1,2016,WEST VIRGINIA,0.709818,0.571399,0.138420,ARIZONA,0.987250,...,0.4682,±0.0018,WEST VIRGINIA,37380.0,ARIZONA,41473.0,False,-0.277432,-0.320566,0.043134
2253,WISCONSIN,ARIZONA,1,2016,WISCONSIN,0.997991,0.751453,0.246538,ARIZONA,0.987250,...,0.4682,±0.0018,WISCONSIN,47205.0,ARIZONA,41473.0,True,0.010740,-0.140512,0.151252
2254,WYOMING,ARIZONA,1,2016,WYOMING,0.702832,0.489195,0.213637,ARIZONA,0.987250,...,0.4682,±0.0018,WYOMING,54827.0,ARIZONA,41473.0,False,-0.284418,-0.402769,0.118351


In [16]:
cc[cc["state_target"] == "CALIFORNIA"]

Unnamed: 0,state_origin,state_target,in_out,year,state_x,value_x,epi_between_x,epi_within_x,state_y,value_y,...,gini_y,moe_y,state_x.1,personal_income_x,state_y.1,personal_income_y,epi_in_out,epi_diff,epi_diff_between,epi_diff_within
94,ARIZONA,CALIFORNIA,0,2016,ARIZONA,0.98725,0.891965,0.095286,CALIFORNIA,0.899948,...,0.488,±0.0010,ARIZONA,41473.0,CALIFORNIA,56560.0,True,0.087302,0.222607,-0.135306
95,ARKANSAS,CALIFORNIA,0,2016,ARKANSAS,0.888906,0.650742,0.238164,CALIFORNIA,0.899948,...,0.488,±0.0010,ARKANSAS,40873.0,CALIFORNIA,56560.0,False,-0.011042,-0.018615,0.007572
96,COLORADO,CALIFORNIA,0,2016,COLORADO,0.988376,0.74376,0.244617,CALIFORNIA,0.899948,...,0.488,±0.0010,COLORADO,52390.0,CALIFORNIA,56560.0,True,0.088428,0.074402,0.014026
97,CONNECTICUT,CALIFORNIA,1,2016,CONNECTICUT,0.916482,0.835912,0.080569,CALIFORNIA,0.899948,...,0.488,±0.0010,CONNECTICUT,68680.0,CALIFORNIA,56560.0,True,0.016533,0.166555,-0.150022
98,DELAWARE,CALIFORNIA,0,2016,DELAWARE,0.981587,0.753831,0.227756,CALIFORNIA,0.899948,...,0.488,±0.0010,DELAWARE,48734.0,CALIFORNIA,56560.0,True,0.081638,0.084474,-0.002836
99,FLORIDA,CALIFORNIA,0,2016,FLORIDA,0.99844,0.774788,0.223653,CALIFORNIA,0.899948,...,0.488,±0.0010,FLORIDA,46454.0,CALIFORNIA,56560.0,True,0.098492,0.10543,-0.006938
100,GEORGIA,CALIFORNIA,0,2016,GEORGIA,1.005585,0.644877,0.360709,CALIFORNIA,0.899948,...,0.488,±0.0010,GEORGIA,43033.0,CALIFORNIA,56560.0,True,0.105637,-0.024481,0.130117
101,IDAHO,CALIFORNIA,0,2016,IDAHO,0.854455,0.641937,0.212518,CALIFORNIA,0.899948,...,0.488,±0.0010,IDAHO,40098.0,CALIFORNIA,56560.0,False,-0.045493,-0.02742,-0.018073
102,ILLINOIS,CALIFORNIA,1,2016,ILLINOIS,0.97723,0.669594,0.307636,CALIFORNIA,0.899948,...,0.488,±0.0010,ILLINOIS,52036.0,CALIFORNIA,56560.0,True,0.077281,0.000237,0.077044
103,INDIANA,CALIFORNIA,0,2016,INDIANA,0.938855,0.698549,0.240307,CALIFORNIA,0.899948,...,0.488,±0.0010,INDIANA,43645.0,CALIFORNIA,56560.0,True,0.038907,0.029191,0.009715


In [9]:
# aa - np.tile(aa.mean(axis=0), (52, 1)).T > 0