In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.ensemble import RandomForestRegressor
from prophet import Prophet
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error



In [2]:
immigration_data_2012_2021 = "Resources/immigration_data_2005_2021.csv"
#state_unemployment = "Resources/emp-unemployment.xls"
#gini = "Resources/gini_index_by_state_2016_2018.xlsx"

In [3]:
inmigration = pd.read_csv(immigration_data_2012_2021)
inmigration.head()

Unnamed: 0,Year,Region and country of birth,Total Permanent Residents,Percentage,Alabama,Alaska,Arizona,Arkansas,California,Colorado,...,Virginia,Washington,West Virginia,Wisconsin,Wyoming,U.S. Armed Services posts,U.S. possessions,U.S. Dependencies,Guam,Puerto Rico
0,2005,Total,1122373,100.0,4200,1525,18988,2698,232023,11977,...,27100,26482,847,7909,321,128,5868,0,0,0
1,2005,"China, People's Republic",69967,6.23,328,92,543,202,17668,765,...,1327,1508,101,593,28,13,184,0,0,0
2,2005,Dominican Republic,27504,2.45,5,42,22,0,82,6,...,90,18,6,39,0,0,2346,0,0,0
3,2005,India,84681,7.54,431,15,739,215,14724,516,...,2776,1747,133,876,0,0,24,0,0,0
4,2005,Iran,13887,1.24,48,4,285,9,7059,131,...,562,318,18,48,0,0,0,0,0,0


## Cleaning country regressor dataset

In [4]:
#Read the csv
countries_raw_metadata = pd.read_csv("Resources/countries_metadata.csv", encoding='latin-1')
display(countries_raw_metadata.head())
print(countries_raw_metadata.shape)

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022]
0,China,CHN,"Population, total",SP.POP.TOTL,667070000,660330000,665770000,682335000,698355000,715185000,...,1363240000.0,1371860000.0,1379860000,1387790000,1396215000.0,1402760000.0,1407745000.0,1411100000.0,1412360000.0,1412175000
1,China,CHN,GDP per capita (current US$),NY.GDP.PCAP.CD,89.5202179159242,75.8055639067957,70.9091553472599,74.3133748233246,85.4982461036748,98.4864217470782,...,7020.38568208449,7636.07422276004,8016.44601585644,8094.39037512163,8817.04549566316,9905.406,10143.8602060373,10408.7191247747,12617.5049863004,12720.2156397612
2,China,CHN,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,..,..,..,..,..,..,...,4.05,4.1,..,..,3.9,4.93,5.15,5.61,5.11,..
3,China,CHN,Urban population (% of total population),SP.URB.TOTL.IN.ZS,16.203,16.708,17.226,17.757,18.299,18.086,...,53.013,54.259,55.5,56.736,57.96,59.152,60.308,61.428,62.512,63.56
4,China,CHN,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,20.86,18.02,37.01,43.37,39.14,37.88,...,13.03,13.83,11.99,13.57,12.64,10.86,10.41,8.52,7.52,..


(70, 67)


In [5]:
#Delete the "[YR..]" in the year columns
for column in countries_raw_metadata.columns:
    if "[YR" in column:
        new_column_name = column.split("[")[0].strip() # Extract the part before '[YR' and remove any leading/trailing spaces
        countries_raw_metadata.rename(columns={column: new_column_name}, inplace=True)
countries_raw_metadata.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,China,CHN,"Population, total",SP.POP.TOTL,667070000,660330000,665770000,682335000,698355000,715185000,...,1363240000.0,1371860000.0,1379860000,1387790000,1396215000.0,1402760000.0,1407745000.0,1411100000.0,1412360000.0,1412175000
1,China,CHN,GDP per capita (current US$),NY.GDP.PCAP.CD,89.5202179159242,75.8055639067957,70.9091553472599,74.3133748233246,85.4982461036748,98.4864217470782,...,7020.38568208449,7636.07422276004,8016.44601585644,8094.39037512163,8817.04549566316,9905.406,10143.8602060373,10408.7191247747,12617.5049863004,12720.2156397612
2,China,CHN,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,..,..,..,..,..,..,...,4.05,4.1,..,..,3.9,4.93,5.15,5.61,5.11,..
3,China,CHN,Urban population (% of total population),SP.URB.TOTL.IN.ZS,16.203,16.708,17.226,17.757,18.299,18.086,...,53.013,54.259,55.5,56.736,57.96,59.152,60.308,61.428,62.512,63.56
4,China,CHN,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,20.86,18.02,37.01,43.37,39.14,37.88,...,13.03,13.83,11.99,13.57,12.64,10.86,10.41,8.52,7.52,..


In [6]:
#Clean the "Country Names" columns 
unique_names = countries_raw_metadata["Country Name"].unique()
unique_names

array(['China', 'United States', 'Iran, Islamic Rep.', 'India',
       'Dominican Republic', 'Mexico', 'United Kingdom', 'Pakistan',
       'Philippines', nan,
       'Data from database: World Development Indicators',
       'Last Updated: 06/29/2023', 'Code', 'SP.POP.TOTL',
       'NY.GDP.PCAP.CD', 'SL.UEM.TOTL.NE.ZS',
       'Unemployment is a key measure to monitor whether a country is on track to achieve the Sustainable Development Goal of promoting sustained',
       'However', 'SP.URB.TOTL.IN.ZS',
       "Percentages urban are the numbers of persons residing in an area defined as ''urban'' per 100 total population. They are calculated by the Statistics Division of the United Nations Department of Economic and Social Affairs. Particular caution should be used in interpreting the figures for percentage urban for different countries.",
       'Countries differ in the way they classify population as "urban" or "rural." The population of a city or metropolitan area depends on the bou

In [7]:
#List the country names we want to keep.
attributes_to_keep = ['China', 'United States', 'Iran, Islamic Rep.', 'India', 'Dominican Republic', 'Mexico', 'United Kingdom', 'Pakistan', 'Philippines']

# Filter the DataFrame to keep only the desired attributes
df_filtered = countries_raw_metadata[countries_raw_metadata['Country Name'].isin(attributes_to_keep)]

# Print the filtered DataFrame
df_filtered.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,China,CHN,"Population, total",SP.POP.TOTL,667070000,660330000,665770000,682335000,698355000,715185000,...,1363240000.0,1371860000.0,1379860000,1387790000,1396215000.0,1402760000.0,1407745000.0,1411100000.0,1412360000.0,1412175000
1,China,CHN,GDP per capita (current US$),NY.GDP.PCAP.CD,89.5202179159242,75.8055639067957,70.9091553472599,74.3133748233246,85.4982461036748,98.4864217470782,...,7020.38568208449,7636.07422276004,8016.44601585644,8094.39037512163,8817.04549566316,9905.406,10143.8602060373,10408.7191247747,12617.5049863004,12720.2156397612
2,China,CHN,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,..,..,..,..,..,..,...,4.05,4.1,..,..,3.9,4.93,5.15,5.61,5.11,..
3,China,CHN,Urban population (% of total population),SP.URB.TOTL.IN.ZS,16.203,16.708,17.226,17.757,18.299,18.086,...,53.013,54.259,55.5,56.736,57.96,59.152,60.308,61.428,62.512,63.56
4,China,CHN,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,20.86,18.02,37.01,43.37,39.14,37.88,...,13.03,13.83,11.99,13.57,12.64,10.86,10.41,8.52,7.52,..


In [8]:
#Prepare the "Series Name" to move it to the columns, these will be our regressors. 
series_names = df_filtered["Series Name"].unique()
series_names

array(['Population, total', 'GDP per capita (current US$)',
       'Unemployment, total (% of total labor force) (national estimate)',
       'Urban population (% of total population)',
       'Birth rate, crude (per 1,000 people)', 'Gini index'], dtype=object)

In [9]:
df_filtered.loc[df_filtered["Series Name"] == 'Population, total', "Series Name"] = "Population_total"
df_filtered.loc[df_filtered["Series Name"] == 'GDP per capita (current US$)', "Series Name"] = "GDP_per_capita"
df_filtered.loc[df_filtered["Series Name"] == 'Unemployment, total (% of total labor force) (national estimate)', "Series Name"] = "UR"
df_filtered.loc[df_filtered["Series Name"] == 'Urban population (% of total population)', "Series Name"] = "Urban_population"
df_filtered.loc[df_filtered["Series Name"] == 'Birth rate, crude (per 1,000 people)', "Series Name"] = "Birth_rate_crude"
df_filtered.loc[df_filtered["Series Name"] == 'Gini index', "Series Name"] = "Gini_index"
df_filtered.loc[df_filtered["Country Name"] == 'Iran, Islamic Rep.', "Country Name"] = "Iran"
df_filtered

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,China,CHN,Population_total,SP.POP.TOTL,667070000,660330000,665770000,682335000,698355000,715185000,...,1363240000,1371860000,1379860000,1387790000,1396215000,1402760000.0,1407745000,1411100000,1412360000,1412175000
1,China,CHN,GDP_per_capita,NY.GDP.PCAP.CD,89.5202179159242,75.8055639067957,70.9091553472599,74.3133748233246,85.4982461036748,98.4864217470782,...,7020.38568208449,7636.07422276004,8016.44601585644,8094.39037512163,8817.04549566316,9905.406,10143.8602060373,10408.7191247747,12617.5049863004,12720.2156397612
2,China,CHN,UR,SL.UEM.TOTL.NE.ZS,..,..,..,..,..,..,...,4.05,4.1,..,..,3.9,4.93,5.15,5.61,5.11,..
3,China,CHN,Urban_population,SP.URB.TOTL.IN.ZS,16.203,16.708,17.226,17.757,18.299,18.086,...,53.013,54.259,55.5,56.736,57.96,59.152,60.308,61.428,62.512,63.56
4,China,CHN,Birth_rate_crude,SP.DYN.CBRT.IN,20.86,18.02,37.01,43.37,39.14,37.88,...,13.03,13.83,11.99,13.57,12.64,10.86,10.41,8.52,7.52,..
5,China,CHN,Gini_index,SI.POV.GINI,..,..,..,..,..,..,...,39.7,39.2,38.6,38.5,39.1,38.5,38.2,..,..,..
6,United States,USA,Population_total,SP.POP.TOTL,180671000,183691000,186538000,189242000,191889000,194303000,...,316059947,318386329,320738994,323071755,325122128,326838200.0,328329953,331511512,332031554,333287557
7,United States,USA,GDP_per_capita,NY.GDP.PCAP.CD,3007.12344537862,3066.56286916615,3243.84307754988,3374.51517105082,3573.94118474743,3827.52710972039,...,53291.1276891406,55123.8497869046,56762.7294515989,57866.7449341091,59907.754260885,62823.31,65120.3946628653,63528.6343027508,70219.472454115,76398.5917422054
8,United States,USA,UR,SL.UEM.TOTL.NE.ZS,5.5,6.7,5.5,5.7,5.2,4.5,...,7.37,6.17,5.28,4.87,4.36,3.9,3.67,8.05,5.35,3.65
9,United States,USA,Urban_population,SP.URB.TOTL.IN.ZS,69.996,70.377,70.757,71.134,71.508,71.879,...,81.299,81.483,81.671,81.862,82.058,82.256,82.459,82.664,82.873,83.084


In [10]:
# Get rid the unnecessary columns
countries_regressors_raw = df_filtered.drop(columns= {"Country Code", "Series Code"} )
countries_regressors_raw.head()

Unnamed: 0,Country Name,Series Name,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,China,Population_total,667070000,660330000,665770000,682335000,698355000,715185000,735400000,754550000,...,1363240000.0,1371860000.0,1379860000,1387790000,1396215000.0,1402760000.0,1407745000.0,1411100000.0,1412360000.0,1412175000
1,China,GDP_per_capita,89.5202179159242,75.8055639067957,70.9091553472599,74.3133748233246,85.4982461036748,98.4864217470782,104.324189073855,96.58918279471,...,7020.38568208449,7636.07422276004,8016.44601585644,8094.39037512163,8817.04549566316,9905.406,10143.8602060373,10408.7191247747,12617.5049863004,12720.2156397612
2,China,UR,..,..,..,..,..,..,..,..,...,4.05,4.1,..,..,3.9,4.93,5.15,5.61,5.11,..
3,China,Urban_population,16.203,16.708,17.226,17.757,18.299,18.086,17.915,17.785,...,53.013,54.259,55.5,56.736,57.96,59.152,60.308,61.428,62.512,63.56
4,China,Birth_rate_crude,20.86,18.02,37.01,43.37,39.14,37.88,35.05,33.96,...,13.03,13.83,11.99,13.57,12.64,10.86,10.41,8.52,7.52,..


In [11]:
#countries_regressors_raw.info()

## Cleaning inmigration csv

In [12]:
migration = inmigration.rename(columns={"Region and country of birth": "Country_origin", "Total Permanent Residents": "Total_inmigrants"})
migration = migration[migration["Country_origin"]!= "Total"]
migration = migration.drop(columns="Percentage")
migration["Country_origin"] = migration["Country_origin"].replace("China, People's Republic", "China")
migration

Unnamed: 0,Year,Country_origin,Total_inmigrants,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,...,Virginia,Washington,West Virginia,Wisconsin,Wyoming,U.S. Armed Services posts,U.S. possessions,U.S. Dependencies,Guam,Puerto Rico
1,2005,China,69967,328,92,543,202,17668,765,894,...,1327,1508,101,593,28,13,184,0,0,0
2,2005,Dominican Republic,27504,5,42,22,0,82,6,319,...,90,18,6,39,0,0,2346,0,0,0
3,2005,India,84681,431,15,739,215,14724,516,1571,...,2776,1747,133,876,0,0,24,0,0,0
4,2005,Iran,13887,48,4,285,9,7059,131,88,...,562,318,18,48,0,0,0,0,0,0
5,2005,Mexico,161445,569,96,8373,870,63092,2891,316,...,870,2330,30,1054,75,0,129,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2021,Iran,5734,21,0,86,17,2206,89,39,...,218,187,9,38,0,0,0,0,0,0
149,2021,Mexico,107230,323,33,6859,610,31715,3131,247,...,586,2539,20,949,69,0,0,0,0,39
150,2021,Pakistan,9691,31,0,59,41,1104,52,111,...,784,166,17,57,3,0,0,0,0,0
151,2021,Philippines,27511,152,190,555,129,6478,228,178,...,648,737,69,236,27,0,0,0,300,9


In [13]:
country_names=['China', 'India',
       'Dominican Republic', 'Mexico', 'United Kingdom', 'Pakistan',
       'Philippines']




state_names=['Alabama', 'Alaska',
       'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
       'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii',
       'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
       'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
       'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming']



variables_by_state_raw = pd.read_csv("Resources/variable 2011-2021.csv")
variables_by_state_rename = variables_by_state_raw.rename(columns={"Unemployeement Rate": "UR_STATE",
                                                                   "GDP Per Capita":"GDP_per_capita_state", "Birth Rate":"Bith_rate_state"  })
# Multiply the "unemployment rate" column by 10
variables_by_state_rename["UR_STATE"] *= 10



results_df = pd.DataFrame(columns=['Country', 'State', 'MAE', 'MAPE','pred2023'])


for country in country_names:
    
    # Filter by country
    regressor_by_country = countries_regressors_raw.loc[countries_regressors_raw["Country Name"]== country]
    # Drop the Country Name column
    drop_countryname = regressor_by_country.drop(columns="Country Name")
    # Transpose rows per columns
    regressor_by_country_not_cleaned = drop_countryname.transpose()
    # Use the first row as the column names
    regressor_by_country_not_cleaned.columns = regressor_by_country_not_cleaned.iloc[0]
    # Drop the first row (previous integer index) since it's now redundant
    regressor_by_country_drop_row = regressor_by_country_not_cleaned.iloc[1:]
    print(country)
    regressor_by_country_reset_index = regressor_by_country_drop_row.reset_index().rename(columns={"index":"Year"})
    display(regressor_by_country_reset_index)
    regressor_by_country_reset_index["Year"] = regressor_by_country_reset_index["Year"].astype(int)
    regressor_by_country_reset_index["Population_total"] = regressor_by_country_reset_index["Population_total"].astype(int)
    regressor_by_country_reset_index["GDP_per_capita"] = regressor_by_country_reset_index["GDP_per_capita"].astype(float)
    regressor_by_country_reset_index['UR'].replace('..', np.nan, inplace=True)
    regressor_by_country_reset_index["UR"] = regressor_by_country_reset_index["UR"].astype(float)
    regressor_by_country_reset_index["Urban_population"] = regressor_by_country_reset_index["Urban_population"].astype(float)
    regressor_by_country_reset_index['Birth_rate_crude'].replace('..', np.nan, inplace=True)
    regressor_by_country_reset_index["Birth_rate_crude"] = regressor_by_country_reset_index["Birth_rate_crude"].astype(float)
    regressor_by_country_reset_index['Gini_index'].replace('..', np.nan, inplace=True)
    regressor_by_country_reset_index["Gini_index"] = regressor_by_country_reset_index["Gini_index"].astype(float)
    regressor_by_country = regressor_by_country_reset_index
    
    
    for state in state_names:



        # Getting the DataFrame with the years, total number of inmigrants in the US for a given country and 
        # the number of inmigrants of this country in one US state
        county_origin_us_df = migration[migration["Country_origin"]==country]
        country_columns = ["Year","Total_inmigrants",state]
        inmigration_us_df = county_origin_us_df[country_columns]

        #Filter by state
        state_test = variables_by_state_rename[variables_by_state_rename["State"]==state]
        regressor_by_state = state_test.drop(columns= "State").reset_index(drop=True)
    
        # First merge inmigration_us_df and regressor_by_country
        df_merged1 = pd.merge(inmigration_us_df, regressor_by_country, on="Year", how="left")

        # Then merge the result with regressor_by_state
        df_merged = pd.merge(df_merged1, regressor_by_state, on="Year", how="left")

        
        # PROPHET TRAINING
        
        # Handle NaN values in the DataFrame
        # For simplicity, we'll fill the NaN values with the column means
        df_merged.fillna(df_merged.mean(), inplace=True)
    
    
        
        # Step 1: Prepare the DataFrame with time series data and regressors
        prophet_df = df_merged[['Year', state, 'Population_total', 'GDP_per_capita', 'UR', 'Urban_population', 'Birth_rate_crude', 'Gini_index', 'GDP', 'GDP_per_capita_state']].copy()
        prophet_df.rename(columns={'Year': 'ds', state: 'y'}, inplace=True)
        prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')

        # Step 2: Handle missing values in the DataFrame
        imputer = SimpleImputer(strategy='mean')
        prophet_df[['Population_total', 'GDP_per_capita', 'UR', 'Urban_population', 'Gini_index', 'GDP', 'GDP_per_capita_state']] = imputer.fit_transform(prophet_df[['Population_total', 'GDP_per_capita', 'UR', 'Urban_population', 'Gini_index', 'GDP',  'GDP_per_capita_state']])

        prophet_df_2=prophet_df[:-1].copy()


        # Separate the target variable 'y' and other features
        X = prophet_df_2.drop(columns=['y','ds'])
        y = prophet_df_2['y']

        # Create a Random Forest Regressor model
        rf = RandomForestRegressor(random_state=42)
        rf.fit(X, y)

        # Get the feature importances
        feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})

        # Sort the features by importance in descending order
        feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

        # Display the feature importances
        print("Feature importance for "+country+" and "+state+" "+str(feature_importances))


        top_n_features = feature_importances['Feature'][:3].tolist()


        prophet_df_pred=prophet_df[:-1].copy()

        prophet_df_pred['y']=np.log1p(prophet_df_pred['y'])

        # Step 3: Set up the Prophet model with additional regressors
        holidays = pd.DataFrame({
            'holiday': '2020',
            'ds': pd.to_datetime(['2020-01-01']),
            'lower_window': 0,
            'upper_window': 0,
        })

        model = Prophet(holidays=holidays)

        for feature in top_n_features:
            model.add_regressor(feature)


        # Step 4: Fit the model to your data
        model.fit(prophet_df_pred)
        
        regressors_row_2021 = prophet_df.iloc[-1, :][top_n_features].to_dict()

        future_dates=pd.DataFrame({'ds': pd.to_datetime(['2021'], format='%Y')})


        # Step 5: Create a DataFrame with the future dates for prediction (e.g., for 2021)
        for feature in top_n_features:
            future_dates[feature] = regressors_row_2021[feature]


        future_dates["ds"]=pd.to_datetime('2021', format='%Y')
        
        
        display(future_dates)

        # Step 6: Predict the number of immigrants for the future dates
        forecast = model.predict(future_dates)

        # Step 7: Extract the predicted values for the future dates
        predicted_values = np.exp(forecast['yhat'])

        # Step 8: Display the predicted values for 2021

        from sklearn.metrics import mean_absolute_error
        y_true=df_merged[state][-1:].values
        print("2021- Real data for "+country+" "+state+":"+str(y_true))
        y_pred=np.exp(forecast['yhat'].values)
        print("2021- Predicted for "+country+" "+state+": "+str(y_pred))
        mae=mean_absolute_error(y_true,y_pred)
        print("The MAE while testing 2021 for "+country+" "+state+": "+str(mae))
        mape=mean_absolute_percentage_error(y_true, y_pred)
        print("The MAPE while testing 2021 for "+country+" "+state+": "+str(mape))
        
        
        #mae = mean_absolute_error(y_true, y_pred)
        #mse = mean_squared_error(y_true, y_pred)
        #rmse = np.sqrt(mse)
        #r2 = r2_score(y_true, y_pred)

        #print("Mean Absolute Error (MAE):", mae)
        #print("Mean Squared Error (MSE):", mse)
        #print("Root Mean Squared Error (RMSE):", rmse)
        #print("R-squared (R2):", r2)

        
        future_dates["ds"]=pd.to_datetime('2023', format='%Y')

        forecast=model.predict(future_dates)
        predicted_values = forecast[['ds', 'yhat']]
        print("2023- Predicted for "+country+" "+state+": " +str(np.exp(forecast['yhat'].values)))

        import matplotlib.pyplot as plt

        #fig=model.plot(forecast)
        #plt.scatter(pd.to_datetime(['2023'], format='%Y'), forecast['yhat'], c='r', s=100, label='2023 Prediction')
        #plt.legend()
        #plt.show()

        results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},
                                       ignore_index=True)
        
        
        del(prophet_df)
        del(prophet_df_2)
        del(prophet_df_pred)
        del(model)
        


China


Series Name,Year,Population_total,GDP_per_capita,UR,Urban_population,Birth_rate_crude,Gini_index
0,1960,667070000,89.5202179159242,..,16.203,20.86,..
1,1961,660330000,75.8055639067957,..,16.708,18.02,..
2,1962,665770000,70.9091553472599,..,17.226,37.01,..
3,1963,682335000,74.3133748233246,..,17.757,43.37,..
4,1964,698355000,85.4982461036748,..,18.299,39.14,..
...,...,...,...,...,...,...,...
58,2018,1402760000.0,9905.406118,4.93,59.152,10.86,38.5
59,2019,1407745000,10143.8602060373,5.15,60.308,10.41,38.2
60,2020,1411100000,10408.7191247747,5.61,61.428,8.52,..
61,2021,1412360000,12617.5049863004,5.11,62.512,7.52,..


Feature importance for China and Alabama                 Feature  Importance
2                    UR    0.182732
1        GDP_per_capita    0.175623
4      Birth_rate_crude    0.169791
3      Urban_population    0.149991
0      Population_total    0.129925
6                   GDP    0.112015
5            Gini_index    0.054558
7  GDP_per_capita_state    0.025365


09:32:46 - cmdstanpy - INFO - Chain [1] start processing
09:32:46 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,UR,GDP_per_capita,Birth_rate_crude
0,2021-01-01,5.11,12617.504986,7.52


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Alabama:[146]
2021- Predicted for China Alabama: [129.27804359]
The MAE while testing 2021 for China Alabama: 16.721956408774304
The MAPE while testing 2021 for China Alabama: 0.11453394800530345
2023- Predicted for China Alabama: [121.08752729]
Feature importance for China and Alaska                 Feature  Importance
1        GDP_per_capita    0.341818
0      Population_total    0.291961
3      Urban_population    0.277579
5            Gini_index    0.041019
4      Birth_rate_crude    0.017123
2                    UR    0.014310
6                   GDP    0.011054
7  GDP_per_capita_state    0.005136


09:32:47 - cmdstanpy - INFO - Chain [1] start processing
09:32:47 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita,Population_total,Urban_population
0,2021-01-01,12617.504986,1412360000.0,62.512


2021- Real data for China Alaska:[20]
2021- Predicted for China Alaska: [23.48943793]
The MAE while testing 2021 for China Alaska: 3.4894379322071245
The MAPE while testing 2021 for China Alaska: 0.17447189661035623
2023- Predicted for China Alaska: [11.94159858]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},
09:32:48 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and Arizona                 Feature  Importance
4      Birth_rate_crude    0.214213
5            Gini_index    0.167290
0      Population_total    0.135937
1        GDP_per_capita    0.128047
6                   GDP    0.114899
2                    UR    0.098378
3      Urban_population    0.090038
7  GDP_per_capita_state    0.051199


09:32:48 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,Gini_index,Population_total
0,2021-01-01,7.52,40.333333,1412360000.0


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Arizona:[437]
2021- Predicted for China Arizona: [394.33574537]
The MAE while testing 2021 for China Arizona: 42.66425462760833
The MAPE while testing 2021 for China Arizona: 0.09762987328972157
2023- Predicted for China Arizona: [555.81096124]
Feature importance for China and Arkansas                 Feature  Importance
5            Gini_index    0.251707
2                    UR    0.204603
1        GDP_per_capita    0.143316
4      Birth_rate_crude    0.138181
3      Urban_population    0.108070
0      Population_total    0.092016
7  GDP_per_capita_state    0.042139
6                   GDP    0.019968


09:32:49 - cmdstanpy - INFO - Chain [1] start processing
09:32:49 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Gini_index,UR,GDP_per_capita
0,2021-01-01,40.333333,5.11,12617.504986


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Arkansas:[74]
2021- Predicted for China Arkansas: [113.16198411]
The MAE while testing 2021 for China Arkansas: 39.16198410538318
The MAPE while testing 2021 for China Arkansas: 0.5292160014240971
2023- Predicted for China Arkansas: [120.73383625]
Feature importance for China and California                 Feature  Importance
4      Birth_rate_crude    0.412685
2                    UR    0.111527
5            Gini_index    0.097420
1        GDP_per_capita    0.096806
3      Urban_population    0.095767
0      Population_total    0.066717
7  GDP_per_capita_state    0.064071
6                   GDP    0.055006


09:32:49 - cmdstanpy - INFO - Chain [1] start processing
09:32:50 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,UR,Gini_index
0,2021-01-01,7.52,5.11,40.333333


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China California:[17705]
2021- Predicted for China California: [16112.23532514]
The MAE while testing 2021 for China California: 1592.7646748647821
The MAPE while testing 2021 for China California: 0.0899612920002701
2023- Predicted for China California: [21622.30938621]


09:32:50 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and Colorado                 Feature  Importance
5            Gini_index    0.243050
1        GDP_per_capita    0.191062
3      Urban_population    0.162657
0      Population_total    0.161219
2                    UR    0.111184
4      Birth_rate_crude    0.096425
7  GDP_per_capita_state    0.019983
6                   GDP    0.014421


09:32:51 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Gini_index,GDP_per_capita,Urban_population
0,2021-01-01,40.333333,12617.504986,62.512


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Colorado:[399]
2021- Predicted for China Colorado: [657.35769762]
The MAE while testing 2021 for China Colorado: 258.3576976183699
The MAPE while testing 2021 for China Colorado: 0.6475130266124559
2023- Predicted for China Colorado: [3912.11294485]


09:32:51 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and Connecticut                 Feature  Importance
1        GDP_per_capita    0.314018
0      Population_total    0.238518
3      Urban_population    0.211533
5            Gini_index    0.082691
2                    UR    0.057958
4      Birth_rate_crude    0.044194
7  GDP_per_capita_state    0.033309
6                   GDP    0.017780


09:32:51 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita,Population_total,Urban_population
0,2021-01-01,12617.504986,1412360000.0,62.512


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Connecticut:[368]
2021- Predicted for China Connecticut: [392.9346178]
The MAE while testing 2021 for China Connecticut: 24.93461779518509
The MAPE while testing 2021 for China Connecticut: 0.06775711357387253
2023- Predicted for China Connecticut: [288.88720534]
Feature importance for China and Delaware                 Feature  Importance
7  GDP_per_capita_state    0.307623
4      Birth_rate_crude    0.201900
3      Urban_population    0.106552
1        GDP_per_capita    0.100863
6                   GDP    0.100589
0      Population_total    0.091393
5            Gini_index    0.077482
2                    UR    0.013597


09:32:52 - cmdstanpy - INFO - Chain [1] start processing
09:32:52 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita_state,Birth_rate_crude,Urban_population
0,2021-01-01,130352.601,7.52,62.512


2021- Real data for China Delaware:[151]
2021- Predicted for China Delaware: [203.88372311]
The MAE while testing 2021 for China Delaware: 52.883723105046556
The MAPE while testing 2021 for China Delaware: 0.3502233318215004
2023- Predicted for China Delaware: [2641.61368943]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},
09:32:53 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and District of Columbia                 Feature  Importance
7  GDP_per_capita_state    0.669273
4      Birth_rate_crude    0.193909
0      Population_total    0.033923
3      Urban_population    0.030184
2                    UR    0.028620
5            Gini_index    0.018722
6                   GDP    0.012784
1        GDP_per_capita    0.012586


09:32:53 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita_state,Birth_rate_crude,Population_total
0,2021-01-01,335732.0975,7.52,1412360000.0


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China District of Columbia:[58]
2021- Predicted for China District of Columbia: [11.11996747]
The MAE while testing 2021 for China District of Columbia: 46.88003252931553
The MAPE while testing 2021 for China District of Columbia: 0.8082764229192333
2023- Predicted for China District of Columbia: [6.23605398]
Feature importance for China and Florida                 Feature  Importance
4      Birth_rate_crude    0.292963
2                    UR    0.216353
0      Population_total    0.133421
1        GDP_per_capita    0.110635
7  GDP_per_capita_state    0.100137
3      Urban_population    0.074505
5            Gini_index    0.049401
6                   GDP    0.022584


09:32:53 - cmdstanpy - INFO - Chain [1] start processing
09:32:54 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,UR,Population_total
0,2021-01-01,7.52,5.11,1412360000.0


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Florida:[1037]
2021- Predicted for China Florida: [1864.17927392]
The MAE while testing 2021 for China Florida: 827.1792739233413
The MAPE while testing 2021 for China Florida: 0.7976656450562597
2023- Predicted for China Florida: [3160.17810838]


09:32:54 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and Georgia                 Feature  Importance
4      Birth_rate_crude    0.567507
2                    UR    0.099151
0      Population_total    0.086965
3      Urban_population    0.081725
1        GDP_per_capita    0.080022
7  GDP_per_capita_state    0.039518
6                   GDP    0.023733
5            Gini_index    0.021379


09:32:54 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,UR,Population_total
0,2021-01-01,7.52,5.11,1412360000.0


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Georgia:[734]
2021- Predicted for China Georgia: [509.47553287]
The MAE while testing 2021 for China Georgia: 224.52446713323206
The MAPE while testing 2021 for China Georgia: 0.305891644595684
2023- Predicted for China Georgia: [318.10973711]
Feature importance for China and Hawaii                 Feature  Importance
1        GDP_per_capita    0.216725
0      Population_total    0.174471
5            Gini_index    0.166986
3      Urban_population    0.163808
4      Birth_rate_crude    0.134638
6                   GDP    0.057790
7  GDP_per_capita_state    0.044394
2                    UR    0.041188


09:32:55 - cmdstanpy - INFO - Chain [1] start processing
09:32:55 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita,Population_total,Gini_index
0,2021-01-01,12617.504986,1412360000.0,40.333333


2021- Real data for China Hawaii:[358]
2021- Predicted for China Hawaii: [695.33710964]
The MAE while testing 2021 for China Hawaii: 337.33710963961664
The MAPE while testing 2021 for China Hawaii: 0.9422824291609403
2023- Predicted for China Hawaii: [1509.12669089]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


Feature importance for China and Idaho                 Feature  Importance
5            Gini_index    0.244225
0      Population_total    0.151591
1        GDP_per_capita    0.142584
2                    UR    0.137787
4      Birth_rate_crude    0.120214
3      Urban_population    0.113268
6                   GDP    0.057292
7  GDP_per_capita_state    0.033038


09:32:56 - cmdstanpy - INFO - Chain [1] start processing
09:32:56 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Gini_index,Population_total,GDP_per_capita
0,2021-01-01,40.333333,1412360000.0,12617.504986


2021- Real data for China Idaho:[102]
2021- Predicted for China Idaho: [63.43300919]
The MAE while testing 2021 for China Idaho: 38.566990809081474
The MAPE while testing 2021 for China Idaho: 0.37810775303021055
2023- Predicted for China Idaho: [59.07332394]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


Feature importance for China and Illinois                 Feature  Importance
4      Birth_rate_crude    0.356479
2                    UR    0.118362
7  GDP_per_capita_state    0.110457
6                   GDP    0.098048
0      Population_total    0.093028
1        GDP_per_capita    0.083677
3      Urban_population    0.082749
5            Gini_index    0.057200


09:32:57 - cmdstanpy - INFO - Chain [1] start processing
09:33:15 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,UR,GDP_per_capita_state
0,2021-01-01,7.52,5.11,122469.9654


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Illinois:[1480]
2021- Predicted for China Illinois: [1559.919054]
The MAE while testing 2021 for China Illinois: 79.91905399615462
The MAPE while testing 2021 for China Illinois: 0.053999360808212576
2023- Predicted for China Illinois: [2170.78522137]


09:33:16 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and Indiana                 Feature  Importance
2                    UR    0.214710
4      Birth_rate_crude    0.178015
1        GDP_per_capita    0.173129
0      Population_total    0.160771
3      Urban_population    0.143091
5            Gini_index    0.091000
6                   GDP    0.027319
7  GDP_per_capita_state    0.011965


09:33:16 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,UR,Birth_rate_crude,GDP_per_capita
0,2021-01-01,5.11,7.52,12617.504986


2021- Real data for China Indiana:[340]
2021- Predicted for China Indiana: [402.15146332]
The MAE while testing 2021 for China Indiana: 62.15146332007515
The MAPE while testing 2021 for China Indiana: 0.1827984215296328
2023- Predicted for China Indiana: [423.95933219]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


Feature importance for China and Iowa                 Feature  Importance
4      Birth_rate_crude    0.222943
2                    UR    0.179803
1        GDP_per_capita    0.126057
3      Urban_population    0.123184
0      Population_total    0.119072
7  GDP_per_capita_state    0.086051
5            Gini_index    0.078722
6                   GDP    0.064169


09:33:17 - cmdstanpy - INFO - Chain [1] start processing
09:33:17 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,UR,GDP_per_capita
0,2021-01-01,7.52,5.11,12617.504986


2021- Real data for China Iowa:[153]
2021- Predicted for China Iowa: [218.29096085]
The MAE while testing 2021 for China Iowa: 65.29096084552532
The MAPE while testing 2021 for China Iowa: 0.42673830617990405
2023- Predicted for China Iowa: [162.87953651]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


Feature importance for China and Kansas                 Feature  Importance
4      Birth_rate_crude    0.164713
2                    UR    0.157478
1        GDP_per_capita    0.144632
0      Population_total    0.136851
5            Gini_index    0.119572
7  GDP_per_capita_state    0.109163
3      Urban_population    0.106791
6                   GDP    0.060802


09:33:18 - cmdstanpy - INFO - Chain [1] start processing
09:33:18 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,UR,GDP_per_capita
0,2021-01-01,7.52,5.11,12617.504986


2021- Real data for China Kansas:[124]
2021- Predicted for China Kansas: [212.46017539]
The MAE while testing 2021 for China Kansas: 88.46017538708028
The MAPE while testing 2021 for China Kansas: 0.7133885111861313
2023- Predicted for China Kansas: [161.76498703]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},
09:33:18 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and Kentucky                 Feature  Importance
1        GDP_per_capita    0.184226
0      Population_total    0.178728
2                    UR    0.161936
3      Urban_population    0.149178
4      Birth_rate_crude    0.091950
5            Gini_index    0.089531
6                   GDP    0.076881
7  GDP_per_capita_state    0.067570


09:33:19 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita,Population_total,UR
0,2021-01-01,12617.504986,1412360000.0,5.11


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Kentucky:[125]
2021- Predicted for China Kentucky: [140.74181371]
The MAE while testing 2021 for China Kentucky: 15.741813707996158
The MAPE while testing 2021 for China Kentucky: 0.12593450966396927
2023- Predicted for China Kentucky: [88.65808637]
Feature importance for China and Louisiana                 Feature  Importance
4      Birth_rate_crude    0.208532
5            Gini_index    0.203669
7  GDP_per_capita_state    0.158029
0      Population_total    0.103754
2                    UR    0.102794
1        GDP_per_capita    0.090292
3      Urban_population    0.088379
6                   GDP    0.044551


09:33:19 - cmdstanpy - INFO - Chain [1] start processing
09:33:19 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,Gini_index,GDP_per_capita_state
0,2021-01-01,7.52,40.333333,106609.4223


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Louisiana:[123]
2021- Predicted for China Louisiana: [193.46659434]
The MAE while testing 2021 for China Louisiana: 70.46659434469535
The MAPE while testing 2021 for China Louisiana: 0.5728991410137834
2023- Predicted for China Louisiana: [207.94073738]


09:33:20 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and Maine                 Feature  Importance
1        GDP_per_capita    0.244543
0      Population_total    0.231711
3      Urban_population    0.182835
5            Gini_index    0.111189
2                    UR    0.103156
4      Birth_rate_crude    0.062757
6                   GDP    0.032825
7  GDP_per_capita_state    0.030984


09:33:20 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita,Population_total,Urban_population
0,2021-01-01,12617.504986,1412360000.0,62.512


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Maine:[36]
2021- Predicted for China Maine: [26.88987117]
The MAE while testing 2021 for China Maine: 9.110128834557536
The MAPE while testing 2021 for China Maine: 0.2530591342932649
2023- Predicted for China Maine: [4.93459713]


09:33:21 - cmdstanpy - INFO - Chain [1] start processing


Feature importance for China and Maryland                 Feature  Importance
4      Birth_rate_crude    0.347686
2                    UR    0.278373
1        GDP_per_capita    0.121471
3      Urban_population    0.086197
0      Population_total    0.078210
5            Gini_index    0.040185
7  GDP_per_capita_state    0.030142
6                   GDP    0.017737


09:33:21 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,UR,GDP_per_capita
0,2021-01-01,7.52,5.11,12617.504986


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Maryland:[843]
2021- Predicted for China Maryland: [1017.35400558]
The MAE while testing 2021 for China Maryland: 174.35400558218498
The MAPE while testing 2021 for China Maryland: 0.2068256293976097
2023- Predicted for China Maryland: [1113.18321862]
Feature importance for China and Massachusetts                 Feature  Importance
4      Birth_rate_crude    0.310697
5            Gini_index    0.177182
2                    UR    0.174788
0      Population_total    0.088220
3      Urban_population    0.074863
1        GDP_per_capita    0.074089
7  GDP_per_capita_state    0.069726
6                   GDP    0.030435


09:33:22 - cmdstanpy - INFO - Chain [1] start processing
09:33:42 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,Gini_index,UR
0,2021-01-01,7.52,40.333333,5.11


2021- Real data for China Massachusetts:[2059]
2021- Predicted for China Massachusetts: [1784.55939439]
The MAE while testing 2021 for China Massachusetts: 274.4406056095427
The MAPE while testing 2021 for China Massachusetts: 0.13328829801337672
2023- Predicted for China Massachusetts: [1828.28080782]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


Feature importance for China and Michigan                 Feature  Importance
1        GDP_per_capita    0.241372
3      Urban_population    0.169402
0      Population_total    0.158588
5            Gini_index    0.148921
2                    UR    0.140613
4      Birth_rate_crude    0.123958
7  GDP_per_capita_state    0.009605
6                   GDP    0.007540


09:33:43 - cmdstanpy - INFO - Chain [1] start processing
09:33:43 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita,Urban_population,Population_total
0,2021-01-01,12617.504986,62.512,1412360000.0


2021- Real data for China Michigan:[743]
2021- Predicted for China Michigan: [775.86412628]
The MAE while testing 2021 for China Michigan: 32.864126280630785
The MAPE while testing 2021 for China Michigan: 0.04423166390394453
2023- Predicted for China Michigan: [1627.18727329]


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


Feature importance for China and Minnesota                 Feature  Importance
1        GDP_per_capita    0.209093
0      Population_total    0.180995
3      Urban_population    0.170571
5            Gini_index    0.150138
2                    UR    0.137772
4      Birth_rate_crude    0.067166
6                   GDP    0.060250
7  GDP_per_capita_state    0.024016


09:33:44 - cmdstanpy - INFO - Chain [1] start processing
09:33:44 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,GDP_per_capita,Population_total,Urban_population
0,2021-01-01,12617.504986,1412360000.0,62.512


  results_df = results_df.append({'Country': country, 'State': state, 'MAE': mae, 'MAPE': mape,'data2021':y_true, 'pred2021':y_pred, 'pred2023':np.exp(forecast['yhat'].values)},


2021- Real data for China Minnesota:[342]
2021- Predicted for China Minnesota: [444.12742782]
The MAE while testing 2021 for China Minnesota: 102.12742781785039
The MAPE while testing 2021 for China Minnesota: 0.2986182099937146
2023- Predicted for China Minnesota: [242.80181121]
Feature importance for China and Mississippi                 Feature  Importance
4      Birth_rate_crude    0.389607
2                    UR    0.191868
1        GDP_per_capita    0.106287
0      Population_total    0.095656
3      Urban_population    0.080130
5            Gini_index    0.077926
7  GDP_per_capita_state    0.048146
6                   GDP    0.010382


09:33:45 - cmdstanpy - INFO - Chain [1] start processing
09:33:45 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,Birth_rate_crude,UR,GDP_per_capita
0,2021-01-01,7.52,5.11,12617.504986


2021- Real data for China Mississippi:[45]
2021- Predicted for China Mississippi: [85.10718686]
The MAE while testing 2021 for China Mississippi: 40.107186858530625
The MAPE while testing 2021 for China Mississippi: 0.8912708190784583


KeyboardInterrupt: 

In [None]:


results_df["pred2023"] = results_df["pred2023"].astype(int)
results_df["pred2021"] = results_df["pred2021"].astype(int)
display(results_df)
print(results_df.describe())

import seaborn as sns
# Reshape the DataFrame to a pivot table for the heatmap
heatmap_df = results_df.pivot(index='Country', columns='State', values='MAPE')  # You can choose 'MAPE' here if needed

# Plot the heatmap
plt.figure(figsize=(30, 28))
sns.heatmap(heatmap_df, annot=True, cmap='YlGnBu', fmt=".2f", cbar_kws={'label': 'MAPE'},vmin=0, vmax=1)  # You can choose 'MAPE' here if needed
plt.title('MAPE Heatmap')
plt.xlabel('State')
plt.ylabel('Country')
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)


plt.show()

In [None]:
display(results_df[(results_df['State'] == 'South Dakota') & (results_df['Country'] == 'China')])

In [None]:
display(results_df[(results_df['State'] == 'West Virginia') & (results_df['Country'] == 'Dominican Republic')])

In [None]:
display(results_df[(results_df['State'] == 'South Dakota') & (results_df['Country'] == 'United Kingdom')])

In [None]:
mape_condition = results_df['MAPE'] < 0.3

# Group by 'Country' and 'State', and count the valid combinations
num_combinations_with_low_mape = results_df[mape_condition].groupby(['Country', 'State']).size().reset_index(name='Count')

print(num_combinations_with_low_mape)

In [None]:

results_df.to_csv("Output/output_data.csv", index=False)