In [1]:
import pandas as pd
import plotly.graph_objects as go

In [2]:
#  Read the file and remove the rows and columns not needed
df = pd.read_excel("countries_variables.xls", skiprows=2)
df = df[df["Indicator Name"].isin(["Fertility rate, total (births per woman)", "GDP per capita, PPP (current international $)", "Unemployment, total (% of total labor force) (national estimate)", "Gini index"])]
df = df.drop(df.columns[3:5],axis=1)
df = df.drop(df.columns[1],axis=1)
df.head()

Unnamed: 0,Country Name,Indicator Name,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
1,China,"Fertility rate, total (births per woman)",1.624,1.644,1.666,1.701,1.714,1.687,1.668,1.798,1.714,1.769,1.67,1.772,1.813,1.554,1.496,1.281,1.164
2,China,"GDP per capita, PPP (current international $)",5334.646639,5979.781712,6795.174012,7412.874363,8069.354638,8884.588031,10292.940478,11168.697298,11872.497381,12480.33853,12897.502287,13483.377267,14243.532611,15497.82574,16655.399364,17209.442954,19484.308087
4,China,"Unemployment, total (% of total labor force) (...",4.1,4.0,4.2,4.3,4.1,4.1,4.1,4.05,4.1,,,3.9,4.93,5.15,5.61,5.11,
6,China,Gini index,,,43.0,,43.7,42.4,42.2,39.7,39.2,38.6,38.5,39.1,38.5,38.2,,,
9,Dominican Republic,"Fertility rate, total (births per woman)",2.605,2.533,2.51,2.533,2.556,2.538,2.513,2.485,2.466,2.445,2.43,2.406,2.39,2.381,2.344,2.303,2.273


In [3]:
# Unpivot the data
df2 = pd.melt(df, id_vars=["Country Name", "Indicator Name"], var_name="Years", value_name="Variables")

# Reset Index
pivot_df2 = df2.pivot(index=["Country Name", "Years"],columns='Indicator Name', values = "Variables")
pivot_df2 = pivot_df2.reset_index(level=[0, 1])

# # Rename title
pivot_df2.rename(columns={"Fertility rate, total (births per woman)": "Fertility rate_country", "GDP per capita, PPP (current international $)":"GDP per capita_country", "Unemployment, total (% of total labor force) (national estimate)": "Unemployment_country", "Gini index": "Gini index_country" }, inplace=True)
pivot_df2.head()



Indicator Name,Country Name,Years,Fertility rate_country,GDP per capita_country,Gini index_country,Unemployment_country
0,China,2005,1.624,5334.646639,,4.1
1,China,2006,1.644,5979.781712,,4.0
2,China,2007,1.666,6795.174012,43.0,4.2
3,China,2008,1.701,7412.874363,,4.3
4,China,2009,1.714,8069.354638,43.7,4.1


In [4]:
#  Clean the null fields
country_gini_averages = pivot_df2.groupby(["Country Name"])["Gini index_country"].mean()
country_unemployeement_average = pivot_df2.groupby('Country Name')['Unemployment_country'].mean()
country_fertility_average = pivot_df2.groupby('Country Name')['Fertility rate_country'].mean()

# Step 2: Fill null values in 'Gini Index' with the corresponding country's average
pivot_df2["Gini index_country"] = pivot_df2.apply(lambda row: country_gini_averages[row['Country Name']] if pd.isnull(row['Gini index_country']) else row['Gini index_country'], axis=1)
pivot_df2['Unemployment_country'] = pivot_df2.apply(lambda row: country_unemployeement_average[row['Country Name']] if pd.isnull(row['Unemployment_country']) else row['Unemployment_country'], axis=1)
pivot_df2['Fertility rate_country'] = pivot_df2.apply(lambda row: country_unemployeement_average[row['Country Name']] if pd.isnull(row['Fertility rate_country']) else row['Fertility rate_country'], axis=1)

pivot_df2 = pivot_df2.rename(columns={"Years": "Year"})
pivot_df2["Country Name"] = pivot_df2["Country Name"].replace('Iran, Islamic Rep.', 'Iran')
pivot_df2["Year"] = pivot_df2["Year"].astype(int)
pivot_df2.tail()


Indicator Name,Country Name,Year,Fertility rate_country,GDP per capita_country,Gini index_country,Unemployment_country
131,United Kingdom,2017,1.74,46104.055397,32.6,4.0
132,United Kingdom,2018,1.68,47202.199606,33.7,3.74
133,United Kingdom,2019,1.63,49288.693112,32.8,5.949286
134,United Kingdom,2020,1.56,45872.027289,32.6,5.949286
135,United Kingdom,2021,1.56,50056.266291,33.7625,5.949286


In [5]:
# Immigration data by from 2005 - 2011

data = pd.read_csv('immigration_data_2005_2021.csv')
data = pd.melt(data, id_vars = ["Year", "Region and country of birth", "Total Permanent Residents", "Percentage"], var_name = "State", value_name="Population Count")
data = data[data["Region and country of birth"].isin(["China, People's Republic", "Dominican Republic", "India","Iran", "Mexico", "Pakistan", "Philippines", "United Kingdom"])]
data = data.rename(columns={"Region and country of birth": "Country Name"})
data["Country Name"] = data["Country Name"].replace("China, People's Republic", "China")

# drop states not valid

values_to_drop = ['U.S. Armed Services posts', 'U.S. possessions', 'U.S. Dependencies', 'Guam', 'Puerto Rico']
data = data[~data['State'].isin(values_to_drop)]
data = data.reset_index(drop=True)

data.tail()
# data.to_csv("immigration_data_2005_2021_cleaned.csv")

Unnamed: 0,Year,Country Name,Total Permanent Residents,Percentage,State,Population Count
6931,2021,Iran,5734,0.77,Wyoming,0
6932,2021,Mexico,107230,14.49,Wyoming,69
6933,2021,Pakistan,9691,1.31,Wyoming,3
6934,2021,Philippines,27511,3.72,Wyoming,27
6935,2021,United Kingdom,9229,1.25,Wyoming,8


In [6]:
list1 = data["Country Name"].unique().tolist()

In [7]:
list2 = pivot_df2["Country Name"].unique().tolist()

In [8]:
combine = pd.merge(pivot_df2, data, how = "right", on = ["Year","Country Name"])
combine.tail()
# combine.to_csv("combine_clean.csv")


Unnamed: 0,Country Name,Year,Fertility rate_country,GDP per capita_country,Gini index_country,Unemployment_country,Total Permanent Residents,Percentage,State,Population Count
6931,Iran,2021,1.692,16557.046182,40.99,11.416875,5734,0.77,Wyoming,0
6932,Mexico,2021,1.822,19578.403808,48.144444,3.26,107230,14.49,Wyoming,69
6933,Pakistan,2021,3.47,5773.450917,29.985714,6.34,9691,1.31,Wyoming,3
6934,Philippines,2021,3.23,10133.195894,44.6,3.23,27511,3.72,Wyoming,27
6935,United Kingdom,2021,1.56,50056.266291,33.7625,5.949286,9229,1.25,Wyoming,8


In [9]:
state = pd.read_excel("variable_employeement_income.xlsx", sheet_name='Sheet3', skiprows=2)
state = state.rename(columns = {"State and area": "State"})
state.dtypes

Year                          float64
State                          object
Unemployeement Rate           float64
per capita personal Income      int64
dtype: object

In [10]:
combine2 = pd.merge(combine, state, on = ["Year", "State"], how = "left")
combine2[combine2["Year"] == 2021]

Unnamed: 0,Country Name,Year,Fertility rate_country,GDP per capita_country,Gini index_country,Unemployment_country,Total Permanent Residents,Percentage,State,Population Count,Unemployeement Rate,per capita personal Income
128,China,2021,1.164,19484.308087,40.281818,4.410714,49847,6.74,Alabama,146,0.033755,49671
129,Dominican Republic,2021,2.273,20553.099508,38.500000,6.232500,24553,3.32,Alabama,33,0.033755,49671
130,India,2021,2.031,7367.994665,35.257143,4.820000,93450,12.63,Alabama,346,0.033755,49671
131,Iran,2021,1.692,16557.046182,40.990000,11.416875,5734,0.77,Alabama,21,0.033755,49671
132,Mexico,2021,1.822,19578.403808,48.144444,3.260000,107230,14.49,Alabama,323,0.033755,49671
...,...,...,...,...,...,...,...,...,...,...,...,...
6931,Iran,2021,1.692,16557.046182,40.990000,11.416875,5734,0.77,Wyoming,0,0.046230,69584
6932,Mexico,2021,1.822,19578.403808,48.144444,3.260000,107230,14.49,Wyoming,69,0.046230,69584
6933,Pakistan,2021,3.470,5773.450917,29.985714,6.340000,9691,1.31,Wyoming,3,0.046230,69584
6934,Philippines,2021,3.230,10133.195894,44.600000,3.230000,27511,3.72,Wyoming,27,0.046230,69584


In [11]:
combine2["Year"].unique()

array([2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021])

In [12]:
covid_factor = pd.DataFrame(
    {"Year": [2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021],
     "Covid": [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1]
       }
)
covid_factor

Unnamed: 0,Year,Covid
0,2005,0
1,2006,0
2,2007,0
3,2008,0
4,2009,0
5,2010,0
6,2011,0
7,2012,0
8,2013,0
9,2014,0


In [13]:
combine3 = pd.merge(combine2, covid_factor, on = "Year", how = "left")
combine3

Unnamed: 0,Country Name,Year,Fertility rate_country,GDP per capita_country,Gini index_country,Unemployment_country,Total Permanent Residents,Percentage,State,Population Count,Unemployeement Rate,per capita personal Income,Covid
0,China,2005,1.624,5334.646639,40.281818,4.100000,69967,6.23,Alabama,328,0.044474,29949,0
1,Dominican Republic,2005,2.605,10215.499371,50.000000,5.650000,27504,2.45,Alabama,5,0.044474,29949,0
2,India,2005,2.958,2936.560443,35.257143,5.651429,84681,7.54,Alabama,431,0.044474,29949,0
3,Iran,2005,1.775,14738.152239,43.600000,11.520000,13887,1.24,Alabama,48,0.044474,29949,0
4,Mexico,2005,2.495,12725.197590,50.100000,3.570000,161445,14.38,Alabama,569,0.044474,29949,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6931,Iran,2021,1.692,16557.046182,40.990000,11.416875,5734,0.77,Wyoming,0,0.046230,69584,1
6932,Mexico,2021,1.822,19578.403808,48.144444,3.260000,107230,14.49,Wyoming,69,0.046230,69584,1
6933,Pakistan,2021,3.470,5773.450917,29.985714,6.340000,9691,1.31,Wyoming,3,0.046230,69584,1
6934,Philippines,2021,3.230,10133.195894,44.600000,3.230000,27511,3.72,Wyoming,27,0.046230,69584,1


In [14]:
combine3.to_csv("sample.csv")