In [93]:
import pandas as pd
# The DataFrame 'df' is not displaying the headers correctly due to the initial row misalignment. 
# We need to adjust this first. Let's reload the data and skip the first row which seems to be incorrect.
df_corrected = pd.read_csv('raw_data/mean_years_in_school_women_percent_men_25_to_34_years.csv', header=None)

# Now we define the first row as the header
df_corrected.columns = df_corrected.iloc[0]
df_corrected = df_corrected.drop(df_corrected.index[0])

# The headers now should be countries, let's transpose the data to make countries as one of the columns
df_transposed = df_corrected.set_index(df_corrected.columns[0]).T
df_transposed.reset_index(inplace=True)
df_transposed.rename(columns={df_transposed.columns[0]: 'year'}, inplace=True)

# Melt the transposed dataframe to get "country, year, GDP per capita" format
df_melted = pd.melt(df_transposed, id_vars=['year'], var_name='country', value_name='Gender ratio of mean years in school (women % men, 25 to 34 years)')

In [94]:
df_melted.dtypes

year                                                                  float64
country                                                                object
Gender ratio of mean years in school (women % men, 25 to 34 years)    float64
dtype: object

In [87]:
df_melted

Unnamed: 0,year,country,Ratio of mean years in school between men and women
0,1970.0,Afghanistan,15.4
1,1971.0,Afghanistan,15.8
2,1972.0,Afghanistan,15.4
3,1973.0,Afghanistan,15.6
4,1974.0,Afghanistan,15.9
...,...,...,...
8643,2011.0,Zimbabwe,90.4
8644,2012.0,Zimbabwe,90.8
8645,2013.0,Zimbabwe,91.2
8646,2014.0,Zimbabwe,91.7


In [67]:
def reformatGDP(gdp):
    gdp_str = str(gdp).lower()
    if 'k' in gdp_str:
        new_str = gdp_str.replace('k', '')
        return float(new_str) * 1000
    return float(gdp)
    

In [69]:
print(reformatGDP(187))

187.0


In [70]:
df_melted['GDP per capita'] = df_melted['GDP per capita'].map(reformatGDP)

In [90]:
df_melted.to_csv('reformatted_gender_equality.csv', index=False)

In [72]:
df_melted.head()

Unnamed: 0,year,country,GDP per capita
0,1800,Afghanistan,599.0
1,1801,Afghanistan,599.0
2,1802,Afghanistan,599.0
3,1803,Afghanistan,599.0
4,1804,Afghanistan,599.0


In [95]:
df = pd.read_csv('reformatted_data/reformatted_life_expectancy.csv')

In [97]:
def date_to_int(year):
    return int(year)

In [98]:
df['year'] = df_melted['year'].map(date_to_int)

In [99]:
df

Unnamed: 0,year,country,Life expectancy
0,1970.0,Afghanistan,28.2
1,1971.0,Afghanistan,28.2
2,1972.0,Afghanistan,28.2
3,1973.0,Afghanistan,28.2
4,1974.0,Afghanistan,28.2
...,...,...,...
58991,,Zimbabwe,70.8
58992,,Zimbabwe,71.0
58993,,Zimbabwe,71.1
58994,,Zimbabwe,71.2
