In [4]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
# Load the adult literacy dataset into adult_df and preview its first five rows.
adult_df = pd.read_csv('Adult_df.csv')
adult_df.head()

Unnamed: 0,entity,code,year,adult_literacy_rate__population_15plus_years__both_sexes__pct__lr_ag15t99
0,Afghanistan,AFG,1979,18.0
1,Afghanistan,AFG,2011,31.0
2,Afghanistan,AFG,2015,33.75384
3,Afghanistan,AFG,2021,37.0
4,Albania,ALB,2001,99.0


In [6]:
# Concise structural summary: non-null count and dtype of every column plus
# memory usage. Used to spot missing data or wrongly typed columns early.
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1725 entries, 0 to 1724
Data columns (total 4 columns):
 #   Column                                                                     Non-Null Count  Dtype  
---  ------                                                                     --------------  -----  
 0   entity                                                                     1725 non-null   object 
 1   code                                                                       996 non-null    object 
 2   year                                                                       1725 non-null   int64  
 3   adult_literacy_rate__population_15plus_years__both_sexes__pct__lr_ag15t99  1725 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 54.0+ KB


In [7]:
# Size of the frame as a (rows, columns) tuple.
adult_df.shape

(1725, 4)

In [8]:
# Shorten the two verbose source column names: 'entity' becomes 'Country' and
# the long indicator name becomes 'Adult_Literacy_Rate'. The renamed frame is
# bound back to adult_df.
adult_df = adult_df.rename(
    columns={
        'entity': 'Country',
        'adult_literacy_rate__population_15plus_years__both_sexes__pct__lr_ag15t99': 'Adult_Literacy_Rate',
    }
)

In [9]:
# Normalise the 'Country' key: trim surrounding whitespace and title-case each
# word so the spelling is consistent.
# NOTE(review): str.title() also re-cases acronyms and small words (sampled
# output shows "(Sdg)" and "And"). Every frame that is later joined on
# 'Country' must receive the same normalisation.
adult_df['Country'] = adult_df['Country'].str.strip().str.title()


In [10]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each
# numeric column, to get a feel for the distribution of the data.
adult_df.describe()

Unnamed: 0,year,Adult_Literacy_Rate
count,1725.0,1725.0
mean,2003.86087,77.336331
std,13.176796,19.335627
min,1970.0,9.0
25%,1994.0,64.49239
50%,2007.0,82.15758
75%,2015.0,94.0
max,2023.0,100.0


In [11]:
# Missing values per column: isnull() marks each missing cell as True and
# sum() counts those marks column by column.
adult_df.isnull().sum()

Country                  0
code                   729
year                     0
Adult_Literacy_Rate      0
dtype: int64

In [12]:
# Drop the 'code' column: about 42% of its values are missing (729 of 1725
# rows) and it is not needed for this analysis.
adult_df = adult_df.drop(columns=['code'])

In [13]:
# Number of fully duplicated rows: duplicated() flags every row that repeats an
# earlier one and sum() counts the flags.
adult_df.duplicated().sum()

np.int64(0)

In [14]:
# Show 10 randomly chosen rows for a quick sanity check of the cleaned frame.
# No random_state is passed, so the selection differs on every run.
adult_df.sample(10)

Unnamed: 0,Country,year,Adult_Literacy_Rate
1503,Tanzania,2015,78.0
467,Europe And Northern America (Sdg),2020,98.66912
266,Dominican Republic,1981,73.0
679,Latin America And The Caribbean (Sdg),1997,88.04464
820,Malaysia,1991,83.0
894,Middle East And North Africa (Wb),1984,50.99546
510,Guinea-Bissau,2019,54.88232
1476,Sub-Saharan Africa (Wb),2012,62.14601
869,Mexico,2005,92.0
969,Middle-Income Countries,2010,83.27524


In [15]:
# Load the youth literacy dataset into youth_df and preview its first five rows.
youth_df = pd.read_csv('Youth_df.csv')
youth_df.head()

Unnamed: 0,entity,code,year,youth_literacy_rate__population_15_24_years__male__pct__lr_ag15t24_m,youth_literacy_rate__population_15_24_years__female__pct__lr_ag15t24_f,owid_region
0,Afghanistan,AFG,1979,46.0,11.0,Asia
1,Afghanistan,AFG,2011,62.0,32.0,Asia
2,Afghanistan,AFG,2015,57.73505,25.48416,Asia
3,Afghanistan,AFG,2021,71.0,42.0,Asia
4,Afghanistan,AFG,2022,83.4,44.17171,Asia


In [16]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each
# numeric column of the youth dataset.
youth_df.describe()

Unnamed: 0,year,youth_literacy_rate__population_15_24_years__male__pct__lr_ag15t24_m,youth_literacy_rate__population_15_24_years__female__pct__lr_ag15t24_f
count,2002.0,1925.0,2002.0
mean,2004.844156,89.394051,84.052252
std,12.765391,12.571677,19.450914
min,1970.0,22.0,7.0
25%,1997.0,84.0,73.830225
50%,2007.0,94.4,93.59
75%,2015.0,98.6,98.90969
max,2023.0,100.0,100.0


In [17]:
# Missing values per column of youth_df: isnull() marks each missing cell as
# True and sum() counts those marks column by column.
youth_df.isnull().sum()

entity                                                                      0
code                                                                      742
year                                                                        0
youth_literacy_rate__population_15_24_years__male__pct__lr_ag15t24_m       77
youth_literacy_rate__population_15_24_years__female__pct__lr_ag15t24_f      0
owid_region                                                               791
dtype: int64

In [18]:
# Impute the missing male youth literacy rates with the median of that column.
# NOTE(review): this is a single global median over all entities and years, not
# a per-country value - confirm that is the intended strategy.
male_rate_col = 'youth_literacy_rate__population_15_24_years__male__pct__lr_ag15t24_m'
median_val = youth_df[male_rate_col].median()
youth_df[male_rate_col] = youth_df[male_rate_col].fillna(median_val)

In [19]:
# Drop the 'code' column: about 37% of its values are missing (742 of 2002
# rows) and it is not needed for this analysis.
youth_df = youth_df.drop(columns=['code'])

In [20]:
# Drop the 'owid_region' column: about 40% of its values are missing (791 of
# 2002 rows) and it is not needed for this analysis.
youth_df = youth_df.drop(columns=['owid_region'])

In [21]:
# Number of fully duplicated rows in youth_df: duplicated() flags every row
# that repeats an earlier one and sum() counts the flags.
youth_df.duplicated().sum()

np.int64(0)

In [22]:
# Replace the verbose source column names with short, readable ones; the
# renamed frame is bound back to youth_df.
youth_df = youth_df.rename(
    columns={
        'entity': 'Country',
        'youth_literacy_rate__population_15_24_years__male__pct__lr_ag15t24_m': 'Youth_Literacy_Rate_Male',
        'youth_literacy_rate__population_15_24_years__female__pct__lr_ag15t24_f': 'Youth_Literacy_Rate_Female',
    }
)


In [23]:
# Normalise the 'Country' key exactly as for adult_df (strip whitespace, then
# title-case) so both frames share the same spelling when merged on it.
youth_df['Country'] = youth_df['Country'].str.strip().str.title()

In [24]:
# Show 10 randomly chosen rows of the cleaned youth frame for a quick check.
# No random_state is passed, so the selection differs on every run.
youth_df.sample(10)


Unnamed: 0,Country,year,Youth_Literacy_Rate_Male,Youth_Literacy_Rate_Female
1604,South Asia (Wb),2018,90.6786,86.08996
1142,Middle-Income Countries,2016,93.68402,91.14309
1313,Oceania (Excluding Australia And New Zealand) ...,2018,70.31829,76.28157
1232,North America (Wb),2019,99.66159,99.56584
1935,Vietnam,2014,94.4,98.81
1150,Moldova,1989,100.0,100.0
1914,Venezuela,2002,97.2,98.4
641,Guinea-Bissau,2000,75.0,46.0
888,Low-Income Countries,1989,66.95475,48.77668
558,Europe And Central Asia (Wb),2005,99.37011,99.01076


In [25]:
# Combine adult and youth figures into one frame keyed by (Country, year).
# An outer join keeps every key found in either frame; rates that are absent
# on one side come through as NaN.
merge_keys = ["Country", "year"]
df_literacy = adult_df.merge(youth_df, how="outer", on=merge_keys)



In [26]:
# Preview the merged frame; NaN marks a rate missing for that (Country, year).
df_literacy.head()

Unnamed: 0,Country,year,Adult_Literacy_Rate,Youth_Literacy_Rate_Male,Youth_Literacy_Rate_Female
0,Afghanistan,1979,18.0,46.0,11.0
1,Afghanistan,2011,31.0,62.0,32.0
2,Afghanistan,2015,33.75384,57.73505,25.48416
3,Afghanistan,2021,37.0,71.0,42.0
4,Afghanistan,2022,,83.4,44.17171


In [None]:
import pandas as pd

# Feature engineering on the merged frame. df_literacy is expected to carry:
# Country, year, Adult_Literacy_Rate, Youth_Literacy_Rate_Male,
# Youth_Literacy_Rate_Female.

# 1. Literacy gender gap: male minus female youth rate (positive => males ahead).
df_literacy['Literacy_Gender_Gap'] = (
    df_literacy['Youth_Literacy_Rate_Male'] - df_literacy['Youth_Literacy_Rate_Female']
)

# 2. Education index (simplified): row-wise mean of the three literacy rates.
#    mean(axis=1) skips NaN, so a row that lacks some rates still gets an index
#    built from the rates that are present.
df_literacy['Education_Index'] = (
    df_literacy[['Adult_Literacy_Rate', 'Youth_Literacy_Rate_Male', 'Youth_Literacy_Rate_Female']].mean(axis=1)
)

# 3. Youth literacy average, computed with the same NaN-skipping row mean that
#    clean_literacy_data uses, so both code paths agree.
df_literacy['Youth_Literacy_Avg'] = (
    df_literacy[['Youth_Literacy_Rate_Male', 'Youth_Literacy_Rate_Female']].mean(axis=1)
)

# 4. Literacy growth rate: percent change of the adult rate between consecutive
#    observations of the same country. Observation years are irregular, so this
#    is growth since the previous data point, not strictly year-over-year.
#    Rows are sorted by (Country, year) first so the result does not depend on
#    the incoming row order; the assignment realigns on the index.
df_literacy['Literacy_Growth_Rate'] = (
    df_literacy.sort_values(['Country', 'year'])
    .groupby('Country')['Adult_Literacy_Rate']
    .pct_change(fill_method=None) * 100
)

In [37]:
# Inspect 20 random rows of the merged frame together with the new features.
# No random_state is passed, so the selection differs on every run.
df_literacy.sample(20)

Unnamed: 0,Country,year,Adult_Literacy_Rate,Youth_Literacy_Rate_Male,Youth_Literacy_Rate_Female,Literacy_Gender_Gap,Education_Index,Youth_Literacy_Avg,Literacy_Growth_Rate
885,Lesotho,2018,86.24087,92.35318,98.26914,-5.91596,92.28773,95.31116,-11.091887
717,Iraq,2012,77.0,87.0,79.0,8.0,81.0,83.0,
1618,South Asia (Wb),2016,71.10578,90.26568,85.64712,4.61856,82.339527,87.9564,1.896803
458,Eastern And South-Eastern Asia (Sdg),2016,95.33295,98.89226,98.82578,0.06648,97.683663,98.85902,0.22024
1821,Turkey,2017,96.0,100.0,100.0,0.0,98.666667,100.0,0.0
1702,Sub-Saharan Africa (Wb),1987,50.62446,72.09146,56.50027,15.59119,59.73873,64.295865,1.258945
1984,World,2003,82.11429,90.82547,84.83368,5.99179,85.92448,87.829575,0.580719
126,Botswana,2003,81.0,92.0,96.0,-4.0,89.666667,94.0,17.391304
1217,New Caledonia,1996,96.0,99.0,99.0,0.0,98.0,99.0,3.225806
1525,Senegal,2018,46.60591,66.22319,65.68582,0.53737,59.504973,65.954505,-10.37325


In [39]:
# (rows, columns) of the merged frame including the engineered features.
df_literacy.shape

(2019, 9)

In [38]:
# Missing values per column after the merge and feature engineering.
df_literacy.isnull().sum()


Country                         0
year                            0
Adult_Literacy_Rate           294
Youth_Literacy_Rate_Male       17
Youth_Literacy_Rate_Female     17
Literacy_Gender_Gap            17
Education_Index                 0
Youth_Literacy_Avg             17
Literacy_Growth_Rate          611
dtype: int64

In [None]:
import pandas as pd

def clean_literacy_data(df_literacy):
    """Return a cleaned copy of the merged literacy frame with the rates filled.

    Processing steps:
      1. For each of the three rate columns (adult, youth male, youth female),
         interpolate linearly within each country with rows ordered by year.
         Gaps before the first or after the last observation of a country take
         that country's nearest observed value. Interpolation is positional:
         consecutive observations are treated as equally spaced regardless of
         the distance between their years.
      2. Values still missing (countries with no observation at all for that
         rate) fall back to the mean of the whole column.
      3. Recompute the engineered features from the filled rates.

    Parameters
    ----------
    df_literacy : pandas.DataFrame
        Frame with the columns 'Country', 'year', 'Adult_Literacy_Rate',
        'Youth_Literacy_Rate_Male' and 'Youth_Literacy_Rate_Female'. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows, row order and index as the input, the
        three rate columns filled, and the columns 'Literacy_Gender_Gap',
        'Youth_Literacy_Avg', 'Education_Index' and 'Literacy_Growth_Rate'
        (re)computed. The growth rate is 0 for the first observation of a
        country and wherever the percentage change is undefined.
    """
    rate_cols = ['Adult_Literacy_Rate', 'Youth_Literacy_Rate_Male', 'Youth_Literacy_Rate_Female']

    # Work on a copy with a clean positional index so the caller's frame is
    # left untouched and index alignment below is unambiguous even if the
    # incoming index contains duplicates. The original index is restored last.
    original_index = df_literacy.index
    cleaned = df_literacy.reset_index(drop=True).copy()

    # Row labels in (Country, year) order: interpolation and pct_change both
    # depend on rows being chronological within each country.
    chronological = cleaned.sort_values(['Country', 'year']).index

    for col in rate_cols:
        within_country = (
            cleaned.loc[chronological]
            .groupby('Country')[col]
            .transform(lambda s: s.interpolate(limit_direction='both'))
        )
        cleaned[col] = within_country  # realigned on the positional index
        # Fallback for countries that have no value at all for this rate.
        cleaned[col] = cleaned[col].fillna(cleaned[col].mean())

    # Recompute engineered features from the filled rates.
    cleaned['Literacy_Gender_Gap'] = (
        cleaned['Youth_Literacy_Rate_Male'] - cleaned['Youth_Literacy_Rate_Female']
    )
    cleaned['Youth_Literacy_Avg'] = (
        cleaned[['Youth_Literacy_Rate_Male', 'Youth_Literacy_Rate_Female']].mean(axis=1)
    )
    cleaned['Education_Index'] = cleaned[rate_cols].mean(axis=1)

    growth = (
        cleaned.loc[chronological]
        .groupby('Country')['Adult_Literacy_Rate']
        .pct_change(fill_method=None) * 100
    )
    # A previous value of 0 yields +/-inf; treat it like the other undefined
    # cases (first observation of a country) and store 0.
    growth = growth.replace([float('inf'), float('-inf')], float('nan')).fillna(0)
    cleaned['Literacy_Growth_Rate'] = growth

    cleaned.index = original_index
    return cleaned


# --- Usage ---
# Rebind df_literacy to the frame returned by clean_literacy_data.
df_literacy = clean_literacy_data(df_literacy)



# Verify nulls: per-column count of missing values after cleaning.
# NOTE(review): the saved output of this cell lists an 'Illiteracy_Percent'
# column that no visible cell creates - re-run the notebook to refresh it.
print(df_literacy.isnull().sum())

Country                       0
year                          0
Adult_Literacy_Rate           0
Youth_Literacy_Rate_Male      0
Youth_Literacy_Rate_Female    0
Literacy_Gender_Gap           0
Education_Index               0
Youth_Literacy_Avg            0
Literacy_Growth_Rate          0
Illiteracy_Percent            0
dtype: int64


In [57]:
# Inspect 20 random rows of the cleaned frame (selection differs on every run).
df_literacy.sample(20)

Unnamed: 0,Country,year,Adult_Literacy_Rate,Youth_Literacy_Rate_Male,Youth_Literacy_Rate_Female,Literacy_Gender_Gap,Education_Index,Youth_Literacy_Avg,Literacy_Growth_Rate
1410,Paraguay,2018,94.0,97.7,98.9,-1.2,96.866667,98.3,-0.529101
1101,Middle East And North Africa (Wb),2012,77.99774,92.95354,86.87201,6.08153,85.941097,89.912775,0.871503
75,Belarus,1989,98.0,100.0,100.0,0.0,99.333333,100.0,0.0
1617,South Asia (Wb),2015,69.78215,89.4431,84.15232,5.29078,81.125857,86.79771,1.938481
1476,Romania,2011,99.0,99.0,99.0,0.0,99.0,99.0,2.061856
466,Ecuador,1982,84.0,95.0,92.0,3.0,90.333333,93.5,0.0
515,El Salvador,2013,87.0,96.9,97.5,-0.6,93.8,97.2,1.162791
1143,Middle-Income Countries,2005,80.87994,92.06743,86.04476,6.02267,86.33071,89.056095,0.000532
776,Latin America And Caribbean (Wb),1976,79.0867,88.74199,87.22708,1.51491,85.01859,87.984535,0.443588
837,Latin America And The Caribbean (Sdg),1987,83.21799,89.05918,91.00017,-1.94099,87.759113,90.029675,0.352521


In [56]:
# Column names of the cleaned frame, i.e. the layout written to CSV below.
df_literacy.columns

Index(['Country', 'year', 'Adult_Literacy_Rate', 'Youth_Literacy_Rate_Male',
       'Youth_Literacy_Rate_Female', 'Literacy_Gender_Gap', 'Education_Index',
       'Youth_Literacy_Avg', 'Literacy_Growth_Rate'],
      dtype='object')

In [58]:
# Save the cleaned dataset; index=False keeps the row index out of the file.
df_literacy.to_csv('literacy_rates.csv', index=False)

In [29]:
%pip install mysql-connector-python

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [59]:
import pandas as pd
import numpy as np


# 1. Load CSV into Pandas
# Re-read the file written by the save step above; df is the frame that is
# later pushed to MySQL.

df=pd.read_csv("literacy_rates.csv")
print("CSV loaded successfully!")
print(df.head())

CSV loaded successfully!
       Country  year  Adult_Literacy_Rate  Youth_Literacy_Rate_Male  \
0  Afghanistan  1979             18.00000                  46.00000   
1  Afghanistan  2011             31.00000                  62.00000   
2  Afghanistan  2015             33.75384                  57.73505   
3  Afghanistan  2021             37.00000                  71.00000   
4  Afghanistan  2022             37.00000                  83.40000   

   Youth_Literacy_Rate_Female  Literacy_Gender_Gap  Education_Index  \
0                    11.00000             35.00000        25.000000   
1                    32.00000             30.00000        41.666667   
2                    25.48416             32.25089        38.991017   
3                    42.00000             29.00000        50.000000   
4                    44.17171             39.22829        54.857237   

   Youth_Literacy_Avg  Literacy_Growth_Rate  
0           28.500000              0.000000  
1           47.000000        

In [66]:
# 2. Connect to MySQL Database
import os
from getpass import getpass

import mysql.connector

# ---- Database Connection ----
# Credentials are read from the environment (MYSQL_HOST, MYSQL_USER,
# MYSQL_PASSWORD) instead of being stored in the notebook. Host and user keep
# their previous defaults; a missing password is requested interactively.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
)

cursor = conn.cursor()

# Connect at server level and create/select the database here: passing
# database=... to connect() fails on a machine where it does not exist yet.
cursor.execute("CREATE DATABASE IF NOT EXISTS global_literacy_db")
cursor.execute("USE global_literacy_db")

if conn.is_connected():
    print("Connected to MySQL server successfully!")
else:
    print("Failed to connect to MySQL server.")

Connected to MySQL server successfully!


In [64]:
# 3. Create database if not exists
cursor.execute("CREATE DATABASE IF NOT EXISTS global_literacy_db")
cursor.execute("USE global_literacy_db")

# Create table if not exists. IF NOT EXISTS makes the cell safe to re-run;
# a plain CREATE TABLE raises once the table is there.
# One row per (country, year); rates are stored with two decimal places.
create_table_query = """
CREATE TABLE IF NOT EXISTS literacy_rates (
    country VARCHAR(250) NOT NULL,
    year INT NOT NULL,
    adult_literacy_rate DECIMAL(5,2),
    youth_literacy_rate_male DECIMAL(5,2),
    youth_literacy_rate_female DECIMAL(5,2),
    literacy_gender_gap DECIMAL(5,2),
    education_index DECIMAL(5,2),
    youth_literacy_avg DECIMAL(5,2),
    literacy_growth_rate DECIMAL(6,2),
    PRIMARY KEY (country, year)
);
"""
cursor.execute(create_table_query)


print ("Table created successfully!")

# The cursor and connection are deliberately left open: the insert cell that
# follows reuses both and closes them when it is done.

Table created successfully!


In [67]:
# 4. Push DataFrame into MySQL

# Using executemany (manual control)

# DataFrame columns in the same order as the column list and the %s
# placeholders of the INSERT statement below.
insert_columns = [
    'Country',
    'year',
    'Adult_Literacy_Rate',
    'Youth_Literacy_Rate_Male',
    'Youth_Literacy_Rate_Female',
    'Literacy_Gender_Gap',
    'Education_Index',
    'Youth_Literacy_Avg',
    'Literacy_Growth_Rate',
]

insert_query = """
INSERT INTO literacy_rates (
    country,
    year,
    adult_literacy_rate,
    youth_literacy_rate_male,
    youth_literacy_rate_female,
    literacy_gender_gap,
    education_index,
    youth_literacy_avg,
    literacy_growth_rate
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Select the columns by name so values cannot drift out of step with the
# placeholders, then convert to native Python objects with NaN mapped to None
# (sent to MySQL as NULL). Casting to object first is required because a float
# column cannot hold None.
insert_frame = df[insert_columns].astype(object)
values = insert_frame.where(insert_frame.notna(), None).values.tolist()

# Bulk insert as one transaction: commit on success, roll back on any error,
# and always release the cursor and connection. After this cell the connection
# is closed, so re-run the connect cell before inserting again.
# NOTE: rows whose (country, year) already exist violate the primary key.
try:
    cursor.executemany(insert_query, values)
    conn.commit()
except Exception:
    conn.rollback()
    raise
finally:
    cursor.close()
    conn.close()

print("CSV data inserted successfully!")

CSV data inserted successfully!


In [88]:
# Load the illiteracy dataset into illiterate_df and preview its first five rows.
illiterate_df = pd.read_csv('Illiterate_df.csv')
illiterate_df.head()

Unnamed: 0,entity,code,year,illiteracy_rate,literacy_rate
0,Afghanistan,AFG,1950,97.0,3.0
1,Afghanistan,AFG,1979,82.0,18.0
2,Afghanistan,AFG,2011,69.0,31.0
3,Afghanistan,AFG,2015,66.246155,33.75384
4,Afghanistan,AFG,2021,63.0,37.0


In [89]:
 #1. Missing values per column of illiterate_df: isnull() marks each missing
 #   cell as True and sum() counts those marks column by column.
illiterate_df.isnull().sum()

entity               0
code               733
year                 0
illiteracy_rate      0
literacy_rate        0
dtype: int64

In [90]:
#2. Drop the 'code' column: about 36% of its values are missing (733 of 2059
#   rows) and it is not needed for this analysis.
illiterate_df = illiterate_df.drop(columns=['code'])

In [91]:
#3. Number of fully duplicated rows in illiterate_df: duplicated() flags every
#   row that repeats an earlier one and sum() counts the flags.
illiterate_df.duplicated().sum()

np.int64(0)

In [92]:
#4. Rename 'entity' to 'Country' so this frame uses the same key column name as
#   the other datasets; the renamed frame is bound back to illiterate_df.
illiterate_df = illiterate_df.rename(columns={'entity': 'Country'})


In [93]:
#5. Normalise the 'Country' key the same way as in the other frames: trim
#   surrounding whitespace, then title-case each word.
illiterate_df['Country'] = illiterate_df['Country'].str.strip().str.title()

In [94]:
#6. Size of the frame as a (rows, columns) tuple.
illiterate_df.shape

(2059, 4)

In [None]:
#7. Concise structural summary: non-null count and dtype of every column plus
#   memory usage, to spot missing data or wrongly typed columns.
illiterate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          2059 non-null   object 
 1   year             2059 non-null   int64  
 2   illiteracy_rate  2059 non-null   float64
 3   literacy_rate    2059 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 64.5+ KB


In [96]:
#8. Descriptive statistics (count, mean, std, min, quartiles, max) for each
#   numeric column.
#   NOTE(review): the saved output shows a minimum year of 1475 and a minimum
#   literacy_rate of 0.0 - confirm these are genuine records, not data errors.
illiterate_df.describe()


Unnamed: 0,year,illiteracy_rate,literacy_rate
count,2059.0,2059.0,2059.0
mean,1986.084507,27.421922,72.578078
std,64.008447,24.485748,24.485748
min,1475.0,0.0,0.0
25%,1983.0,6.704529,57.98169
50%,2002.0,21.0,79.0
75%,2013.0,42.01831,93.29547
max,2023.0,100.0,100.0


In [97]:
# Show 10 randomly chosen rows for a quick check of the cleaned frame.
# No random_state is passed, so the selection differs on every run.
illiterate_df.sample(10)

Unnamed: 0,Country,year,illiteracy_rate,literacy_rate
1549,Sierra Leone,2017,55.16682,44.83318
1947,Uruguay,2018,1.0,99.0
1572,South Africa,1950,57.5,42.5
653,Honduras,2001,20.0,80.0
268,Chile,1992,6.0,94.0
1446,Philippines,2019,4.0,96.0
429,Eastern And South-Eastern Asia (Sdg),2000,9.54808,90.45192
1501,Saint Helena,1998,3.0,97.0
1788,Thailand,1950,47.5,52.5
458,Ecuador,2008,8.0,92.0


In [None]:
# Load the GDP dataset into gdp_df and preview its first five rows.
gdp_df = pd.read_csv('GDP_df.csv')
gdp_df.head()

Unnamed: 0,entity,code,year,ny_gdp_pcap_pp_kd,owid_region
0,Afghanistan,AFG,2000,1617.8264,Asia
1,Afghanistan,AFG,2001,1454.1108,Asia
2,Afghanistan,AFG,2002,1774.3087,Asia
3,Afghanistan,AFG,2003,1815.9282,Asia
4,Afghanistan,AFG,2004,1776.9182,Asia


In [None]:
# Load the average-years-of-schooling dataset into avgys_df and preview its
# first five rows.
avgys_df = pd.read_csv('Avgys_df.csv')
avgys_df.head()

Unnamed: 0,entity,code,year,literacy_rate,mf_youth_and_adults__15_64_years__average_years_of_education,population_historical,owid_region
0,Afghanistan,AFG,1870,,0.01,4101170.0,Asia
1,Afghanistan,AFG,1875,,0.01,4193635.0,Asia
2,Afghanistan,AFG,1880,,0.01,4288021.0,Asia
3,Afghanistan,AFG,1885,,0.01,4384343.0,Asia
4,Afghanistan,AFG,1890,,0.01,4482635.0,Asia


In [None]:

# # 3. GDP per Schooling Year
# # (only if GDP and Avg_Schooling_Years columns exist)
# df_literacy['GDP_per_Schooling_Year'] = df_literacy['GDP'] / df_literacy['Avg_Schooling_Years']
