# Import libraries and data sets

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os 

#Import the original (raw) school/country data set
#The project folder defaults to the author's local directory; set the
#WUR_PROJECT_PATH environment variable to run the notebook from another location
path = os.environ.get('WUR_PROJECT_PATH', r'C:\Users\ctede\OneDrive\Desktop\World University Rankings Analysis')
school_country = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'school_and_country_table.csv'))

# School and Country data set cleaning

In [2]:
#Summary stats (count, unique, top, freq) for each column of the raw frame
school_country.describe()

Unnamed: 0,school_name,country
count,818,818
unique,818,70
top,Harvard University,United States of America
freq,1,161


In [3]:
#Preview the first rows of the raw frame
school_country.head()

Unnamed: 0,school_name,country
0,Harvard University,United States of America
1,California Institute of Technology,United States of America
2,Massachusetts Institute of Technology,United States of America
3,Stanford University,United States of America
4,Princeton University,United States of America


In [4]:
#Show counts of each country to check for inconsistencies
school_country['country'].value_counts()

United States of America    161
United Kingdom               78
Japan                        41
Germany                      39
China                        38
                           ... 
Luxembourg                    1
Morocco                       1
Macau                         1
Iceland                       1
Lithuania                     1
Name: country, Length: 70, dtype: int64

In [5]:
#Show counts of each university to check for inconsistencies
school_country['school_name'].value_counts()

Harvard University                                1
University of Parma                               1
Marche Polytechnic University                     1
University of Nantes                              1
National and Kapodistrian University of Athens    1
                                                 ..
University of Kiel                                1
University of Southern Denmark                    1
University of Texas at Dallas                     1
University of the Witwatersrand                   1
Yokohama National University                      1
Name: school_name, Length: 818, dtype: int64

### Check for missing data

In [6]:
#Count the distinct combinations of per-column null flags across all rows
school_country.isnull().value_counts()

school_name  country
False        False      818
dtype: int64

#### No missing values found 

### Check for full duplicate data 

In [7]:
#Collect every row that exactly repeats an earlier row
full_duplicate_mask = school_country.duplicated()
school_country_dups = school_country.loc[full_duplicate_mask]
school_country_dups

Unnamed: 0,school_name,country


#### No full duplicates found 

### Rename school_name to university_name 

In [8]:
#Rename school_name to university_name
#A renamed copy is bound back to the name instead of mutating the frame in place
school_country = school_country.rename(columns={'school_name': 'university_name'})

In [9]:
#Confirm the column now appears as university_name
school_country.head()

Unnamed: 0,university_name,country
0,Harvard University,United States of America
1,California Institute of Technology,United States of America
2,Massachusetts Institute of Technology,United States of America
3,Stanford University,United States of America
4,Princeton University,United States of America


In [10]:
#Display the renamed frame (the notebook truncates the middle rows)
school_country

Unnamed: 0,university_name,country
0,Harvard University,United States of America
1,California Institute of Technology,United States of America
2,Massachusetts Institute of Technology,United States of America
3,Stanford University,United States of America
4,Princeton University,United States of America
...,...,...
813,Xidian University,China
814,Yeungnam University,South Korea
815,Yıldız Technical University,Turkey
816,Yokohama City University,Japan


In [11]:
#Re-check the summary stats after the rename
school_country.describe()

Unnamed: 0,university_name,country
count,818,818
unique,818,70
top,Harvard University,United States of America
freq,1,161


In [12]:
#Save the cleaned school/country frame to the Prepared Data folder
school_country_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'school_country_clean.pkl')
school_country.to_pickle(school_country_pickle)

# Import educational attainment supplemental data set

In [17]:
#Load the raw educational attainment data set from the Original Data folder
edu_attain_csv = os.path.join(path, '02 Data', 'Original Data', 'edu_attain.csv')
edu_attain = pd.read_csv(edu_attain_csv)

In [22]:
#Display the raw frame (rows and columns are truncated by the default display settings)
edu_attain

Unnamed: 0,country_name,series_name,1985,1986,1987,1990,1991,1992,1993,1995,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
0,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.33,,,0.44,,,,0.57,...,0.86,,,,,1.27,,,,
1,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1.03,,,1.26,,,,1.54,...,2.18,,,,,2.64,,,,
2,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.83,,,0.95,,,,1.26,...,1.01,,,,,2.45,,,,
3,Afghanistan,"Barro-Lee: Average years of primary schooling,...",2.34,,,2.22,,,,2.37,...,2.26,,,,,3.55,,,,
4,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.54,,,0.92,,,,0.94,...,2.00,,,,,1.29,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79050,,,,,,,,,,,...,,,,,,,,,,
79051,,,,,,,,,,,...,,,,,,,,,,
79052,,,,,,,,,,,...,,,,,,,,,,
79053,Data from database: Education Statistics: Educ...,,,,,,,,,,...,,,,,,,,,,


In [23]:
#Lift the column display limit so wide frames show every variable
pd.options.display.max_columns = None

In [24]:
#Display the raw frame again, now with every column visible
edu_attain

Unnamed: 0,country_name,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
0,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.33,,,0.44,,,,0.57,,,,,0.75,,,,,0.86,,,,,1.27,,,,
1,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1.03,,,1.26,,,,1.54,,,,,2.01,,,,,2.18,,,,,2.64,,,,
2,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.83,,,0.95,,,,1.26,,,,,1.92,,,,,1.01,,,,,2.45,,,,
3,Afghanistan,"Barro-Lee: Average years of primary schooling,...",2.34,,,2.22,,,,2.37,,,,,3.83,,,,,2.26,,,,,3.55,,,,
4,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.54,,,0.92,,,,0.94,,,,,1.26,,,,,2.00,,,,,1.29,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79050,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79051,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79052,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79053,Data from database: Education Statistics: Educ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Show descriptive statistics 

In [25]:
#Descriptive statistics for each numeric (year) column
edu_attain.describe()

Unnamed: 0,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
count,51843.0,12.0,3.0,51846.0,1.0,3.0,1.0,51884.0,3.0,0.0,47.0,9.0,51970.0,225.0,184.0,55.0,137.0,52037.0,204.0,286.0,366.0,464.0,52796.0,929.0,1106.0,1879.0,0.0
mean,253.059289,6.365242,1.98822,280.06165,7.58463,5.758003,6.22103,306.774826,7.79931,,8.811033,10.010412,334.780437,16.347225,16.282858,19.528067,11.737983,363.5199,13.172861,16.07378,14.849021,17.527544,386.6418,20.351652,19.483375,23.113742,
std,5812.705429,2.335996,0.883044,6525.945102,,0.24401,,7192.240329,0.30145,,13.700154,0.523507,7877.842661,21.523526,21.445675,22.526506,16.959871,8563.497,14.756343,22.696642,17.947568,22.434285,9146.338,25.186301,24.036373,28.481772,
min,0.0,3.7406,1.11897,0.0,7.58463,5.51415,6.22103,0.0,7.49769,,0.0,9.47552,0.0,0.0,0.0,0.16129,0.0,0.0,0.12933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,1.02,4.8412,1.54011,1.23,7.58463,5.63592,6.22103,1.54,7.64867,,0.348338,9.6581,1.71,3.09057,1.556833,6.910945,3.54881,1.93,5.855385,3.057243,5.408055,4.462345,2.01,2.74371,1.498102,1.050885,
50%,5.07,5.58128,1.96125,5.67,7.58463,5.75769,6.22103,6.25,7.79965,,2.38688,9.87822,6.77,9.06781,7.45533,11.25131,7.29729,7.25,8.62346,9.2346,9.460645,10.165165,7.47,10.79722,10.60282,11.87501,
75%,23.245,7.255653,2.422845,24.1,7.58463,5.87993,6.22103,24.86,7.95012,,9.859935,10.02368,25.2,19.47818,23.969655,28.353175,10.70231,26.41,15.203413,16.020193,14.59002,17.457818,27.0,25.52938,26.92952,31.535075,
max,745898.0,10.19288,2.88444,835430.0,7.58463,6.00217,6.22103,896920.0,8.10059,,55.25822,11.13991,958307.0,98.25549,91.68136,90.41407,85.39676,1034076.0,93.68704,99.99305,97.70276,98.72121,1090693.0,99.52137,98.67698,100.0,


### Check for missing values

In [27]:
#Number of missing values in each column
edu_attain.isnull().sum()

country_name        3
series_name         5
1985            27212
1986            79043
1987            79052
1990            27209
1991            79054
1992            79052
1993            79054
1995            27171
1996            79052
1997            79055
1998            79008
1999            79046
2000            27085
2001            78830
2002            78871
2003            79000
2004            78918
2005            27018
2006            78851
2007            78769
2008            78689
2009            78591
2010            26259
2011            78126
2012            77949
2013            77176
2015            79055
dtype: int64

In [28]:
#Collect the rows whose country_name is missing
missing_country_mask = edu_attain['country_name'].isna()
edu_attain_nulls = edu_attain.loc[missing_country_mask]

In [29]:
#Inspect the rows with a missing country_name
edu_attain_nulls

Unnamed: 0,country_name,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
79050,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79051,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79052,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [30]:
#Index 79050, 79051, and 79052 are missing values for every variable
#This cell drops only index 79050; 79051 and 79052 are dropped in a later cell
edu_attain_drop = edu_attain.drop(79050)

In [31]:
#Re-check which rows still have a missing country_name after the first drop
edu_attain_drop_nulls = edu_attain_drop.loc[edu_attain_drop['country_name'].isna()]

In [32]:
#Inspect the rows that still have a missing country_name
edu_attain_drop_nulls

Unnamed: 0,country_name,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
79051,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79052,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [33]:
#Drop the remaining rows in which every variable is missing (index 79051 and 79052)
#dropna(how='all') does nothing when the cell is re-run, whereas dropping the
#same fixed index labels a second time would raise a KeyError
edu_attain_drop = edu_attain_drop.dropna(how='all')

In [34]:
#Verify that no rows with a missing country_name remain
edu_attain_drop.loc[edu_attain_drop['country_name'].isna()]

Unnamed: 0,country_name,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015


In [35]:
#List the rows whose series_name is missing
edu_attain_drop.loc[edu_attain_drop['series_name'].isna()]

Unnamed: 0,country_name,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
79053,Data from database: Education Statistics: Educ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79054,Last Updated: 10/20/2015,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [36]:
#Index 79053 and 79054 hold the export's source and "last updated" notes rather than
#data of value to the analysis, so both rows are removed
footer_rows = [79053, 79054]
edu_attain_drop2 = edu_attain_drop.drop(index=footer_rows)

In [38]:
#Verify that no rows with a missing series_name remain
edu_attain_drop2.loc[edu_attain_drop2['series_name'].isna()]

Unnamed: 0,country_name,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015


### Check for full duplicates

In [39]:
#Collect every row that exactly repeats an earlier row
repeated_row_mask = edu_attain_drop2.duplicated()
edu_attain_dups = edu_attain_drop2.loc[repeated_row_mask]
edu_attain_dups

Unnamed: 0,country_name,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015


#### There are no full duplicates in this data frame

### Check for other inconsistencies

In [41]:
#Change "country_name" to "country" to be more consistent with other data sets
#A renamed copy is bound back to the name instead of mutating the frame in place
edu_attain_drop2 = edu_attain_drop2.rename(columns={'country_name': 'country'})

In [42]:
#Display the frame to confirm the column now appears as country
edu_attain_drop2

Unnamed: 0,country,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
0,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.33,,,0.44,,,,0.57,,,,,0.75,,,,,0.86,,,,,1.27,,,,
1,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1.03,,,1.26,,,,1.54,,,,,2.01,,,,,2.18,,,,,2.64,,,,
2,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.83,,,0.95,,,,1.26,,,,,1.92,,,,,1.01,,,,,2.45,,,,
3,Afghanistan,"Barro-Lee: Average years of primary schooling,...",2.34,,,2.22,,,,2.37,,,,,3.83,,,,,2.26,,,,,3.55,,,,
4,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.54,,,0.92,,,,0.94,,,,,1.26,,,,,2.00,,,,,1.29,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79045,Zimbabwe,UIS: Percentage of population age 25+ with som...,,,,,,,,,,,,,,,,,,,,,,,,,13.84323,,
79046,Zimbabwe,UIS: Percentage of population age 25+ with som...,,,,,,,,,,,,,,,,,,,,,,,,,16.68491,,
79047,Zimbabwe,UIS: Percentage of population age 25+ with unk...,,,,,,,,,,,,,,,,,,,,,,,,,11.99412,,
79048,Zimbabwe,UIS: Percentage of population age 25+ with unk...,,,,,,,,,,,,,,,,,,,,,,,,,5.77150,,


In [43]:
#Summary stats after removing the empty and footer rows
edu_attain_drop2.describe()

Unnamed: 0,1985,1986,1987,1990,1991,1992,1993,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
count,51843.0,12.0,3.0,51846.0,1.0,3.0,1.0,51884.0,3.0,0.0,47.0,9.0,51970.0,225.0,184.0,55.0,137.0,52037.0,204.0,286.0,366.0,464.0,52796.0,929.0,1106.0,1879.0,0.0
mean,253.059289,6.365242,1.98822,280.06165,7.58463,5.758003,6.22103,306.774826,7.79931,,8.811033,10.010412,334.780437,16.347225,16.282858,19.528067,11.737983,363.5199,13.172861,16.07378,14.849021,17.527544,386.6418,20.351652,19.483375,23.113742,
std,5812.705429,2.335996,0.883044,6525.945102,,0.24401,,7192.240329,0.30145,,13.700154,0.523507,7877.842661,21.523526,21.445675,22.526506,16.959871,8563.497,14.756343,22.696642,17.947568,22.434285,9146.338,25.186301,24.036373,28.481772,
min,0.0,3.7406,1.11897,0.0,7.58463,5.51415,6.22103,0.0,7.49769,,0.0,9.47552,0.0,0.0,0.0,0.16129,0.0,0.0,0.12933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,1.02,4.8412,1.54011,1.23,7.58463,5.63592,6.22103,1.54,7.64867,,0.348338,9.6581,1.71,3.09057,1.556833,6.910945,3.54881,1.93,5.855385,3.057243,5.408055,4.462345,2.01,2.74371,1.498102,1.050885,
50%,5.07,5.58128,1.96125,5.67,7.58463,5.75769,6.22103,6.25,7.79965,,2.38688,9.87822,6.77,9.06781,7.45533,11.25131,7.29729,7.25,8.62346,9.2346,9.460645,10.165165,7.47,10.79722,10.60282,11.87501,
75%,23.245,7.255653,2.422845,24.1,7.58463,5.87993,6.22103,24.86,7.95012,,9.859935,10.02368,25.2,19.47818,23.969655,28.353175,10.70231,26.41,15.203413,16.020193,14.59002,17.457818,27.0,25.52938,26.92952,31.535075,
max,745898.0,10.19288,2.88444,835430.0,7.58463,6.00217,6.22103,896920.0,8.10059,,55.25822,11.13991,958307.0,98.25549,91.68136,90.41407,85.39676,1034076.0,93.68704,99.99305,97.70276,98.72121,1090693.0,99.52137,98.67698,100.0,


In [45]:
#Drop variable 1997 because all of its data values are missing
#(2015, which is also entirely missing, is dropped in a later cell)
edu_attain_drop3 = edu_attain_drop2.drop(columns='1997')

In [46]:
#Confirm the 1997 column is gone
edu_attain_drop3.head()

Unnamed: 0,country,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015
0,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.33,,,0.44,,,,0.57,,,,0.75,,,,,0.86,,,,,1.27,,,,
1,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1.03,,,1.26,,,,1.54,,,,2.01,,,,,2.18,,,,,2.64,,,,
2,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.83,,,0.95,,,,1.26,,,,1.92,,,,,1.01,,,,,2.45,,,,
3,Afghanistan,"Barro-Lee: Average years of primary schooling,...",2.34,,,2.22,,,,2.37,,,,3.83,,,,,2.26,,,,,3.55,,,,
4,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.54,,,0.92,,,,0.94,,,,1.26,,,,,2.0,,,,,1.29,,,,


In [47]:
#Drop variables 1997 and 2015 because all data values are missing
#The result is derived from edu_attain_drop2 rather than by re-dropping from
#edu_attain_drop3, so re-running this cell does not raise a KeyError
edu_attain_drop3 = edu_attain_drop2.drop(columns=['1997', '2015'])

In [48]:
#Confirm the 2015 column is gone
edu_attain_drop3.head()

Unnamed: 0,country,series_name,1985,1986,1987,1990,1991,1992,1993,1995,1996,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.33,,,0.44,,,,0.57,,,,0.75,,,,,0.86,,,,,1.27,,,
1,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1.03,,,1.26,,,,1.54,,,,2.01,,,,,2.18,,,,,2.64,,,
2,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.83,,,0.95,,,,1.26,,,,1.92,,,,,1.01,,,,,2.45,,,
3,Afghanistan,"Barro-Lee: Average years of primary schooling,...",2.34,,,2.22,,,,2.37,,,,3.83,,,,,2.26,,,,,3.55,,,
4,Afghanistan,"Barro-Lee: Average years of primary schooling,...",0.54,,,0.92,,,,0.94,,,,1.26,,,,,2.0,,,,,1.29,,,


In [51]:
#Check country names and how many rows each country has
edu_attain_drop3['country'].value_counts()

Afghanistan      425
Panama           425
Netherlands      425
New Caledonia    425
New Zealand      425
                ... 
Germany          425
Ghana            425
Greece           425
Guatemala        425
Zimbabwe         425
Name: country, Length: 186, dtype: int64

In [52]:
#Check series names and how many rows each series has
edu_attain_drop3['series_name'].value_counts()

Barro-Lee: Average years of primary schooling, age 15+, female                                                               186
Barro-Lee: Percentage of population age 50-54 with secondary schooling. Total (Incomplete and Completed Secondary)           186
Barro-Lee: Percentage of population age 50-54 with primary schooling. Total (Incomplete and Completed Primary)               186
Barro-Lee: Percentage of population age 50-54 with primary schooling. Completed Primary                                      186
Barro-Lee: Percentage of population age 50-54 with no education                                                              186
                                                                                                                            ... 
Barro-Lee: Percentage of female population age 20-24 with tertiary schooling. Completed Tertiary                             186
Barro-Lee: Percentage of female population age 20-24 with secondary schooling. Total (Incomplete 

In [53]:
#Lift the row display limit so long outputs show every row
pd.options.display.max_rows = None

In [54]:
#Check series names again now that the row display limit is lifted
edu_attain_drop3['series_name'].value_counts()

Barro-Lee: Average years of primary schooling, age 15+, female                                                                              186
Barro-Lee: Percentage of population age 50-54 with secondary schooling. Total (Incomplete and Completed Secondary)                          186
Barro-Lee: Percentage of population age 50-54 with primary schooling. Total (Incomplete and Completed Primary)                              186
Barro-Lee: Percentage of population age 50-54 with primary schooling. Completed Primary                                                     186
Barro-Lee: Percentage of population age 50-54 with no education                                                                             186
Barro-Lee: Percentage of population age 45-49 with tertiary schooling. Total (Incomplete and Completed Tertiary)                            186
Barro-Lee: Percentage of population age 45-49 with tertiary schooling. Completed Tertiary                                               

In [56]:
#Check summary stats after dropping the 1997 and 2015 columns
edu_attain_drop3.describe()

Unnamed: 0,1985,1986,1987,1990,1991,1992,1993,1995,1996,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
count,51843.0,12.0,3.0,51846.0,1.0,3.0,1.0,51884.0,3.0,47.0,9.0,51970.0,225.0,184.0,55.0,137.0,52037.0,204.0,286.0,366.0,464.0,52796.0,929.0,1106.0,1879.0
mean,253.059289,6.365242,1.98822,280.06165,7.58463,5.758003,6.22103,306.774826,7.79931,8.811033,10.010412,334.780437,16.347225,16.282858,19.528067,11.737983,363.5199,13.172861,16.07378,14.849021,17.527544,386.6418,20.351652,19.483375,23.113742
std,5812.705429,2.335996,0.883044,6525.945102,,0.24401,,7192.240329,0.30145,13.700154,0.523507,7877.842661,21.523526,21.445675,22.526506,16.959871,8563.497,14.756343,22.696642,17.947568,22.434285,9146.338,25.186301,24.036373,28.481772
min,0.0,3.7406,1.11897,0.0,7.58463,5.51415,6.22103,0.0,7.49769,0.0,9.47552,0.0,0.0,0.0,0.16129,0.0,0.0,0.12933,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.02,4.8412,1.54011,1.23,7.58463,5.63592,6.22103,1.54,7.64867,0.348338,9.6581,1.71,3.09057,1.556833,6.910945,3.54881,1.93,5.855385,3.057243,5.408055,4.462345,2.01,2.74371,1.498102,1.050885
50%,5.07,5.58128,1.96125,5.67,7.58463,5.75769,6.22103,6.25,7.79965,2.38688,9.87822,6.77,9.06781,7.45533,11.25131,7.29729,7.25,8.62346,9.2346,9.460645,10.165165,7.47,10.79722,10.60282,11.87501
75%,23.245,7.255653,2.422845,24.1,7.58463,5.87993,6.22103,24.86,7.95012,9.859935,10.02368,25.2,19.47818,23.969655,28.353175,10.70231,26.41,15.203413,16.020193,14.59002,17.457818,27.0,25.52938,26.92952,31.535075
max,745898.0,10.19288,2.88444,835430.0,7.58463,6.00217,6.22103,896920.0,8.10059,55.25822,11.13991,958307.0,98.25549,91.68136,90.41407,85.39676,1034076.0,93.68704,99.99305,97.70276,98.72121,1090693.0,99.52137,98.67698,100.0


# Export supplementary edu_attain data set

In [55]:
#Export the cleaned edu_attain frame to the Prepared Data folder
edu_attain_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'edu_attain_clean.pkl')
edu_attain_drop3.to_pickle(edu_attain_pickle)