In [1]:
import numpy as np
import pandas as pd

In [2]:
keep = open("keep_fields.txt", "r").read().split()

pieces = pd.read_csv("pisa2012.csv",  encoding='latin1', chunksize=20000, usecols=keep, low_memory=False)

In [3]:
df = pd.DataFrame() 
df = pd.concat(piece for piece in pieces)

In [4]:
df.columns

Index(['OECD', 'NC', 'ST04Q01', 'ST06Q01', 'ST11Q01', 'ST11Q02', 'ST11Q03',
       'ST11Q04', 'ST11Q05', 'ST11Q06', 'ST55Q01', 'ST55Q02', 'ST55Q03',
       'ST55Q04', 'ST57Q01', 'ST57Q02', 'ST57Q03', 'ST57Q04', 'ST57Q05',
       'ST57Q06', 'ST70Q01', 'ST70Q02', 'ST70Q03', 'ST71Q01', 'ST72Q01',
       'OUTHOURS', 'WEALTH', 'PV1MATH', 'PV2MATH', 'PV3MATH', 'PV4MATH',
       'PV5MATH', 'PV1READ', 'PV2READ', 'PV3READ', 'PV4READ', 'PV5READ',
       'PV1SCIE', 'PV2SCIE', 'PV3SCIE', 'PV4SCIE', 'PV5SCIE', 'W_FSTUWT'],
      dtype='object')

In [6]:
new_names = open("new_field_names.txt", "r").read().split()

names_dict = {i:j for i, j in zip(keep, new_names)}

df.rename_axis(axis=1, mapper=names_dict, inplace=True)
df.replace("Nan", np.nan, inplace=True)
df.dtypes

is_oecd                     object
country                     object
gender                      object
age                        float64
mom_home                    object
dad_home                    object
bro_home                    object
sis_home                    object
granpts_home                object
others_home                 object
ec_lesson_lang              object
ec_lesson_maths             object
ec_lesson_science           object
ec_lesson_other             object
ec_time_Homework           float64
ec_time_Guided_Homework    float64
ec_time_Personal_Tutor     float64
ec_time_Training_Center    float64
ec_time_With_Parent        float64
ec_time_Computer           float64
classes_pwk_lang           float64
classes_pwk_maths          float64
classes_pwk_science        float64
classes_pwk_ALL            float64
class_size_lang            float64
outhours                   float64
wealth                     float64
PV1MATH                    float64
PV2MATH             

In [7]:
#drop Perm from the dataset as an oddity
df = df.query("country != 'Perm (Russian Federation)'")

In [8]:
for i,j in zip(keep, new_names):
    print(i,j)

OECD is_oecd
NC country
ST04Q01 gender
ST06Q01 age
W_FSTUWT student_wt
OUTHOURS outhours
WEALTH wealth
PV1MATH PV1MATH
PV2MATH PV2MATH
PV3MATH PV3MATH
PV4MATH PV4MATH
PV5MATH PV5MATH
PV1READ PV1READ
PV2READ PV2READ
PV3READ PV3READ
PV4READ PV4READ
PV5READ PV5READ
PV1SCIE PV1SCIE
PV2SCIE PV2SCIE
PV3SCIE PV3SCIE
PV4SCIE PV4SCIE
PV5SCIE PV5SCIE
ST11Q01 mom_home
ST11Q02 dad_home
ST11Q03 bro_home
ST11Q04 sis_home
ST11Q05 granpts_home
ST11Q06 others_home
ST55Q01 ec_lesson_lang
ST55Q02 ec_lesson_maths
ST55Q03 ec_lesson_science
ST55Q04 ec_lesson_other
ST57Q01 ec_time_Homework
ST57Q02 ec_time_Guided_Homework
ST57Q03 ec_time_Personal_Tutor
ST57Q04 ec_time_Training_Center
ST57Q05 ec_time_With_Parent
ST57Q06 ec_time_Computer
ST70Q01 classes_pwk_lang
ST70Q02 classes_pwk_maths
ST70Q03 classes_pwk_science
ST71Q01 classes_pwk_ALL
ST72Q01 class_size_lang


In [10]:
numeric_cols = df.select_dtypes(['int','float']).columns
numeric_cols.drop(['age'])

Index(['ec_time_Homework', 'ec_time_Guided_Homework', 'ec_time_Personal_Tutor',
       'ec_time_Training_Center', 'ec_time_With_Parent', 'ec_time_Computer',
       'classes_pwk_lang', 'classes_pwk_maths', 'classes_pwk_science',
       'classes_pwk_ALL', 'class_size_lang', 'outhours', 'wealth', 'PV1MATH',
       'PV2MATH', 'PV3MATH', 'PV4MATH', 'PV5MATH', 'PV1READ', 'PV2READ',
       'PV3READ', 'PV4READ', 'PV5READ', 'PV1SCIE', 'PV2SCIE', 'PV3SCIE',
       'PV4SCIE', 'PV5SCIE', 'student_wt'],
      dtype='object')

In [11]:
wt_df = df.copy()
wt_sums = df.groupby(['country'])['student_wt'].sum()
wt_df[numeric_cols] = df[numeric_cols].multiply(df.student_wt, axis=0)
for cntry in df.country.unique():
    wt_df.ix[wt_df.country == cntry, numeric_cols] = wt_df[wt_df.country==cntry][numeric_cols] / wt_sums[cntry]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


In [12]:
pd.pivot_table(aggfunc='sum', data=wt_df, columns='country')

country,Albania,Argentina,Australia,Austria,Belgium,Brazil,Bulgaria,Canada,Chile,China (Shanghai),...,Switzerland,Thailand,Tunisia,Turkey,United Arab Emirates,United Kingdom (Scotland),United Kingdom (excl.Scotland),United States of America,Uruguay,Viet Nam
PV1MATH,394.633657,388.497144,503.824073,505.618505,514.961445,388.479296,438.444905,518.014882,422.401332,611.658868,...,530.546199,426.561919,387.637954,447.398909,433.874811,498.585434,493.580078,480.831999,409.094722,511.185478
PV1READ,394.355724,396.120641,511.73034,489.058999,509.108529,406.380191,436.65639,523.412413,441.16397,568.311597,...,508.513977,441.206002,404.177728,475.013729,441.521632,506.383495,499.15671,497.478424,411.645412,508.166422
PV1SCIE,397.526623,405.520516,521.02167,504.915319,505.582224,401.626365,446.487741,525.517527,444.89161,579.037301,...,514.555116,444.294924,397.751528,463.127168,448.569731,514.693694,514.441563,497.303935,416.208583,528.485626
PV2MATH,393.70446,388.476665,504.038458,505.169039,515.053982,388.677439,438.661037,518.338435,422.669472,613.07009,...,531.334605,426.934146,388.241844,447.762489,434.391387,498.084881,493.680913,481.012264,409.308339,511.560304
PV2READ,393.970866,395.38526,511.465225,489.961519,508.967347,406.478349,435.482966,523.442142,441.434991,569.835449,...,509.317326,441.149518,404.832423,475.534589,441.891277,505.601505,499.066909,497.853007,411.594238,508.403877
PV2SCIE,397.794895,405.192647,521.200908,505.736471,505.365168,401.649575,446.312601,525.451966,445.251908,580.254234,...,515.435079,444.298652,398.726436,463.449621,448.571763,512.766815,514.102409,497.185896,415.589229,528.808323
PV3MATH,395.173407,388.052031,504.143602,505.585985,514.46578,388.29542,439.118471,517.749971,422.288857,612.958932,...,530.697461,426.307883,387.91905,447.785479,434.393124,498.589601,492.876086,481.173796,409.249915,511.28779
PV3READ,392.909612,395.938132,512.197917,489.253857,509.116516,406.637015,436.418019,523.298695,441.014219,570.076811,...,509.099198,440.967945,403.833189,475.533594,441.350415,505.760272,498.468441,498.18624,411.469076,508.390471
PV3SCIE,397.225529,405.46296,521.895689,505.600359,505.364685,401.368332,446.845108,525.515332,444.556163,580.386827,...,515.258548,443.350109,398.039322,463.330783,448.318633,513.007976,513.749332,497.524316,415.899465,528.333367
PV4MATH,394.400073,388.566969,504.589026,505.373986,514.918299,388.515021,438.945145,518.273054,423.407325,612.998387,...,530.71891,427.067085,387.90347,448.344854,433.633576,498.183877,493.554724,482.525975,409.066623,511.104593


In [13]:
to_replace = {"country":{'China (Shanghai) ':"Shanghai",
                         'Chinese Taipei ':"Taipei",
                         'Hong Kong-China':'Hong Kong',
                         'Macao-China':'Macao',
                         'Viet Nam ':'Vietnam',
                         'Republic of Korea':'South Korea'}
                        }
wt_df.replace(to_replace, inplace=True)
wt_df.country.unique()

array(['Albania', 'United Arab Emirates ', 'Argentina', 'Australia',
       'Austria', 'Belgium', 'Bulgaria ', 'Brazil ', 'Canada ',
       'Switzerland', 'Chile', 'Colombia ', 'Costa Rica ',
       'Czech Republic ', 'Germany', 'Denmark', 'Spain', 'Estonia',
       'Finland', 'France ', 'United Kingdom (excl.Scotland) ',
       'United Kingdom (Scotland)', 'Greece ', 'Hong Kong', 'Croatia',
       'Hungary', 'Indonesia', 'Ireland', 'Iceland', 'Israel ', 'Italy',
       'Jordan ', 'Japan', 'Kazakhstan ', 'South Korea', 'Liechtenstein',
       'Lithuania', 'Luxembourg ', 'Latvia ', 'Macao', 'Mexico ',
       'Montenegro ', 'Malaysia ', 'Netherlands', 'Norway ', 'New Zealand',
       'Peru ', 'Poland ', 'Portugal ', 'Qatar', 'Shanghai',
       'United States of America ', 'Romania', 'Russian Federation ',
       'Singapore', 'Serbia ', 'Slovak Republic', 'Slovenia ', 'Sweden ',
       'Taipei', 'Thailand ', 'Tunisia', 'Turkey ', 'Uruguay', 'Vietnam'], dtype=object)

In [14]:
for ctry in df.country.unique():
    print(ctry, '\n', df[df['country'] == ctry]['wealth'].mean(), wt_df[wt_df['country'] == ctry]['wealth'].sum())

Albania 
 -1.2073777173913027 -1.1202833785406625
United Arab Emirates  
 0.5777696959074313 0.570118003888539
Argentina 
 -0.8482395087000734 -0.8947815133598035
Australia 
 0.4964890326209601 0.5476251873225848
Austria 
 0.27314943986472423 0.27430913164324
Belgium 
 0.12019169704810857 0.07632347268504745
Bulgaria  
 -0.44463540669855706 -0.46659503748741993
Brazil  
 -1.1497398202010107 -1.0352049684343547
Canada  
 0.5084818980999852 0.4836150180776247
Switzerland 
 0.05789237668161155 0.08673693012738723
Chile 
 -0.2633043093270421 -0.5858233307194616
Colombia  
 -1.4055311804009238 -1.6119371867440149
Costa Rica  
 -1.3453844444444425 -1.3115420086684164
Czech Republic  
 -0.18931963814550173 -0.22178083469474255
Germany 
 0.20319038642789491 0.1765268740342935
Denmark 
 0.1955674286487293 0.3160696242138615
Spain 
 -0.017864286568116172 0.015836771150443293
Estonia 
 -0.17742616033755015 -0.19392338529666783
Finland 
 0.21161840448666439 0.26251863804563713
France  
 0.15035454

In [15]:
wt_df.to_csv("Pisa_reduced.csv")