# BIG DIVE Intesa 3
## Pandas
by Stefania Delprete, TOP-IX  
stefania.delprete@top-ix.org 

https://www.linkedin.com/in/astrastefania   
https://twitter.com/astrastefania  

---

## Pandas

http://pandas.pydata.org  
https://github.com/pandas-dev/pandas

International Pandas Sprint https://python-sprints.github.io/pandas  
...also in Turin https://github.com/astrastefania/pandas-sprint-Turin  

--- 

In [1]:
import pandas as pd

## From dictionaries to Pandas dataframes

Source https://en.wikipedia.org/wiki/List_of_government_space_agencies  

In [2]:
# Government space agencies, written one record per agency:
# (agency code, full name, country, founding year)
agency_fields = ('agency_code', 'agency_name', 'country', 'year_founded')
agency_records = [
    ('ASI', 'Agenzia Spaziale Italiana', 'Italy', 1988),
    ('ESA', 'European Space Agency', 'Europe', 1975),
    ('NASA', 'National Aeronautics and Space Administration', 'United States', 1958),
    ('ROSCOSMOS', 'Russian Federal Space Agency', 'Russia', 1992),
    ('ISRO', 'Indian Space Research Organisation', 'India', 1969),
    ('ISA', 'Israeli Space Agency', 'Israel', 1983),
    ('JAXA', 'Japan Aerospace Exploration Agency', 'Japan', 2003),
]

# Transpose the records into a column-oriented dictionary: {column name: list of values}
agencies_dict = {
    field: list(values)
    for field, values in zip(agency_fields, zip(*agency_records))
}

In [3]:
# Build a DataFrame from the dictionary: each key becomes a column.
# Columns are also called 'variables' and rows 'observations'.

agencies_df = pd.DataFrame(agencies_dict)
agencies_df

Unnamed: 0,agency_code,agency_name,country,year_founded
0,ASI,Agenzia Spaziale Italiana,Italy,1988
1,ESA,European Space Agency,Europe,1975
2,NASA,National Aeronautics and Space Administration,United States,1958
3,ROSCOSMOS,Russian Federal Space Agency,Russia,1992
4,ISRO,Indian Space Research Organisation,India,1969
5,ISA,Israeli Space Agency,Israel,1983
6,JAXA,Japan Aerospace Exploration Agency,Japan,2003


In [4]:
# Check the type of the object we just created
type(agencies_df)

pandas.core.frame.DataFrame

In [5]:
# Uncomment to print the built-in documentation for DataFrame.head
# help(pd.DataFrame.head)

## Exploration of the Pandas DataFrame

In [6]:
agencies_df.head(2) # Shows the first two observations; head() shows 5 by default

Unnamed: 0,agency_code,agency_name,country,year_founded
0,ASI,Agenzia Spaziale Italiana,Italy,1988
1,ESA,European Space Agency,Europe,1975


In [7]:
agencies_df.tail(3) # Shows the last three observations; tail() also shows 5 by default

Unnamed: 0,agency_code,agency_name,country,year_founded
4,ISRO,Indian Space Research Organisation,India,1969
5,ISA,Israeli Space Agency,Israel,1983
6,JAXA,Japan Aerospace Exploration Agency,Japan,2003


In [8]:
agencies_df.shape # (number of rows, number of columns), i.e. (observations, variables)

(7, 4)

In [9]:
agencies_df.columns # Column names, returned as an Index object

Index(['agency_code', 'agency_name', 'country', 'year_founded'], dtype='object')

In [10]:
agencies_df.index # Row labels; a default RangeIndex here because no index was specified

RangeIndex(start=0, stop=7, step=1)

In [11]:
agencies_df.info() # Per-column dtype and non-null count, plus memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
agency_code     7 non-null object
agency_name     7 non-null object
country         7 non-null object
year_founded    7 non-null int64
dtypes: int64(1), object(3)
memory usage: 304.0+ bytes


In [12]:
agencies_df.describe() # Summary statistics for the numerical columns

Unnamed: 0,year_founded
count,7.0
mean,1981.142857
std,15.09336
min,1958.0
25%,1972.0
50%,1983.0
75%,1990.0
max,2003.0


In [13]:
# The result of describe() can be stored and inspected like any other object
stat = agencies_df.describe()

In [14]:
# The summary returned by describe() is itself a DataFrame
type(stat)

pandas.core.frame.DataFrame

In [15]:
# The row labels of the summary are the names of the statistics
stat.index

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')

## Slicing

In [16]:
# Select a single column with bracket notation; the result is a Series
agencies_df['country']

0            Italy
1           Europe
2    United States
3           Russia
4            India
5           Israel
6            Japan
Name: country, dtype: object

In [17]:
# Same column via attribute access (only possible when the name is a valid Python identifier)
agencies_df.country

0            Italy
1           Europe
2    United States
3           Russia
4            India
5           Israel
6            Japan
Name: country, dtype: object

In [18]:
# Column names can be kept in a list and reused for the selection
col_select = ['year_founded','country']

agencies_df[col_select]

Unnamed: 0,year_founded,country
0,1988,Italy
1,1975,Europe
2,1958,United States
3,1992,Russia
4,1969,India
5,1983,Israel
6,2003,Japan


In [19]:
agencies_df[['year_founded','country']] # Same selection with the list written inline (note the double brackets)

Unnamed: 0,year_founded,country
0,1988,Italy
1,1975,Europe
2,1958,United States
3,1992,Russia
4,1969,India
5,1983,Israel
6,2003,Japan


In [20]:
# An element-wise comparison yields a boolean Series (one True/False per row)
agencies_df.year_founded > 1960

0     True
1     True
2    False
3     True
4     True
5     True
6     True
Name: year_founded, dtype: bool

In [21]:
agencies_df[agencies_df.year_founded > 1960] # Keep only the rows where the condition is True

Unnamed: 0,agency_code,agency_name,country,year_founded
0,ASI,Agenzia Spaziale Italiana,Italy,1988
1,ESA,European Space Agency,Europe,1975
3,ROSCOSMOS,Russian Federal Space Agency,Russia,1992
4,ISRO,Indian Space Research Organisation,India,1969
5,ISA,Israeli Space Agency,Israel,1983
6,JAXA,Japan Aerospace Exploration Agency,Japan,2003


In [22]:
# Filtering returned a new object: the original frame still has all seven rows
agencies_df

Unnamed: 0,agency_code,agency_name,country,year_founded
0,ASI,Agenzia Spaziale Italiana,Italy,1988
1,ESA,European Space Agency,Europe,1975
2,NASA,National Aeronautics and Space Administration,United States,1958
3,ROSCOSMOS,Russian Federal Space Agency,Russia,1992
4,ISRO,Indian Space Research Organisation,India,1969
5,ISA,Israeli Space Agency,Israel,1983
6,JAXA,Japan Aerospace Exploration Agency,Japan,2003


In [23]:
# Saving a selection: keep only the rows whose agency_code equals 'ASI'
asi = agencies_df[agencies_df['agency_code'] == 'ASI']
asi

Unnamed: 0,agency_code,agency_name,country,year_founded
0,ASI,Agenzia Spaziale Italiana,Italy,1988


In [24]:
# A one-row selection made with a boolean mask is still a DataFrame
type(asi)

pandas.core.frame.DataFrame

### Selection by label `.loc[]` or by integer position `.iloc[]`

In [25]:
# .loc slices by label and includes BOTH endpoints: all rows, columns 'agency_code' through 'country'
agencies_df.loc[:,'agency_code':'country']

Unnamed: 0,agency_code,agency_name,country
0,ASI,Agenzia Spaziale Italiana,Italy
1,ESA,European Space Agency,Europe
2,NASA,National Aeronautics and Space Administration,United States
3,ROSCOSMOS,Russian Federal Space Agency,Russia
4,ISRO,Indian Space Research Organisation,India
5,ISA,Israeli Space Agency,Israel
6,JAXA,Japan Aerospace Exploration Agency,Japan


In [26]:
# Rows from index label 1 onward, all columns
agencies_df.loc[1:,:]

Unnamed: 0,agency_code,agency_name,country,year_founded
1,ESA,European Space Agency,Europe,1975
2,NASA,National Aeronautics and Space Administration,United States,1958
3,ROSCOSMOS,Russian Federal Space Agency,Russia,1992
4,ISRO,Indian Space Research Organisation,India,1969
5,ISA,Israeli Space Agency,Israel,1983
6,JAXA,Japan Aerospace Exploration Agency,Japan,2003


In [27]:
# .iloc slices by integer position (end excluded): first two rows, columns from position 1 onward
agencies_df.iloc[:2,1:]

Unnamed: 0,agency_name,country,year_founded
0,Agenzia Spaziale Italiana,Italy,1988
1,European Space Agency,Europe,1975


In [28]:
# Rename every column at once by assigning a list with one name per column (modifies agencies_df)
agencies_df.columns = ['code', 'name', 'country', 'year_founded']

In [29]:
# Display the frame to check the new column names
agencies_df

Unnamed: 0,code,name,country,year_founded
0,ASI,Agenzia Spaziale Italiana,Italy,1988
1,ESA,European Space Agency,Europe,1975
2,NASA,National Aeronautics and Space Administration,United States,1958
3,ROSCOSMOS,Russian Federal Space Agency,Russia,1992
4,ISRO,Indian Space Research Organisation,India,1969
5,ISA,Israeli Space Agency,Israel,1983
6,JAXA,Japan Aerospace Exploration Agency,Japan,2003


In [30]:
# Selecting with a list also sets the column order of the result
df_ = agencies_df[['year_founded', 'name', 'country', 'code']]

In [31]:
# Display the reordered selection
df_

Unnamed: 0,year_founded,name,country,code
0,1988,Agenzia Spaziale Italiana,Italy,ASI
1,1975,European Space Agency,Europe,ESA
2,1958,National Aeronautics and Space Administration,United States,NASA
3,1992,Russian Federal Space Agency,Russia,ROSCOSMOS
4,1969,Indian Space Research Organisation,India,ISRO
5,1983,Israeli Space Agency,Israel,ISA
6,2003,Japan Aerospace Exploration Agency,Japan,JAXA


## Creating a new column

We can apply an anonymous lambda function to create a new column with `.apply()`

In [32]:
# New column: age of each agency in years, measured from 2019.
# Arithmetic on a Series is applied element-wise, so no per-row lambda is needed here;
# keep .apply() for logic that has no vectorized equivalent.
agencies_df['years'] = 2019 - agencies_df['year_founded']
agencies_df

Unnamed: 0,code,name,country,year_founded,years
0,ASI,Agenzia Spaziale Italiana,Italy,1988,31
1,ESA,European Space Agency,Europe,1975,44
2,NASA,National Aeronautics and Space Administration,United States,1958,61
3,ROSCOSMOS,Russian Federal Space Agency,Russia,1992,27
4,ISRO,Indian Space Research Organisation,India,1969,50
5,ISA,Israeli Space Agency,Israel,1983,36
6,JAXA,Japan Aerospace Exploration Agency,Japan,2003,16


In [33]:
# The new 'years' column is numeric, so it now appears in the summary as well
agencies_df.describe()

Unnamed: 0,year_founded,years
count,7.0,7.0
mean,1981.142857,37.857143
std,15.09336,15.09336
min,1958.0,16.0
25%,1972.0,29.0
50%,1983.0,36.0
75%,1990.0,47.0
max,2003.0,61.0


In [34]:
# Column-wise mean of the numeric columns only.
# numeric_only=True is required on pandas >= 2.0, where text columns are no longer
# silently skipped and the call would raise a TypeError.
agencies_df.mean(numeric_only=True)

year_founded    1981.142857
years             37.857143
dtype: float64

In [35]:
# Column-wise standard deviation of the numeric columns only.
# numeric_only=True is required on pandas >= 2.0, where text columns are no longer
# silently skipped and the call would raise a TypeError.
agencies_df.std(numeric_only=True)

year_founded    15.09336
years           15.09336
dtype: float64

In [36]:
# Column-wise maximum; for text columns this is the last value in lexicographic order
agencies_df.max() 

code                               ROSCOSMOS
name            Russian Federal Space Agency
country                        United States
year_founded                            2003
years                                     61
dtype: object

In [37]:
# Column-wise minimum; for text columns this is the first value in lexicographic order
agencies_df.min()

code                                  ASI
name            Agenzia Spaziale Italiana
country                            Europe
year_founded                         1958
years                                  16
dtype: object

In [38]:
# Convert the DataFrame back to a dictionary. With the default settings the result is
# nested as {column name: {index label: value}}, not the dict of lists we started from.
agencies_dict2 = agencies_df.to_dict()

In [39]:
# Inspect the nested dictionary
agencies_dict2

{'code': {0: 'ASI',
  1: 'ESA',
  2: 'NASA',
  3: 'ROSCOSMOS',
  4: 'ISRO',
  5: 'ISA',
  6: 'JAXA'},
 'name': {0: 'Agenzia Spaziale Italiana',
  1: 'European Space Agency',
  2: 'National Aeronautics and Space Administration',
  3: 'Russian Federal Space Agency',
  4: 'Indian Space Research Organisation',
  5: 'Israeli Space Agency',
  6: 'Japan Aerospace Exploration Agency'},
 'country': {0: 'Italy',
  1: 'Europe',
  2: 'United States',
  3: 'Russia',
  4: 'India',
  5: 'Israel',
  6: 'Japan'},
 'year_founded': {0: 1988,
  1: 1975,
  2: 1958,
  3: 1992,
  4: 1969,
  5: 1983,
  6: 2003},
 'years': {0: 31, 1: 44, 2: 61, 3: 27, 4: 50, 5: 36, 6: 16}}

## Sorting by a column

In [40]:
agencies_df.sort_values('years', ascending=False) # Sort by 'years', largest first; the original index labels are kept

Unnamed: 0,code,name,country,year_founded,years
2,NASA,National Aeronautics and Space Administration,United States,1958,61
4,ISRO,Indian Space Research Organisation,India,1969,50
1,ESA,European Space Agency,Europe,1975,44
5,ISA,Israeli Space Agency,Israel,1983,36
0,ASI,Agenzia Spaziale Italiana,Italy,1988,31
3,ROSCOSMOS,Russian Federal Space Agency,Russia,1992,27
6,JAXA,Japan Aerospace Exploration Agency,Japan,2003,16


## Importing a dataset

Happiness rank and scores by country in 2017.  
Source (plus full metadata description) https://www.kaggle.com/unsdsn/world-happiness/data

We will start by importing .csv files  
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

In [41]:
happy = pd.read_csv('data/happiness_2017.csv') # Import a .csv file (path is relative to the notebook's working directory)

In [42]:
# List the column names as they appear in the file
happy.columns

Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high',
       'Whisker.low', 'Economy..GDP.per.Capita.', 'Family',
       'Health..Life.Expectancy.', 'Freedom', 'Generosity',
       'Trust..Government.Corruption.', 'Dystopia.Residual'],
      dtype='object')

In [43]:
# Check dtypes and non-null counts for every column
happy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
Country                          155 non-null object
Happiness.Rank                   155 non-null int64
Happiness.Score                  155 non-null float64
Whisker.high                     155 non-null float64
Whisker.low                      155 non-null float64
Economy..GDP.per.Capita.         155 non-null float64
Family                           155 non-null float64
Health..Life.Expectancy.         155 non-null float64
Freedom                          155 non-null float64
Generosity                       155 non-null float64
Trust..Government.Corruption.    155 non-null float64
Dystopia.Residual                155 non-null float64
dtypes: float64(10), int64(1), object(1)
memory usage: 14.6+ KB


In [44]:
# First five rows (the default for head())
happy.head()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


## More manipulations

In [45]:
# Keep all rows (:) and only the four listed columns
happy_ = happy.loc[:, ['Country', 'Happiness.Rank', 'Happiness.Score','Freedom']]
happy_.head(3)

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Freedom
0,Norway,1,7.537,0.635423
1,Denmark,2,7.522,0.626007
2,Iceland,3,7.504,0.627163


In [46]:
# Changing all column names at once: the list must have one name per column, in order
happy_.columns = ['country', 'rank', 'score', 'freedom'] 
happy_.head(3)

Unnamed: 0,country,rank,score,freedom
0,Norway,1,7.537,0.635423
1,Denmark,2,7.522,0.626007
2,Iceland,3,7.504,0.627163


In [47]:
# Changing only one column name: rename() takes an {old name: new name} mapping
# and returns a new DataFrame, which we assign back to happy_
happy_ = happy_.rename(columns = {'freedom':'free'})
happy_.head(3)

Unnamed: 0,country,rank,score,free
0,Norway,1,7.537,0.635423
1,Denmark,2,7.522,0.626007
2,Iceland,3,7.504,0.627163


In [48]:
# More use of .loc[]: combine two boolean conditions with & (each one wrapped in parentheses)
# to keep the rows with score <= 4 and free >= 0.3
happy_filtered = happy_.loc[(happy_['score'] <= 4) & (happy_['free'] >= 0.3)]
happy_filtered.head(10)

Unnamed: 0,country,rank,score,free
135,Malawi,136,3.97,0.466915
137,Zimbabwe,138,3.875,0.336384
138,Lesotho,139,3.808,0.390661
141,Botswana,142,3.766,0.505196
142,Benin,143,3.657,0.425963
147,Liberia,148,3.533,0.332881
148,Guinea,149,3.507,0.348588
149,Togo,150,3.495,0.380426
150,Rwanda,151,3.471,0.581844
152,Tanzania,153,3.349,0.390018


### Deleting rows or columns

`.drop()` uses `axis=0` by default, which removes rows by index label; pass `axis=1` to remove columns.  
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [49]:
# Let's make a selection first: the ten rows with the highest score.
# Note: Australia and Sweden tie on score (7.284), so the sort key alone does not
# determine their relative order in the output.
score_top10 = happy_.sort_values('score', ascending=False).head(10)
score_top10

Unnamed: 0,country,rank,score,free
0,Norway,1,7.537,0.635423
1,Denmark,2,7.522,0.626007
2,Iceland,3,7.504,0.627163
3,Switzerland,4,7.494,0.620071
4,Finland,5,7.469,0.617951
5,Netherlands,6,7.377,0.585384
6,Canada,7,7.316,0.611101
7,New Zealand,8,7.314,0.614062
9,Australia,10,7.284,0.601607
8,Sweden,9,7.284,0.612924


In [50]:
# Dropping columns: pass the column names and axis=1; score_top10 itself is not modified
score_top10_simple = score_top10.drop(['rank', 'free'], axis=1)
score_top10_simple

Unnamed: 0,country,score
0,Norway,7.537
1,Denmark,7.522
2,Iceland,7.504
3,Switzerland,7.494
4,Finland,7.469
5,Netherlands,7.377
6,Canada,7.316
7,New Zealand,7.314
9,Australia,7.284
8,Sweden,7.284


In [51]:
# Dropping rows: with the default axis=0 the list holds index labels (0, 1 and 5 here).
# NOTE(review): despite the name, this drops from score_top10 (all four columns),
# not from score_top10_simple — confirm which frame was intended.
score_top10_simple_cut = score_top10.drop([0, 1, 5])
score_top10_simple_cut

Unnamed: 0,country,rank,score,free
2,Iceland,3,7.504,0.627163
3,Switzerland,4,7.494,0.620071
4,Finland,5,7.469,0.617951
6,Canada,7,7.316,0.611101
7,New Zealand,8,7.314,0.614062
9,Australia,10,7.284,0.601607
8,Sweden,9,7.284,0.612924


### Exploring another dataset

Access to electricity (% of population): access to electricity is the percentage of population with access to electricity. Electrification data are collected from industry, national surveys and international sources.  

Source (selection from) http://databank.worldbank.org/data/source/jobs

In [52]:
# Load the World Bank electricity-access export (path is relative to the notebook's working directory)
elect_access = pd.read_csv('data/WB_electricity_access.csv')

In [53]:
# Four descriptive columns followed by one column per year (2007-2016)
elect_access.columns

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '2007 [YR2007]', '2008 [YR2008]', '2009 [YR2009]', '2010 [YR2010]',
       '2011 [YR2011]', '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]',
       '2015 [YR2015]', '2016 [YR2016]'],
      dtype='object')

In [54]:
# Peek at the first two rows
elect_access.head(2)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016]
0,Access to electricity (% of population),EG.ELC.ACCS.ZS,Afghanistan,AFG,33.7486801147461,42.4,44.8548851013184,42.7,43.2220189082037,69.1,67.2595520019531,89.5,71.5,84.1371383666992
1,Access to electricity (% of population),EG.ELC.ACCS.ZS,Albania,ALB,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [55]:
# Always check the end of a file too: the last rows here are blank lines and footer text from the export
elect_access.tail(10)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016]
20,Access to electricity (% of population),EG.ELC.ACCS.ZS,Belize,BLZ,90.4782562255859,90.6150207519531,90.7728118896484,89.9172243747798,91.6817359855335,91.3423233032227,91.5541839599609,91.7720565795898,91.8,92.2143173217773
21,Access to electricity (% of population),EG.ELC.ACCS.ZS,Benin,BEN,29.3367118835449,30.6212387084961,31.9267921447754,34.2,36.9,38.4,37.2992057800293,34.1,40.0334777832031,41.4026145935059
22,Access to electricity (% of population),EG.ELC.ACCS.ZS,Bermuda,BMU,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
23,Access to electricity (% of population),EG.ELC.ACCS.ZS,Bhutan,BTN,71.8,70.0135040283203,74.4616470336914,73.2829108748978,83.4089965820313,91.5,92.4044189453125,96.9126434326172,98.4235229492188,100.0
24,Access to electricity (% of population),EG.ELC.ACCS.ZS,Bolivia,BOL,80.156567,84.674991,86.765597,84.2948150634766,88.335943,90.387375,89.505686,90.038729,91.522822,93.0391311645508
25,,,,,,,,,,,,,,
26,,,,,,,,,,,,,,
27,,,,,,,,,,,,,,
28,Data from database: Jobs,,,,,,,,,,,,,
29,Last Updated: 04/25/2018,,,,,,,,,,,,,


In [56]:
# Keep rows at positions 0-23 and four columns selected by position:
# country name (2), country code (3) and the last two year columns (2015, 2016)
# NOTE(review): tail(10) shows that row 24 (Bolivia) holds valid data but ':24' excludes it;
# only rows 25-29 are blank/footer lines — confirm whether ':25' was intended.
elect_access_ = elect_access.iloc[:24, [2,3,-2,-1]]
elect_access_.tail()

Unnamed: 0,Country Name,Country Code,2015 [YR2015],2016 [YR2016]
19,Belgium,BEL,100.0,100.0
20,Belize,BLZ,91.8,92.2143173217773
21,Benin,BEN,40.0334777832031,41.4026145935059
22,Bermuda,BMU,100.0,100.0
23,Bhutan,BTN,98.4235229492188,100.0


In [57]:
# Replace the verbose World Bank headers with short column names
elect_access_.columns = ['country', 'code', '2015', '2016'] 
elect_access_

Unnamed: 0,country,code,2015,2016
0,Afghanistan,AFG,71.5,84.1371383666992
1,Albania,ALB,100,100
2,Algeria,DZA,99.3367080688477,99.439567565918
3,American Samoa,ASM,..,..
4,Andorra,AND,100,100
5,Angola,AGO,42,40.5206069946289
6,Antigua and Barbuda,ATG,96.8262939453125,97.3546676635742
7,Arab World,ARB,88.5179673938546,88.7686540243445
8,Argentina,ARG,99.9952087402344,100
9,Armenia,ARM,100,100


In [58]:
# Row 3 (American Samoa) holds the placeholder '..' instead of numbers, so we remove it.
# errors='ignore' keeps this cell safe to re-run: without it, a second run raises a
# KeyError because label 3 has already been dropped.
elect_access_ = elect_access_.drop(3, errors='ignore')

In [59]:
# Check the dtypes: the year columns are listed as 'object' (text), not as numbers
elect_access_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23 entries, 0 to 23
Data columns (total 4 columns):
country    23 non-null object
code       23 non-null object
2015       23 non-null object
2016       23 non-null object
dtypes: object(4)
memory usage: 920.0+ bytes


## Arranging values with `.apply()`

In [60]:
# Convert the '2015' column from text to floating-point numbers.
# .astype(float) converts the whole column in one call; .apply() with a lambda is better
# reserved for custom logic that has no built-in equivalent.
elect_access_['2015'] = elect_access_['2015'].astype(float)

In [61]:
# Check the dtypes again after converting '2015'
elect_access_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23 entries, 0 to 23
Data columns (total 4 columns):
country    23 non-null object
code       23 non-null object
2015       23 non-null float64
2016       23 non-null object
dtypes: float64(1), object(3)
memory usage: 920.0+ bytes


In [62]:
# Round the '2015' values to two decimal places using the built-in vectorized
# Series.round() instead of a per-element lambda
elect_access_['2015'] = elect_access_['2015'].round(2)

In [63]:
# Check the result: '2015' is rounded, '2016' is untouched
elect_access_.head(3)

Unnamed: 0,country,code,2015,2016
0,Afghanistan,AFG,71.5,84.1371383666992
1,Albania,ALB,100.0,100.0
2,Algeria,DZA,99.34,99.439567565918


In [73]:
# Number of distinct values in each column
elect_access_.nunique()

country    23
code       23
2015       11
2016       10
dtype: int64

In [64]:
# Mean of the numeric columns for each country code (only '2015' is numeric here).
# numeric_only=True is required on pandas >= 2.0, where the text columns
# ('country', '2016') would otherwise make mean() raise a TypeError.
elect_access_.groupby('code').mean(numeric_only=True).head()

Unnamed: 0_level_0,2015
code,Unnamed: 1_level_1
ABW,95.24
AFG,71.5
AGO,42.0
ALB,100.0
AND,100.0


---
### `>>> Let's practice` 
Interactive session applying the first steps of a Data Science project on a new dataset

Source https://www.ibm.com/communities/analytics/watson-analytics-blog/american-time-use-survey/

In [74]:
# Load the 'lite' American Time Use Survey file (path is relative to the notebook's working directory)
survey = pd.read_csv('data/WA_American-Time-Use-Survey-lite.csv')

In [76]:
# First five rows; in a wide frame the middle columns are elided ('...') in the display
survey.head()

Unnamed: 0,Education Level,Age,Age Range,Employment Status,Gender,Children,Weekly Earnings,Year,Weekly Hours Worked,Sleeping,...,Caring for Children,Playing with Children,Job Searching,Shopping,Eating and Drinking,Socializing & Relaxing,Television,Golfing,Running,Volunteering
0,High School,51,50-59,Unemployed,Female,0,0,2005,0,825,...,0,0,0,0,40,180,120,0,0,0
1,Bachelor,42,40-49,Employed,Female,2,1480,2005,40,500,...,365,20,0,120,40,15,15,0,0,0
2,Master,47,40-49,Employed,Male,0,904,2005,40,480,...,0,0,0,15,85,214,199,0,0,0
3,Some College,21,20-29,Employed,Female,0,320,2005,40,705,...,0,0,0,105,30,240,240,0,0,0
4,High School,49,40-49,Not in labor force,Female,0,0,2005,0,470,...,0,0,0,0,35,600,40,0,0,0


In [77]:
# Rename a single column; columns not listed in the mapping keep their names
survey = survey.rename(columns = {'Weekly Earnings':'Weekly $'})

In [78]:
# Number of distinct values per column: a quick way to tell categorical from continuous variables
survey.nunique()

Education Level             11
Age                         67
Age Range                    8
Employment Status            3
Gender                       2
Children                    13
Weekly $                  2039
Year                         8
Weekly Hours Worked        110
Sleeping                  1028
Grooming                   252
Housework                  551
Food & Drink Prep          400
Caring for Children        585
Playing with Children      350
Job Searching              165
Shopping                   398
Eating and Drinking        402
Socializing & Relaxing    1097
Television                 976
Golfing                    135
Running                     79
Volunteering               501
dtype: int64

In [68]:
# NOTE(review): this cell's execution count (68) predates the rename cell (77), and the saved
# output still lists 'Weekly Earnings'; a top-to-bottom re-run should show 'Weekly $' instead.
survey.columns

Index(['Education Level', 'Age', 'Age Range', 'Employment Status', 'Gender',
       'Children', 'Weekly Earnings', 'Year', 'Weekly Hours Worked',
       'Sleeping', 'Grooming', 'Housework', 'Food & Drink Prep',
       'Caring for Children', 'Playing with Children', 'Job Searching',
       'Shopping', 'Eating and Drinking', 'Socializing & Relaxing',
       'Television', 'Golfing', 'Running', 'Volunteering'],
      dtype='object')

In [70]:
# Distinct values of a single column, in order of first appearance
survey['Education Level'].unique()

array(['High School', 'Bachelor', 'Master', 'Some College', '11th grade',
       'Associate Degree', '9th grade', '10th grade', 'Prof. Degree',
       '12th grade', 'Doctoral Degree'], dtype=object)