# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Creating-a-series" data-toc-modified-id="Creating-a-series-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Creating a series</a></div><div class="lev1 toc-item"><a href="#Creating-a-DataFrame" data-toc-modified-id="Creating-a-DataFrame-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Creating a DataFrame</a></div><div class="lev1 toc-item"><a href="#The-Series" data-toc-modified-id="The-Series-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>The Series</a></div><div class="lev1 toc-item"><a href="#Aligned-vectorized-operations" data-toc-modified-id="Aligned-vectorized-operations-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Aligned vectorized operations</a></div><div class="lev2 toc-item"><a href="#automatic-index-alignment" data-toc-modified-id="automatic-index-alignment-41"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>automatic index alignment</a></div><div class="lev1 toc-item"><a href="#Data-Frame" data-toc-modified-id="Data-Frame-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Data Frame</a></div><div class="lev1 toc-item"><a href="#Make-changes-to-series-and-dataframes" data-toc-modified-id="Make-changes-to-series-and-dataframes-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Make changes to series and dataframes</a></div>

# Creating a series

In [20]:
import pandas as pd

In [21]:
# a Series built from a plain Python list gets a default integer index
s = pd.Series(data=['banana', 42])

In [22]:
# display the Series: the mixed str/int values are stored with dtype object
s

0    banana
1        42
dtype: object

In [23]:
# Supply explicit index labels (instead of the default 0, 1, ...)
# by passing a Python list to the `index` argument.
s = pd.Series(
    data=['Wes McKinney', 'Creator of Pandas'],
    index=['Person', 'Who'],
)
s

Person         Wes McKinney
Who       Creator of Pandas
dtype: object

# Creating a DataFrame

In [24]:
# Build a DataFrame from a dict: each key becomes a column label and
# each list supplies that column's values. Rows get a default integer index.
scientists = pd.DataFrame(
    data={
        'Name': ['Rosaline Franklin', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    }
)
scientists

Unnamed: 0,Age,Born,Died,Name,Occupation
0,37,1920-07-25,1958-04-16,Rosaline Franklin,Chemist
1,61,1876-06-13,1937-10-16,William Gosset,Statistician


In [13]:
# Same data, but the scientists' names become the row index and the
# column order is fixed explicitly through `columns`.
scientists = pd.DataFrame(
    data={
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    },
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Died', 'Age'],
)

scientists

Unnamed: 0,Occupation,Born,Died,Age
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [17]:
# ordered dict
from collections import OrderedDict

In [18]:
# An OrderedDict of (column label, values) pairs keeps the columns in
# exactly the order they are listed here.
scientists = pd.DataFrame(
    OrderedDict(
        [
            ('Name', ['Rosaline Franklin', 'William Gosset']),
            ('Occupation', ['Chemist', 'Statistician']),
            ('Born', ['1920-07-25', '1876-06-13']),
            ('Died', ['1958-04-16', '1937-10-16']),
            ('Age', [37, 61]),
        ]
    )
)

scientists

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


# The Series

In [25]:
# Example DataFrame for the Series section:
# the scientists' names serve as row index labels.
scientists = pd.DataFrame(
    data={
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    },
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Died', 'Age'],
)
print(scientists)

                     Occupation        Born        Died  Age
Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


In [26]:
# select by row index label
# NOTE: despite the name `first_row`, 'William Gosset' is the second
# index label of the example frame, not the first.
first_row = scientists.loc['William Gosset']
# selecting a single row label yields a Series
print(type(first_row))

<class 'pandas.core.series.Series'>


In [27]:
# the row's Series is indexed by the column labels and named after the row label
print(first_row)

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


In [28]:
# .index is an attribute holding the labels of the Series
print(first_row.index)

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [29]:
# .values is an attribute holding the data without the labels
print(first_row.values)

['Statistician' '1876-06-13' '1937-10-16' 61]


In [31]:
# keys() is a method that returns the same labels as the .index attribute
print(first_row.keys())

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [32]:
# get the first index label using the .index attribute
print(first_row.index[0])

Occupation


In [33]:
# get the first index label using the keys() method
print(first_row.keys()[0])

Occupation


See the pandas `Series` API reference for the full list of attributes and methods: http://pandas.pydata.org/pandas-docs/version/0.20.3/generated/pandas.Series.html

In [35]:
# a Series is ndarray-like

# get the 'Age' column
ages = scientists['Age']
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [36]:
# mean of the 'Age' column
print(ages.mean())

49.0


In [37]:
# smallest age
print(ages.min())

37


In [38]:
# largest age
print(ages.max())

61


In [39]:
# standard deviation; for the two ages 37 and 61 the printed value
# corresponds to the sample formula (divide by n - 1)
print(ages.std())

16.97056274847714


In [42]:
# Rebind `scientists` to the data set read from a CSV file, replacing the
# hand-built example frame. The path is relative to the notebook's
# working directory.
scientists = pd.read_csv('../data/scientists.csv')

In [43]:
# re-extract the 'Age' column now that `scientists` holds the CSV data
ages = scientists['Age']
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [44]:
# mean of all ages in the CSV data
print(ages.mean())

59.125


In [45]:
# boolean subsetting: keep only the ages greater than the mean
print(ages[ages > ages.mean()])

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


In [46]:
# the comparison on its own yields one True/False flag per element
print(ages > ages.mean())

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool


In [47]:
# the result of the comparison is itself a Series
print(type(ages > ages.mean()))

<class 'pandas.core.series.Series'>


In [48]:
# get index 0, 1, 4, 5, and 7
# (one flag per element; True keeps the element at that position)
manual_bool_values = [True, True, False, False, True, True, False, True]
print(ages[manual_bool_values])

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64


# Aligned vectorized operations

In [49]:
# adding two Series of the same length works element by element
print(ages + ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [51]:
# multiplication is element-wise as well
print(ages * ages)

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


In [52]:
# a scalar is applied to every element
print(ages + 100)

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64


In [54]:
# multiplying by the scalar 2 gives the same values as ages + ages
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [56]:
# vectors with different lengths:
# the Series are matched on index label; labels present in only one
# of them produce NaN, and the result is float64

print(ages + pd.Series([1, 100]))

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


In [57]:
# with other types, the shapes must match

import numpy as np

# Adding a length-2 ndarray to the longer Series cannot be broadcast and
# raises ValueError. The error is the point of this cell, so catch it and
# print the message; an uncaught exception would stop "Restart & Run All".
try:
    print(ages + np.array([1, 100]))
except ValueError as err:
    print('ValueError:', err)

ValueError: operands could not be broadcast together with shapes (8,) (2,) 

## automatic index alignment

In [59]:
# ages as they appear in the data (index labels in ascending order)
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [60]:
# sort by index label in descending order, which reverses the Series
rev_ages = ages.sort_index(ascending=False)
print(rev_ages)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64


In [61]:
# reference output to show index label alignment:
# every age doubled, to compare with ages + rev_ages
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [62]:
# note how we get the same values as ages * 2
# even though the vector is reversed:
# elements are paired by index label, not by position
print(ages + rev_ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


# Data Frame

In [63]:
# boolean vectors will subset rows:
# keep only the scientists whose Age is above the mean Age
print(scientists[scientists['Age'] > scientists['Age'].mean()])

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [64]:
# Boolean row selection with .loc needs one flag per row.
# Only the first 4 flags are spelled out; the mask is padded with False
# for every remaining row so its length matches the frame
# (current pandas raises IndexError for a boolean key of the wrong length).
# 3 rows returned: positions 0, 1 and 3.
row_mask = [True, True, False, True]
row_mask += [False] * (len(scientists) - len(row_mask))
print(scientists.loc[row_mask])

                Name        Born        Died  Age    Occupation
0  Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist
1     William Gosset  1876-06-13  1937-10-16   61  Statistician
3        Marie Curie  1867-11-07  1934-07-04   66       Chemist


# Make changes to series and dataframes

In [65]:
# before adding datetime columns, check how 'Born' is currently stored
# (object dtype, i.e. not yet a datetime dtype)
print(scientists['Born'].dtype)

object


In [66]:
# 'Died' is stored with the same object dtype
print(scientists['Died'].dtype)

object


In [67]:
# parse the 'Born' column into datetimes;
# `format` spells out the expected year-month-day layout
born_datetime = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')

In [68]:
# the parsed Series has dtype datetime64[ns]
print(born_datetime)

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]


In [69]:
# parse the 'Died' column into datetimes with the same year-month-day format
died_datetime = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')

In [72]:
# attach the two parsed datetime Series to the frame as new columns
scientists['born_dt'] = born_datetime
scientists['died_dt'] = died_datetime
print(scientists.head())

                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist 1920-07-25   
1        William Gosset  1876-06-13  1937-10-16   61  Statistician 1876-06-13   
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse 1820-05-12   
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist 1867-11-07   
4         Rachel Carson  1907-05-27  1964-04-14   56     Biologist 1907-05-27   

     died_dt  
0 1958-04-16  
1 1937-10-16  
2 1910-08-13  
3 1934-07-04  
4 1964-04-14  


In [73]:
# all the current columns in our data, including the two new *_dt columns
print(scientists.columns)

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt'], dtype='object')


In [74]:
# drop the 'Age' column
# you provide the axis=1 argument to drop column-wise
# the result is bound to a new name; `scientists` itself is not reassigned
scientists_dropped = scientists.drop(['Age'], axis=1)

In [75]:
# columns after dropping our column ('Age' is no longer listed)
print(scientists_dropped.columns)

Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt'], dtype='object')
