In [2]:
import pandas as pd

Chapter 2 - Pandas Data Structures Basics

In [3]:
# Build a small two-row DataFrame by hand: one dict entry per column,
# with the scientists' names used as the row index labels and the
# column order pinned explicitly.
scientists = pd.DataFrame(
    {
        "Occupation": ["Chemist", "Statistician"],
        "Born": ["1920-07-25", "1876-06-13"],
        "Died": ["1958-04-16", "1937-10-16"],
        "Age": [37, 61],
    },
    index=["Rosaline Franklin", "William Gosset"],
    columns=["Occupation", "Born", "Died", "Age"],
)
print(scientists)

                     Occupation        Born        Died  Age
Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


In [4]:
# Select a row by its index label with .loc; a single label comes back as a Series.
# Note: 'William Gosset' is the second entry in the index, so `first_row` is
# only a name for the selection -- it is not the row at position 0.
first_row = scientists.loc['William Gosset']
print(type(first_row))

<class 'pandas.core.series.Series'>


In [5]:
# Show the selected row: the column names become the Series index and the
# row label becomes its Name (dtype is object because the values are mixed).
print(first_row)

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


In [6]:
# The index of a row Series holds the DataFrame's column labels
print(first_row.index)

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [7]:
# .values exposes the underlying data as an array, without the labels
print(first_row.values)

['Statistician' '1876-06-13' '1937-10-16' 61]


In [8]:
# Get the first index label via the .index attribute
print(first_row.index[0])

Occupation


In [9]:
# Get the first index label via the .keys() method
# (gives the same label as the .index attribute used above)
print(first_row.keys()[0])

Occupation


In [10]:
# Pull out the 'Age' column; a single column is returned as a Series
ages = scientists['Age']
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [11]:
# Mean of the ages
print(ages.mean())

49.0


In [12]:
# Minimum (youngest) age
print(ages.min())

37


In [13]:
# Maximum (oldest) age
print(ages.max())

61


In [14]:
# Standard deviation of the ages. This is the sample standard deviation
# (divides by n - 1): for 37 and 61 it is sqrt((12**2 + 12**2) / 1) ~= 16.97.
print(ages.std())

16.97056274847714


In [18]:
# Replace the hand-built frame with the full dataset read from a CSV file.
# The path is relative, so 'scientists.csv' must sit in the notebook's
# working directory. No index column is specified: the names land in a
# regular 'Name' column and the rows get a default integer index.
scientists = pd.read_csv('scientists.csv')
print(scientists)

                   Name        Born        Died  Age          Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist
1        William Gosset  1876-06-13  1937-10-16   61        Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist
5             John Snow  1813-03-15  1858-06-16   45           Physician
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician


In [17]:
# Re-extract the 'Age' column, now from the CSV-backed frame
ages = scientists['Age']
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) in a single call
ages.describe()

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64

In [20]:
# The mean on its own; it matches the 'mean' row reported by describe()
ages.mean()

59.125

In [21]:
# Boolean-mask the Series: keep only the ages that are above the mean age
older_than_average = ages > ages.mean()
print(ages[older_than_average])

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


In [22]:
# A DataFrame has three parts: .index, .columns and .values
# (1) .index -- the row labels
scientists.index

RangeIndex(start=0, stop=8, step=1)

In [23]:
# (2) .columns -- the column labels
scientists.columns

Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')

In [24]:
# (3) .values -- the data as a 2-D array
# (object dtype here, since the columns hold a mix of strings and integers)
scientists.values

array([['Rosaline Franklin', '1920-07-25', '1958-04-16', 37, 'Chemist'],
       ['William Gosset', '1876-06-13', '1937-10-16', 61, 'Statistician'],
       ['Florence Nightingale', '1820-05-12', '1910-08-13', 90, 'Nurse'],
       ['Marie Curie', '1867-11-07', '1934-07-04', 66, 'Chemist'],
       ['Rachel Carson', '1907-05-27', '1964-04-14', 56, 'Biologist'],
       ['John Snow', '1813-03-15', '1858-06-16', 45, 'Physician'],
       ['Alan Turing', '1912-06-23', '1954-06-07', 41,
        'Computer Scientist'],
       ['Johann Gauss', '1777-04-30', '1855-02-23', 77, 'Mathematician']],
      dtype=object)

In [25]:
# A boolean vector inside [] subsets rows (not columns):
# keep the scientists whose age is above the mean age.
age_column = scientists['Age']
is_above_mean = age_column > age_column.mean()
print(scientists[is_above_mean])

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [29]:
# Parse the 'Born' strings into datetimes, stating the year-month-day format explicitly.
# The result is kept in a separate Series; the DataFrame column is not modified here.
born_datetime = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
print(born_datetime)

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]


In [30]:
# Parse the 'Died' strings into datetimes, stating the year-month-day format explicitly.
# The result is kept in a separate Series; the DataFrame column is not modified here.
died_datetime = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')
print(died_datetime)

0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]


In [31]:
# Check the column dtypes: 'Born' and 'Died' are still object, because the
# parsed datetimes above were stored in separate variables, not in the frame.
print(scientists.dtypes)

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object


In [34]:
# Overwrite the string 'Born' and 'Died' columns with parsed datetimes.
# The year-month-day format is passed explicitly, matching the standalone
# conversions earlier in the notebook, so parsing does not depend on
# pandas' format inference and a malformed date fails loudly.
scientists['Born'] = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
scientists['Died'] = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')

# Verify the dtypes: both columns should now be datetime64[ns]
print(scientists.dtypes)

Name                  object
Born          datetime64[ns]
Died          datetime64[ns]
Age                    int64
Occupation            object
dtype: object


In [None]:
# Multiple assignment syntax: tuple unpacking adds both parsed date Series
# as new columns in a single statement.
scientists['born_dt'], scientists['died_dt'] = born_datetime, died_datetime
print(scientists.head())

                   Name       Born       Died  Age    Occupation    born_dt  \
0     Rosaline Franklin 1920-07-25 1958-04-16   37       Chemist 1920-07-25   
1        William Gosset 1876-06-13 1937-10-16   61  Statistician 1876-06-13   
2  Florence Nightingale 1820-05-12 1910-08-13   90         Nurse 1820-05-12   
3           Marie Curie 1867-11-07 1934-07-04   66       Chemist 1867-11-07   
4         Rachel Carson 1907-05-27 1964-04-14   56     Biologist 1907-05-27   

     died_dt  
0 1958-04-16  
1 1937-10-16  
2 1910-08-13  
3 1934-07-04  
4 1964-04-14  


In [37]:
# Directly change a column -- first look at the current 'Age' values
print(scientists['Age'])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [39]:
# Try to shuffle the ages (random_state fixes the permutation).
# NOTE: this assignment does not actually change the column -- the sampled
# Series keeps its index labels and pandas aligns on them when assigning.
# The following cells show the unchanged result and the .values fix.
scientists["Age"] = scientists["Age"].sample(frac=1, random_state=42)

In [40]:
# Attempt the shuffle again and print the outcome.
# The sampled Series still carries its original index labels, so assigning it
# back re-aligns every value to its old row and the column comes out unchanged.
reordered_ages = scientists['Age'].sample(frac=1, random_state=42)
scientists['Age'] = reordered_ages
print(scientists['Age'])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [42]:
# Shuffle for real: .values strips the index from the sampled Series, so the
# assignment is positional and is not auto-aligned back onto the row labels.
shuffled_ages = scientists['Age'].sample(frac=1, random_state=42)
scientists['Age'] = shuffled_ages.values
print(scientists['Age'])

0    45
1    56
2    61
3    41
4    37
5    90
6    77
7    66
Name: Age, dtype: int64


In [45]:
# Calculate the real age: subtracting the two datetime columns gives the
# lifespan as a timedelta column (displayed in days).
# NOTE(review): the printed output also lists an 'age_days_dt' column that no
# visible cell creates -- presumably left over from an earlier run of this
# cell under a different name. Confirm, and drop or recreate it so that
# Restart Kernel -> Run All reproduces the output.
scientists['age_days'] = (scientists['died_dt'] - scientists['born_dt'])
print(scientists)

                   Name       Born       Died  Age          Occupation  \
0     Rosaline Franklin 1920-07-25 1958-04-16   45             Chemist   
1        William Gosset 1876-06-13 1937-10-16   56        Statistician   
2  Florence Nightingale 1820-05-12 1910-08-13   61               Nurse   
3           Marie Curie 1867-11-07 1934-07-04   41             Chemist   
4         Rachel Carson 1907-05-27 1964-04-14   37           Biologist   
5             John Snow 1813-03-15 1858-06-16   90           Physician   
6           Alan Turing 1912-06-23 1954-06-07   77  Computer Scientist   
7          Johann Gauss 1777-04-30 1855-02-23   66       Mathematician   

     born_dt    died_dt age_days_dt   age_days  
0 1920-07-25 1958-04-16  13779 days 13779 days  
1 1876-06-13 1937-10-16  22404 days 22404 days  
2 1820-05-12 1910-08-13  32964 days 32964 days  
3 1867-11-07 1934-07-04  24345 days 24345 days  
4 1907-05-27 1964-04-14  20777 days 20777 days  
5 1813-03-15 1858-06-16  16529 days 165

In [49]:
# Dropping values: list the current columns before removing one
print(scientists.columns)

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt',
       'age_days_dt', 'age_days'],
      dtype='object')


In [50]:
# Remove the 'Age' column. The result is bound to a new name, and the
# remaining column labels are printed to confirm 'Age' is gone.
scientists_dropped = scientists.drop(columns=['Age'])
print(scientists_dropped.columns)

Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt',
       'age_days_dt', 'age_days'],
      dtype='object')
