## Creating a Series

In [4]:
# By passing a pythong list
import pandas as pd

s= pd.Series(['Wes McKinney', 'Creator of Pandas'],
            index = ['Person', 'who'])
print (s)

Person         Wes McKinney
who       Creator of Pandas
dtype: object


## Creating a DataFrame

In [11]:
# a DataFrame can be thought of as a dictionary of Series objects. 
# Order is not guaranteed because python dictionaries are not ordered. Needs to use OrderedDict or collections module

scientists = pd.DataFrame(
        {'Name': ['Rosaline Franklin', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born':['1920-07-25', '1876-06-13'],
        'Died':['1958-04-16', '1937-10-16'],
        'Age':[37, 61]})

print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  `937-10-16   61


In [10]:
# Specify the column order

scientists = pd.DataFrame(
    data = {'Occupation': ['Chemist', 'Statiscian'],
            'Born':['1920-07-25', '1876-06-13'],
            'Died':['1958-04-16', '1937-10-16'],
            'Age':[37, 61]},
    index = ['Rosaline Franklin', 'William Gosset'],
    columns = ['Occupation','Born', 'Died', 'Age'])
    
print(scientists)
             
             

                   Occupation        Born        Died  Age
Rosaline Franklin     Chemist  1920-07-25  1958-04-16   37
William Gosset     Statiscian  1876-06-13  `937-10-16   61


## The Series

In [17]:
# Create example Dataframe with a row index level
# When you use the loc attribute to subset the first row of scientists dataframe, you will get the series object back.

scientists = pd.DataFrame(
    data = {'Occupation': ['Chemist', 'Statistician'],
            'Born': ['1920-07-25', '1876-06-13'],
            'Died': ['1958-04-16', '1937-10-16'],
            'Age': [37, 61]},
    index = ['Rosaline Frankline', 'William Gosset'],
    columns = ['Occupation', 'Born', 'Died', 'Age'])

print(scientists)

                      Occupation        Born        Died  Age
Rosaline Frankline       Chemist  1920-07-25  1958-04-16   37
William Gosset      Statistician  1876-06-13  1937-10-16   61


In [35]:
first_row = scientists.loc['William Gosset'] 

print(type(first_row))
print(first_row)


<class 'pandas.core.series.Series'>
Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


In [24]:
print(first_row.index)

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [25]:
print(first_row.values)

['Statistician' '1876-06-13' '1937-10-16' 61]


In [26]:
print(first_row.keys)

<bound method Series.keys of Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object>


In [38]:
print(first_row.index[0])

Occupation


In [43]:
print(first_row.keys()[0])

Occupation


In [44]:
print(first_row.size) # number of element

4


In [45]:
print(first_row.T) # Transpose of the series

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


In [48]:
# Get the seriese of the 'age' column from sceientist dataframe
ages = scientists['Age']
print(ages)

Rosaline Frankline    37
William Gosset        61
Name: Age, dtype: int64


In [54]:
# When you have a vecor of numbers, there are common calculations 
print(ages.mean())
print(ages.min())
print(ages.max())
print(ages.std())

49.0
37
61
16.97056274847714


## Boolean Subsetting: Series

In [86]:
import pandas as pd
scientists = pd.read_csv('./scientists.csv')

In [17]:
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [18]:
ages = scientists['Age']
ages

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [19]:
print(ages.describe())

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64


In [14]:
print(ages.mean())

59.125


In [15]:
print(ages > ages.mean())

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool


In [20]:
names = scientists['Name']

In [21]:
print(names == 'William Gosset')

0    False
1     True
2    False
3    False
4    False
5    False
6    False
7    False
Name: Name, dtype: bool


In [22]:
manual_bool_values = [True, True, False, False, True, True, False, True]
print(ages[manual_bool_values])

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64


## Operations are automatically aligned and vectorized

In [23]:
# Vectors with same lengths
print(ages * ages)

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


In [24]:
print (ages + ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [26]:
# Vectors with intergers

print(ages + 100)
print(ages * 2)

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64
0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [27]:
# Vectors with different lengths
print(ages + pd.Series([1, 100]))

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


## Vectors with Common Index Labels(Automatic Alignment)

In [30]:
# In Pandas the data aligment is almost always automatic
# It shows as they appear in the dataset
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [33]:
rev_ages = ages.sort_index(ascending = False)
rev_ages

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64

In [34]:
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


## Boolean Subseeting: DataFrame
### The most common Pandas object is DataFrame and it can be though of as Python's way of storing spreadsheet-like data.

In [39]:
# boolean vectors will subset rows

print(scientists[scientists['Age'] > scientists['Age'].mean()])


                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [40]:
print(scientists[scientists['Age'] > scientists['Age'].median()])

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [41]:
print(scientists[scientists['Age'] > scientists['Age'].mode()])

                   Name        Born        Died  Age    Occupation
1        William Gosset  1876-06-13  1937-10-16   61  Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist


In [42]:
# 4 values passes as a bool vector
# 3 rows for the [true] returned

print(scientists.loc[[True, True, False, True]])

                Name        Born        Died  Age    Occupation
0  Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist
1     William Gosset  1876-06-13  1937-10-16   61  Statistician
3        Marie Curie  1867-11-07  1934-07-04   66       Chemist


## Operations are Automatically Aligned and Vectorized(Broadcasting)

In [46]:
first_half = scientists[:4]
second_half = scientists[4:]
print(first_half)

                   Name        Born        Died  Age    Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist
1        William Gosset  1876-06-13  1937-10-16   61  Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist


In [47]:
print(second_half)

            Name        Born        Died  Age          Occupation
4  Rachel Carson  1907-05-27  1964-04-14   56           Biologist
5      John Snow  1813-03-15  1858-06-16   45           Physician
6    Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist
7   Johann Gauss  1777-04-30  1855-02-23   77       Mathematician


In [48]:
# muliple by scalar
print(scientists * 2)
# if dataframe is all numeric values and more values need to be added on cell-by-ceel basis, you can use 'add' method.

                                       Name                  Born  \
0        Rosaline FranklinRosaline Franklin  1920-07-251920-07-25   
1              William GossetWilliam Gosset  1876-06-131876-06-13   
2  Florence NightingaleFlorence Nightingale  1820-05-121820-05-12   
3                    Marie CurieMarie Curie  1867-11-071867-11-07   
4                Rachel CarsonRachel Carson  1907-05-271907-05-27   
5                        John SnowJohn Snow  1813-03-151813-03-15   
6                    Alan TuringAlan Turing  1912-06-231912-06-23   
7                  Johann GaussJohann Gauss  1777-04-301777-04-30   

                   Died  Age                            Occupation  
0  1958-04-161958-04-16   74                        ChemistChemist  
1  1937-10-161937-10-16  122              StatisticianStatistician  
2  1910-08-131910-08-13  180                            NurseNurse  
3  1934-07-041934-07-04  132                        ChemistChemist  
4  1964-04-141964-04-14  112     

In [49]:
# Convert the strings to proper 'datetime'
print(scientists['Born'].dtype)

object


In [58]:
died_datetime = pd.to_datetime(scientists['Died'], format = '%Y-%m-%d') # %Y is uppercase
print(died_datetime)

0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]


## Directly Change a Column

In [59]:
print(scientists['Age'])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [62]:
import random

# Set a seed so the randomness is always the same
random.seed(42)
random.shuffle(scientists['Age'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  x[i], x[j] = x[j], x[i]


In [65]:
print(scientists['Age'])

0    77
1    90
2    37
3    61
4    41
5    45
6    66
7    56
Name: Age, dtype: int64


In [67]:
# The random_state is used to keep the 'randomization' less random

scientists['Age'] = scientists['Age'].\
    sample(len(scientists['Age']), random_state=24).\
    reset_index(drop=True) # Values stay randomized

print(scientists['Age'])

0    56
1    45
2    66
3    41
4    90
5    77
6    61
7    37
Name: Age, dtype: int64


In [72]:
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,56,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,45,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,66,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,41,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,90,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,77,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,61,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,37,Mathematician,1777-04-30,1855-02-23


In [79]:
# Subtracting dates gives the number of days
scientists['age_days_dt'] = (scientists['died_dt'] - scientists['born_dt'])
print(scientists)

                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   56             Chemist   
1        William Gosset  1876-06-13  1937-10-16   45        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   66               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   41             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   90           Biologist   
5             John Snow  1813-03-15  1858-06-16   77           Physician   
6           Alan Turing  1912-06-23  1954-06-07   61  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   37       Mathematician   

     born_dt    died_dt age_days_dt  age_years_dt  
0 1920-07-25 1958-04-16  13779 days          37.0  
1 1876-06-13 1937-10-16  22404 days          61.0  
2 1820-05-12 1910-08-13  32964 days          90.0  
3 1867-11-07 1934-07-04  24345 days          66.0  
4 1907-05-27 1964-04-14  20777 days          56.0  
5 1

In [71]:
# Change the column names

scientists['born_dt'], scientists['died_dt'] = (born_datetime, died_datetime)
print(scientists.head())

                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-25  1958-04-16   56       Chemist 1920-07-25   
1        William Gosset  1876-06-13  1937-10-16   45  Statistician 1876-06-13   
2  Florence Nightingale  1820-05-12  1910-08-13   66         Nurse 1820-05-12   
3           Marie Curie  1867-11-07  1934-07-04   41       Chemist 1867-11-07   
4         Rachel Carson  1907-05-27  1964-04-14   90     Biologist 1907-05-27   

     died_dt  
0 1958-04-16  
1 1937-10-16  
2 1910-08-13  
3 1934-07-04  
4 1964-04-14  


In [80]:
# We can convert the value to just the year using the astype method

scientists['age_years_dt'] = scientists['age_days_dt'].astype('timedelta64[Y]')
print(scientists)

                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   56             Chemist   
1        William Gosset  1876-06-13  1937-10-16   45        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   66               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   41             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   90           Biologist   
5             John Snow  1813-03-15  1858-06-16   77           Physician   
6           Alan Turing  1912-06-23  1954-06-07   61  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   37       Mathematician   

     born_dt    died_dt age_days_dt  age_years_dt  
0 1920-07-25 1958-04-16  13779 days          37.0  
1 1876-06-13 1937-10-16  22404 days          61.0  
2 1820-05-12 1910-08-13  32964 days          90.0  
3 1867-11-07 1934-07-04  24345 days          66.0  
4 1907-05-27 1964-04-14  20777 days          56.0  
5 1

## Dropping Values

In [82]:
# you can either select all columns by using colum subsetting or select columns to drop with drop method on dataframe

print(scientists.columns)

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt',
       'age_days_dt', 'age_years_dt'],
      dtype='object')


In [84]:
# drop the shuffled age column 
# you provide the axis=1 argument to drop column-wise

scientist_dropped = scientists.drop(['Age'], axis =1)
print(scientist_dropped.columns)

Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt',
       'age_days_dt', 'age_years_dt'],
      dtype='object')


## Exploring and Importing Data


In [88]:
# Pickle - This is Python's way of serializing and saving ddata in a binary format reading pickle data 
# is also backwards compatible. 
# pickle files are saved with extensons of .p, .pkl or .pickle

names = scientists['Name']
print (names)

# pass in a string to the path you want to save
names.to_pickle('./scientists_names_series.pickle')

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object


In [90]:
# to read in pickled data

scientist_names_from_pickle = pd.read_pickle('./scientists_names_series.pickle')
scientist_names_from_pickle

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

In [94]:
# same method cna be used on DataFrame object

scientists.to_pickle('./scientists_df.pickle')
pickled_df=pd.read_pickle('./scientists_df.pickle')
pickled_df

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [95]:
# to save a series into a CSV

names.to_csv('./sceientist_names_series.csv')

In [96]:
# to save a dataframe into a TSV

scientists.to_csv('./scientists_df.tsv', sep='\t')

In [101]:
# Series data structue doesn't have an explicit to_excel method.
# It needs to convert the series into a one-column DataFrame.
# Convert the SEries into a DataFrame before saving Excel.

names_df = names.to_frame()  # Contert the series into DF
import xlwt
names_df.to_excel('./scientists_names_series_df.xls')

import openpyxl
names_df.to_excel('./scientists_names_series_df.xlsx')


# Saving a DataFrame into excel format

scientists.to_excel('./scientists_df.xlsx',
                    sheet_name='scientists',
                    index=False)