In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Start with a plain Python list of color names; note that 'red' appears twice.
colors = [
    'red', 'yellow', 'green', 'blue',
    'orange', 'red', 'violet', 'indigo',
]
# Passing the list to the pd.Series constructor builds a Series from it.
colors_series = pd.Series(data=colors)
colors_series

0       red
1    yellow
2     green
3      blue
4    orange
5       red
6    violet
7    indigo
dtype: object

In [3]:
# type() confirms that colors_series is now a pandas Series object.
print(f' Here is the color series type {type(colors_series)}')

 Here is the color series type <class 'pandas.core.series.Series'>


In [4]:
# No index was supplied when colors_series was built, so pandas generated one;
# the .index attribute exposes it.
colors_series.index

RangeIndex(start=0, stop=8, step=1)

In [5]:
# type() shows that this autogenerated index is a RangeIndex.
type(colors_series.index)

pandas.core.indexes.range.RangeIndex

In [6]:
# The data held by the Series (without its index) is available through the .values attribute.
colors_series.values

array(['red', 'yellow', 'green', 'blue', 'orange', 'red', 'violet',
       'indigo'], dtype=object)

In [7]:
# Accessing the data in colors_series through the .values attribute returns a NumPy array.
print(f' The color_series data type is a {type(colors_series.values)}')

 The color_series data type is a <class 'numpy.ndarray'>


In [8]:
# Create a one-dimensional NumPy array of integers. The value 40 appears twice, which gives
# the nlargest example later in the notebook a duplicated maximum to work with.
arr = np.array([5, 10, 15, 20, 25, 30, 35, 40, 40])


In [9]:
# Convert my NumPy array to a pandas Series; the values keep an integer dtype
# (int64 in the output below).
num_series = pd.Series(arr)
num_series

0     5
1    10
2    15
3    20
4    25
5    30
6    35
7    40
8    40
dtype: int64

In [10]:
# Build a Python dictionary that maps single-letter labels to float values.
data = {
    'a': 0.0,
    'b': 1.0,
    'c': 2.0,
    'd': 3.0,
    'e': 4.0,
    'f': 5.0,
}

# Hand the dictionary to pd.Series: the keys become the index labels and the values become the data.
diction_series = pd.Series(data=data)
diction_series

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
f    5.0
dtype: float64

In [11]:
# Confirm the conversion: type() should report a pandas Series.

print(f'My diction_series is a {type(diction_series)}.')

My diction_series is a <class 'pandas.core.series.Series'>.


In [12]:
# .head() returns the first rows of the Series; with no argument the default is the first 5 rows.
colors_series.head()

0       red
1    yellow
2     green
3      blue
4    orange
dtype: object

In [13]:
# Calling the .tail() method with n = 2 returns a Series containing the last two rows.
colors_series.tail(2)

6    violet
7    indigo
dtype: object

In [14]:
# The default for the `.sample()` method is one row. The row is chosen at random and no
# random_state is set here, so the output will generally differ each time this cell runs.
colors_series.sample()

4    orange
dtype: object

In [15]:
# `.head()`, `.tail()`, and `.sample()` each hand back a new Series;
# print the type of each result, one per line, to confirm.
print(
    type(colors_series.head()),
    type(colors_series.tail()),
    type(colors_series.sample()),
    sep='\n',
)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [16]:
# .astype(str) returns a copy of `num_series` with its values converted to strings;
# the printed dtype is object, as you can see below.
print(num_series.astype(str))

0     5
1    10
2    15
3    20
4    25
5    30
6    35
7    40
8    40
dtype: object


In [17]:
# However, since the result of .astype(str) was never assigned back, num_series itself
# still has the int64 data type.
print(num_series)

0     5
1    10
2    15
3    20
4    25
5    30
6    35
7    40
8    40
dtype: int64


In [18]:
# On a Series of strings, .describe() returns the total count, the number of unique values,
# the most frequent value (top), and how many times that value occurs (freq).
colors_series.describe()

count       8
unique      7
top       red
freq        2
dtype: object

In [19]:
# On a Series with a numeric data type, .describe() returns a Series of summary statistics.
num_series.describe()

count     9.000000
mean     24.444444
std      12.856041
min       5.000000
25%      15.000000
50%      25.000000
75%      35.000000
max      40.000000
dtype: float64

In [20]:
# By default, .value_counts() returns how many times each unique value occurs.

colors_series.value_counts()

red       2
yellow    1
orange    1
violet    1
indigo    1
green     1
blue      1
dtype: int64

In [21]:
# normalize=True returns the relative frequency (proportion) of each unique value
# instead of the raw count.
colors_series.value_counts(normalize=True)

red       0.250
yellow    0.125
orange    0.125
violet    0.125
indigo    0.125
green     0.125
blue      0.125
dtype: float64

In [22]:
# Adding ascending=True sorts the relative frequencies from smallest to largest,
# so the largest relative frequency is displayed last.
colors_series.value_counts(normalize=True, ascending=True)

blue      0.125
green     0.125
indigo    0.125
violet    0.125
orange    0.125
yellow    0.125
red       0.250
dtype: float64

In [23]:
# keep='all' returns every observation tied for the requested rank. The maximum (40) is
# duplicated, so nlargest(1) returns two rows; the minimum (5) is unique, so nsmallest(1)
# returns one row.
print(num_series.nlargest(1, keep='all'), num_series.nsmallest(1, keep='all'), sep='\n')

7    40
8    40
dtype: int64
0    5
dtype: int64


In [24]:
# Using `.sort_values()` on a Series of string values returns a new Series in ascending
# alphabetical order. The original index labels travel with their values.
colors_series.sort_values()

3      blue
2     green
7    indigo
4    orange
0       red
5       red
6    violet
1    yellow
dtype: object

In [25]:
# Setting ascending=False reverses the order, so these string values come back in reverse
# alphabetical order. (The same argument sorts numerical values from largest to smallest.)
colors_series.sort_values(ascending = False)

1    yellow
6    violet
5       red
0       red
4    orange
7    indigo
2     green
3      blue
dtype: object

In [26]:
# The comparison produces a boolean Series; .all() is True only if every value
# in my Series meets the condition (here: equals 'red').

(colors_series == 'red').all()

False

In [27]:
# .any() is True if at least one value in my Series meets the condition (here: equals 'red').

(colors_series == 'red').any()

True

In [28]:
# The same check works on numbers: does num_series contain any negative values?
(num_series < 0).any()

False

In [33]:
# The .str accessor applies the string method .capitalize() to each string value in the Series.
# The capitalized copy is stored in `bad_series`; colors_series itself is left unchanged.
bad_series = colors_series.str.capitalize()
bad_series


0       Red
1    Yellow
2     Green
3      Blue
4    Orange
5       Red
6    Violet
7    Indigo
dtype: object

In [38]:
# What if my colors were hyphenated like this for some terrible reason?
# .str.join('-') puts a hyphen between the characters of each string (e.g. 'R-e-d').
hyphenated_series = bad_series.str.join('-')
# .str.replace() returns a new Series instead of modifying the one it is called on, so the
# result has to be captured. regex=False makes pandas treat '-' as a literal character.
cleaned_series = hyphenated_series.str.replace('-', '', regex=False)
cleaned_series

0       Red
1    Yellow
2     Green
3      Blue
4    Orange
5       Red
6    Violet
7    Indigo
dtype: object

In [39]:
# Create a list of colors I want to check for in my `colors_series`.

my_colors = ['black', 'white', 'red']

In [41]:
# .isin() checks, value by value, whether each entry of colors_series appears in my_colors.
bools = colors_series.isin(my_colors)
bools

0     True
1    False
2    False
3    False
4    False
5     True
6    False
7    False
dtype: bool

In [42]:
# The `isin()` method returned a new boolean Series, which I assigned to `bools` above;
# type() confirms it is a Series.
type(bools)

pandas.core.series.Series

In [44]:
# I can pass the boolean Series into the indexing operator to return only the
# observations where the boolean Series is True.

colors_series[bools]

0    red
5    red
dtype: object

In [48]:
# I can skip the intermediate variable and pass a conditional directly into the indexing
# operator; here it returns only the even values.
num_series[num_series % 2 == 0]

1    10
3    20
5    30
7    40
8    40
dtype: int64

In [50]:
# Here I am passing the boolean Series `bools` as a selector to the .loc attribute called on my
# original colors_series from above. As you can see below, where the boolean Series has a True
# value, the observation from the original Series is returned.
colors_series.loc[bools]

0    red
5    red
dtype: object

In [52]:
# Here, I'm demonstrating that I can pass a conditional that returns a boolean Series directly
# as a selector into .loc. I don't have to assign the boolean Series to a variable first like
# I did above. Only the values greater than 15 are returned.
num_series.loc[num_series > 15]

3    20
4    25
5    30
6    35
7    40
8    40
dtype: int64

In [53]:
 # Slicing with labels in the index returns a range of observations. Unlike positional slicing,
 # the end label is included: 'c' is part of the result below.

diction_series['a': 'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [55]:
# Passing a list of index labels returns only the observations whose labels are in the list.
diction_series[['a', 'd', 'f']]

a    0.0
d    3.0
f    5.0
dtype: float64

In [60]:
# The .iloc attribute selects observations by integer position rather than by index label.
# When slicing, the start position is included and the stop position is NOT, so 0:5 returns
# positions 0 through 4. .iloc does not accept a boolean Series as a selector like .loc does.

# Print the full Series followed by its first five positions for comparison.
print(colors_series, colors_series.iloc[0:5], sep='\n')

0       red
1    yellow
2     green
3      blue
4    orange
5       red
6    violet
7    indigo
dtype: object
0       red
1    yellow
2     green
3      blue
4    orange
dtype: object


In [61]:
# I can also pass a list of integer positions to `.iloc` to pick and choose rows;
# selection is by position, so this works even when the index holds labels.
colors_series.iloc[[0,2,4]]

0       red
2     green
4    orange
dtype: object

In [63]:
# Our Series of color name strings has the object data type; dtype('O') is how that
# object dtype is displayed.
colors_series.dtype

dtype('O')

In [65]:
# Our Series of whole numbers has the int64 data type.

num_series.dtype

dtype('int64')

In [66]:
# `bools` is our Series of bool values; this is actually the NumPy bool dtype, which does not
# support missing values. (When cast to bool, None becomes False and np.nan becomes True.)

bools.dtype

dtype('bool')

In [67]:
# Our Series built from the dictionary of float values has the float64 data type.
diction_series.dtype

dtype('float64')

In [68]:
# The .shape attribute returns a tuple representing the rows and columns in a DataFrame.
# On a Series it returns a one-element tuple holding the number of rows.
colors_series.shape

(8,)

In [69]:
# .str.endswith('d') returns a boolean Series marking the strings that end with the letter `d`;
# passing it to .loc returns only those observations.
colors_series.loc[colors_series.str.endswith('d')]

0    red
5    red
dtype: object

In [71]:
# Here I put it all together and chain the `.upper()` string method onto the Series being
# returned by the selection. All in one step.
colors_series.loc[colors_series.str.endswith('d')].str.upper()

0    RED
5    RED
dtype: object

ModuleNotFoundError: No module named 'pydataset'