# Series Object
A single column data frame + index

In [1]:
import pandas as pd

In [2]:
# Creation in code (implicit int index)
beattles = pd.Series(['John', 'Paul', 'George', 'Ringo'], name='beattles')

#Creation with explicit index
beattles_songs_by_author = pd.Series([45,95,35,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')
beattles_songs_by_author

John      45
Paul      95
George    35
Ringo     46
Name: beattles songs by author, dtype: int64

In [3]:
# Accessing values by index label
beattles_songs_by_author = pd.Series([45,95,35,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')
beattles_songs_by_author.loc['George']

# Accessing values by index position (int)
print( beattles_songs_by_author.iloc[0] )
print()

#get value, or return a default value (here -1) if not found
print( beattles_songs_by_author.get('Eric', -1) )
print()
#iloc can be used to slice per normal pythonn
print ( beattles_songs_by_author.iloc[1::2] )
print()
#accessing last item
print( beattles_songs_by_author.iloc[-1] )


45

-1

Paul     95
Ringo    46
Name: beattles songs by author, dtype: int64

46


In [4]:
beattles_songs_by_author = pd.Series([45,95,35,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')

# Check if Series contains an index label
print('Ringo' in beattles_songs_by_author)

# Check if Series contains a value
print(1200 in beattles_songs_by_author.values)

True
False


In [5]:
#iterating over only the series values
beattles_songs_by_author = pd.Series([45,95,35,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')
for song_count in beattles_songs_by_author:
    print(song_count)
print()
#iterating over index labels AND series values
for author, song_count in beattles_songs_by_author.items():
    print("{}:{}".format(author,song_count))
print()
#iterating through index labels only
for author in beattles_songs_by_author.keys():
    print(author)

45
95
35
46

John:45
Paul:95
George:35
Ringo:46

John
Paul
George
Ringo


In [6]:
# filtering by boolean array
beattles_songs_by_author = pd.Series([45,95,34,46,90, 23], index= ['John', 'Paul', 'George', 'Ringo', 'Eric', 'Steve'], name='beattles songs by author')
low_writers = beattles_songs_by_author[beattles_songs_by_author < beattles_songs_by_author.median()]
print(low_writers)
print()

#this is equivalent but more explicit.
bool_array_mask = beattles_songs_by_author < beattles_songs_by_author.median()
print(bool_array_mask)
print()
low_writers = beattles_songs_by_author[bool_array_mask]
print(low_writers)
print()

#inverse of a mask
print('big writers')
print(beattles_songs_by_author[~bool_array_mask])
print()
#you can combine boolean arrays with logical operators
print('even low writers')
beattles_songs_by_author[ bool_array_mask & (beattles_songs_by_author % 2 ==0) ]  #this line will give wrong results without ()


John      45
George    34
Steve     23
Name: beattles songs by author, dtype: int64

John       True
Paul      False
George     True
Ringo     False
Eric      False
Steve      True
Name: beattles songs by author, dtype: bool

John      45
George    34
Steve     23
Name: beattles songs by author, dtype: int64

big writers
Paul     95
Ringo    46
Eric     90
Name: beattles songs by author, dtype: int64

even low writers


George    34
Name: beattles songs by author, dtype: int64

In [7]:
#Simple data update/append/delete
beattles_songs_by_author = pd.Series([45,95,34,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')

del beattles_songs_by_author['John'] #in practice this is usually done by filtering the row out, not deleting in place
print(beattles_songs_by_author)

#add a new row, two ways
new_beattles = pd.concat([beattles_songs_by_author, pd.Series([1000],['Eric'])])
new_beattles["Steve"]=2
print(new_beattles)
print()

#two ways to set a value in place
new_beattles["John"]=2000
print(new_beattles)


Paul      95
George    34
Ringo     46
Name: beattles songs by author, dtype: int64
Paul        95
George      34
Ringo       46
Eric      1000
Steve        2
dtype: int64

Paul        95
George      34
Ringo       46
Eric      1000
Steve        2
John      2000
dtype: int64


In [8]:
#broadcast math operations
beattles_songs_by_author = pd.Series([45,95,35,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')
print('songs written by each beattle +1')
print( beattles_songs_by_author+1)
print()

print('Songs per beattle*100')
print (beattles_songs_by_author*100) #and so on for other math operations
print()

#broadcast rounding
beattles_songs_by_author_decimals = beattles_songs_by_author * 1.00109
print (beattles_songs_by_author_decimals.round())


songs written by each beattle +1
John      46
Paul      96
George    36
Ringo     47
Name: beattles songs by author, dtype: int64

Songs per beattle*100
John      4500
Paul      9500
George    3500
Ringo     4600
Name: beattles songs by author, dtype: int64

John      45.0
Paul      95.0
George    35.0
Ringo     46.0
Name: beattles songs by author, dtype: float64


In [9]:
#broadcast for strings
beattles = pd.Series(['John', 'Paul', 'George', 'Ringo'], name='beattles')
print(beattles.str.len())
print(beattles.str.lower())
print(beattles.str.strip())
print(beattles.str.replace('J', 'X'))

0    4
1    4
2    6
3    5
Name: beattles, dtype: int64
0      john
1      paul
2    george
3     ringo
Name: beattles, dtype: object
0      John
1      Paul
2    George
3     Ringo
Name: beattles, dtype: object
0      Xohn
1      Paul
2    George
3     Ringo
Name: beattles, dtype: object


In [19]:
#map, the ultimate broadcast.   returns a new series resulting from applying a function to all values
beattles_songs_by_author = pd.Series([45,95,35,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')
def make_productive(x):
    return x* 1000

print(beattles_songs_by_author.map(make_productive))




John      45000
Paul      95000
George    35000
Ringo     46000
Name: beattles songs by author, dtype: int64


In [10]:
#converting series data types
beattles_songs_by_author = pd.Series([45,95,35,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')
beattles_songs_by_author.astype(str)


John      45
Paul      95
George    35
Ringo     46
Name: beattles songs by author, dtype: object

In [25]:
#index manipulation functions
beattles_songs_by_author = pd.Series([45,95,35,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')

#reset to a monotonously increasing integer index - the default one.
print(beattles_songs_by_author.reset_index(drop=True)) #drop means to drop the index, not convert to an extra column in a DataFrame
print()
#re-order row by new index
print(beattles_songs_by_author.reindex(['Ringo', 'George', 'John', 'Paul']))
print()


#rename indexes using dictionary
print(beattles_songs_by_author.rename({'Ringo': 'Bozo'}))
print()

#rename indexes using a lambda
print(beattles_songs_by_author.rename(lambda x : x.lower()))

0    45
1    95
2    35
3    46
Name: beattles songs by author, dtype: int64

Ringo     46
George    35
John      45
Paul      95
Name: beattles songs by author, dtype: int64

John      45
Paul      95
George    35
Bozo      46
Name: beattles songs by author, dtype: int64

john      45
paul      95
george    35
ringo     46
Name: beattles songs by author, dtype: int64


In [11]:
#Merging or updating based on two series.
beattles_songs_by_author = pd.Series([45,95,34,46], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')
really_productive_beattles = pd.Series([400,900,300,600], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')

the_who = pd.Series([4,9,3,66600], index= ['Pete', 'Keith', 'Roger', 'John'], name='the who songs by author')

#append one series on another
super_band = beattles_songs_by_author.add(the_who, fill_value=-1)
print(super_band)
print()

#use one series to update the value in another series using a custom function
def avg_productivity(v1, v2):
    return (v1+v2)/2

print( beattles_songs_by_author.combine(really_productive_beattles, avg_productivity) )
print()

#update the values in one index with the values in another.
beattles_songs_by_author.update(the_who)
print(beattles_songs_by_author)


George       33.0
John      66645.0
Keith         8.0
Paul         94.0
Pete          3.0
Ringo        45.0
Roger         2.0
dtype: float64

John      222.5
Paul      497.5
George    167.0
Ringo     323.0
Name: beattles songs by author, dtype: float64

John      66600
Paul         95
George       34
Ringo        46
Name: beattles songs by author, dtype: int64


In [12]:
# Descriptive Statistics
beattles_songs_by_author = pd.Series([45,95,34,46,90, 23], index= ['John', 'Paul', 'George', 'Ringo', 'Eric', 'Steve'], name='beattles songs by author')
print(beattles_songs_by_author.sum())
print(beattles_songs_by_author.mean())
print(beattles_songs_by_author.median())
print(beattles_songs_by_author.quantile(.2))
print(beattles_songs_by_author.max())
print(beattles_songs_by_author.std())
print(beattles_songs_by_author.var()) # variance
print()
print(beattles_songs_by_author.describe())

333
55.5
45.5
34.0
95
29.898160478531118
893.9

count     6.00000
mean     55.50000
std      29.89816
min      23.00000
25%      36.75000
50%      45.50000
75%      79.00000
max      95.00000
Name: beattles songs by author, dtype: float64


In [13]:
#duplicate handling
beattles_songs_by_author = pd.Series([34,95,34,17], index= ['John', 'John', 'George', 'Ringo'], name='beattles songs by author')

#check if any duplicate indexes exist
print('any duplicate indexes?')
print(beattles_songs_by_author.is_unique)

#print out if values are duplicated

print(beattles_songs_by_author.duplicated())
print()

#drop duplicate values
print(beattles_songs_by_author.drop_duplicates())
print()

#return unique values
print(beattles_songs_by_author.unique)
print()


any duplicate indexes?
False
John      False
John      False
George     True
Ringo     False
Name: beattles songs by author, dtype: bool

John     34
John     95
Ringo    17
Name: beattles songs by author, dtype: int64

<bound method Series.unique of John      34
John      95
George    34
Ringo     17
Name: beattles songs by author, dtype: int64>



In [14]:
# na / None handling
beattles_songs_by_author = pd.Series([45,95,34,None], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')

#drop na's
print(beattles_songs_by_author.dropna())
print()

#replace na's with default
print('NA replaced by negative one')
print(beattles_songs_by_author.fillna(0))
print()

#get nulls as a bool array for filtering
print('is null')
print(beattles_songs_by_author.isnull())

John      45.0
Paul      95.0
George    34.0
Name: beattles songs by author, dtype: float64

NA replaced by negative one
John      45.0
Paul      95.0
George    34.0
Ringo      0.0
Name: beattles songs by author, dtype: float64

is null
John      False
Paul      False
George    False
Ringo      True
Name: beattles songs by author, dtype: bool



In [18]:
#sorting
beattles_songs_by_author = pd.Series([34,95,45,None], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')

print(beattles_songs_by_author.sort_index(ascending=True))
print()
print(beattles_songs_by_author.sort_values(ascending=False))



George    45.0
John      34.0
Paul      95.0
Ringo      NaN
Name: beattles songs by author, dtype: float64

Paul      95.0
George    45.0
John      34.0
Ringo      NaN
Name: beattles songs by author, dtype: float64


In [26]:
#serializing to and from csv
beattles_songs_by_author = pd.Series([34,95,45,None], index= ['John', 'Paul', 'George', 'Ringo'], name='beattles songs by author')

with open('beattles.csv', 'w') as fout:
	beattles_songs_by_author.to_csv(fout, header=True, index_label='Name')
	
#we usually read in as a data frame not a Series
with open('beattles.csv', 'r') as fin:
    df = pd.read_csv(fin, index_col=0)
print(df)

        beattles songs by author
Name                            
John                        34.0
Paul                        95.0
George                      45.0
Ringo                        NaN
