---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._

---

# The Series Data Structure

In [0]:
import pandas as pd
# IPython help syntax: a trailing `?` displays the documentation for pd.Series.
pd.Series?

In [0]:
# A list of strings becomes a Series; pandas supplies a default 0-based integer index.
animals = ["Tiger", "Bear", "Moose"]
pd.Series(data=animals)

0    Tiger
1     Bear
2    Moose
dtype: object

In [0]:
# A list of whole numbers is stored with an integer dtype (int64 in the recorded output).
numbers = [1, 2, 3]
pd.Series(data=numbers)

0    1
1    2
2    3
dtype: int64

In [0]:
# In a list of strings, a missing entry given as None is kept as None.
animals = ["Tiger", "Bear", None]
pd.Series(data=animals)

0    Tiger
1     Bear
2     None
dtype: object

In [0]:
# In a numeric list, None is converted to NaN, which makes the whole Series floating point.
numbers = [1, 2, None]
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [0]:
import numpy as np
# NaN and None are different things: comparing them for equality evaluates to False.
np.nan == None

False

In [0]:
# NaN does not compare equal even to itself, so this is False as well.
np.nan == np.nan

False

In [0]:
# Because equality cannot detect NaN, test for it with np.isnan() instead.
np.isnan(np.nan)

True

In [0]:
# Building a Series from a dict: the keys become index labels, the values become the data.
sports = {
    "Archery": "Bhutan",
    "Golf": "Scotland",
    "Sumo": "Japan",
    "Taekwondo": "South Korea",
}
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [0]:
# The index attribute holds the labels of the Series.
s.index

Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

In [0]:
# Labels can be passed separately through `index`; each label is paired with the
# value at the same position in the data list.
s = pd.Series(
    data=["Tiger", "Bear", "Moose"],
    index=["India", "America", "Canada"],
)
s

India      Tiger
America     Bear
Canada     Moose
dtype: object

In [0]:
# When a dict is combined with an explicit index, the index decides what is kept:
# dict keys that are not listed ('Archery', 'Taekwondo') are left out, and a listed
# label with no matching key ('Hockey') receives a missing value.
sports = {
    "Archery": "Bhutan",
    "Golf": "Scotland",
    "Sumo": "Japan",
    "Taekwondo": "South Korea",
}
s = pd.Series(data=sports, index=["Golf", "Sumo", "Hockey"])
s

Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

# Querying a Series

In [0]:
# Rebuild the sport -> country Series that the following lookups query.
sports = {
    "Archery": "Bhutan",
    "Golf": "Scotland",
    "Sumo": "Japan",
    "Taekwondo": "South Korea",
}
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [0]:
# .iloc selects by integer position; position 3 is the fourth entry.
s.iloc[3]

'South Korea'

In [0]:
# .loc selects by index label.
s.loc['Golf']

'Scotland'

In [0]:
# Positional lookup belongs to .iloc. The bare indexing operator used to fall back to
# positions when handed an integer on a non-integer index (written as s[3]), but that
# fallback is deprecated in recent pandas releases, so the explicit accessor is used.
s.iloc[3]

'South Korea'

In [0]:
# With a string key the indexing operator performs a label lookup, like s.loc['Golf'].
s['Golf']

'Scotland'

In [0]:
# Same countries, but keyed by integers, so the resulting index labels are integers.
sports = {
    99: "Bhutan",
    100: "Scotland",
    101: "Japan",
    102: "South Korea",
}
s = pd.Series(data=sports)

In [0]:
# With an integer index, s[0] is a label lookup rather than s.iloc[0]. No label 0
# exists here, so it raises KeyError. The error is caught and printed so that the
# demonstration does not stop a top-to-bottom run of the notebook.
try:
    s[0]
except KeyError as err:
    print('KeyError:', err)

KeyError: ignored

In [0]:
# A small float Series used by the summation examples.
s = pd.Series(data=[100.0, 120.0, 101.0, 3.0])
s

0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [0]:
# Sum the Series the slow way: visit every element in a plain Python loop.
total = 0
for item in s:
    total = total + item
print(total)

324.0


In [0]:
import numpy as np

# Vectorised summation: a single NumPy call over the whole Series, with no Python-level loop.
total = np.sum(s)
print(total)

324.0


In [0]:
# Build a large Series: 10,000 random integers, each at least 0 and below 1000.
s = pd.Series(np.random.randint(low=0, high=1000, size=10000))
s.head()

0    444
1    923
2    166
3    266
4    465
dtype: int64

In [0]:
# Check how many elements the Series holds.
len(s)

10000

In [0]:
%%timeit -n 100
# Timed: the pure-Python approach, iterating over the Series and keeping a running total.
summary = 0
for item in s:
    summary+=item

100 loops, best of 3: 1.25 ms per loop


In [0]:
%%timeit -n 100
# Timed: the vectorised approach, one np.sum call over the same Series.
summary = np.sum(s)

100 loops, best of 3: 164 µs per loop


In [0]:
# Vectorised update: one statement changes every element, with no explicit Python loop.
s+=2 #adds two to each item in s using broadcasting
s.head()

0    446
1    925
2    168
3    268
4    467
dtype: int64

In [0]:
# Element-by-element version of adding 2 to every entry, for comparison with the
# vectorised form. Series.items() yields (label, value) pairs; it replaces
# Series.iteritems(), which was deprecated and then removed from pandas.
for label, value in s.items():
    # .at reads or writes a single value identified by its label.
    s.at[label] = value+2
s.head()

0    448
1    927
2    170
3    270
4    469
dtype: int64

In [0]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.loc[label]= value+2

10 loops, best of 3: 847 ms per loop


In [0]:
%%timeit -n 10
# Timed: the same Series construction followed by a single vectorised in-place add.
s = pd.Series(np.random.randint(0,1000,10000))
s+=2


10 loops, best of 3: 492 µs per loop


In [0]:
# Assigning through .loc to a label that does not exist yet adds a new entry.
# Mixing integers with a string value gives a Series that is displayed with dtype object.
s = pd.Series(data=[1, 2, 3])
s.loc["Animal"] = "Bears"
s

0             1
1             2
2             3
Animal    Bears
dtype: object

In [0]:
# Two Series to combine: one with unique labels and one whose labels are all 'Cricket'.
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'],
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
# pd.concat returns a new Series holding the entries of both inputs in order and
# leaves the inputs themselves untouched. It replaces Series.append, which was
# deprecated and later removed from pandas.
all_countries = pd.concat([original_sports, cricket_loving_countries])

In [0]:
# The first input still holds only its own four entries: combining did not modify it.
original_sports

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [0]:
# The second input is likewise unchanged.
cricket_loving_countries

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [0]:
# The combined Series lists the first input's entries, then the second's; repeated labels are kept.
all_countries

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

In [0]:
# A label that occurs several times returns all matching entries as a Series, not a single value.
all_countries.loc['Cricket']

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

# The DataFrame Data Structure

In [0]:
import pandas as pd
# Each purchase is a labelled Series; the labels line up as the DataFrame's columns.
purchase_1 = pd.Series(data={"Name": "Chris", "Item Purchased": "Dog Food", "Cost": 22.50})
purchase_2 = pd.Series(data={"Name": "Kevyn", "Item Purchased": "Kitty Litter", "Cost": 2.50})
purchase_3 = pd.Series(data={"Name": "Vinod", "Item Purchased": "Bird Seed", "Cost": 5.00})

# One row per Series. `index` supplies the row labels, and a label may repeat ('Store 1').
df = pd.DataFrame(
    data=[purchase_1, purchase_2, purchase_3],
    index=["Store 1", "Store 1", "Store 2"],
)
df.head()

In [0]:
# Select by row label; only one row is labelled 'Store 2'.
df.loc['Store 2']

In [0]:
# Inspect what kind of object the single-row selection is.
type(df.loc['Store 2'])

In [0]:
# 'Store 1' labels two rows, so this selection returns both of them.
df.loc['Store 1']

In [0]:
# Row label plus column name: the 'Cost' entries of the 'Store 1' rows.
df.loc['Store 1', 'Cost']

In [0]:
# .T is the transpose: rows become columns and columns become rows.
df.T

In [0]:
# After transposing, 'Cost' is a row label, so .loc can select it.
df.T.loc['Cost']

In [0]:
# On a DataFrame the indexing operator selects a column by name.
df['Cost']

In [0]:
# Chained indexing: select the 'Store 1' rows first, then the 'Cost' column of that result.
# NOTE(review): df.loc['Store 1', 'Cost'] expresses the same selection in a single step and
# is the form to prefer outside of a demonstration.
df.loc['Store 1']['Cost']

In [0]:
# ':' selects every row; the list restricts the result to the 'Name' and 'Cost' columns.
df.loc[:,['Name', 'Cost']]

In [0]:
# drop() returns a frame without the 'Store 1' rows; the result is displayed but not assigned.
df.drop('Store 1')

In [0]:
# Redisplay df to check its contents after the unassigned drop.
df

In [0]:
# Take a copy of df and remove the 'Store 1' rows from the copy only.
copy_df = df.copy().drop('Store 1')
copy_df

In [0]:
# IPython help syntax: display the documentation for the drop method.
copy_df.drop?

In [0]:
# `del` removes the 'Name' column from copy_df directly, without creating a new frame.
del copy_df['Name']
copy_df

In [0]:
# Assigning a scalar to a new column name adds that column, with the value repeated on every row.
df['Location'] = None
df

# DataFrame Indexing and Loading

In [0]:
# Pull the 'Cost' column out of df and keep it under its own name.
costs = df['Cost']
costs

In [0]:
# In-place add on the extracted column.
# NOTE(review): whether df also reflects this change depends on the pandas version
# (Copy-on-Write behaviour) — confirm against the version in use.
costs+=2
costs

In [0]:
# Redisplay df to see whether the in-place update of `costs` reached it.
df

In [0]:
# Shell escape: print the raw text of olympics.csv to inspect its layout before parsing it.
!cat olympics.csv

In [0]:
# Load the CSV with default settings and preview the first rows.
df = pd.read_csv('olympics.csv')
df.head()

In [0]:
# Load the file again, this time skipping its first line and using the first
# column as the row index.
df = pd.read_csv(
    'olympics.csv',
    index_col=0,
    skiprows=1,
)
df.head()

In [0]:
# List the column labels.
df.columns

In [0]:
# Give the coded column labels readable names. Labels starting with '01', '02' or
# '03' become 'Gold', 'Silver' or 'Bronze', and a leading '№' becomes '#'. Only the
# leading marker is replaced; whatever follows it is kept, so labels that differ
# only in their tail stay distinct. The mapping is collected first and applied with
# one non-mutating rename instead of calling rename(inplace=True) once per column.
medal_prefixes = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
renamed_columns = {}
for col in df.columns:
    if col[:2] in medal_prefixes:
        # col[4:] discards the first four characters of the old label.
        renamed_columns[col] = medal_prefixes[col[:2]] + col[4:]
    elif col[:1] == '№':
        renamed_columns[col] = '#' + col[1:]
df = df.rename(columns=renamed_columns)

df.head()

# Querying a DataFrame

In [0]:
# Element-wise comparison: the result is a boolean mask with one entry per row.
df['Gold'] > 0

In [0]:
# where() keeps the shape of df; rows where the mask is False are filled with NaN.
only_gold = df.where(df['Gold'] > 0)
only_gold.head()

In [0]:
# count() skips missing values, so masked-out rows are not counted.
only_gold['Gold'].count()

In [0]:
# For comparison: the same count taken on the unmasked frame.
df['Gold'].count()

In [0]:
# dropna() removes every row that contains a missing value.
only_gold = only_gold.dropna()
only_gold.head()

In [0]:
# Boolean indexing: pass the mask to the indexing operator to keep only rows where it is True.
only_gold = df[df['Gold'] > 0]
only_gold.head()

In [0]:
# '|' is element-wise OR; each comparison is wrapped in its own parentheses.
# Counts the rows with a positive 'Gold' or a positive 'Gold.1'.
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])

In [0]:
# '&' is element-wise AND: rows with a positive 'Gold.1' whose 'Gold' equals 0.
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]

# Indexing DataFrames

In [0]:
# Preview the frame before its index is changed.
df.head()

In [0]:
# Copy the current index into a 'country' column first, because set_index replaces
# the existing index; then make the 'Gold' column the new index.
df['country'] = df.index
df = df.set_index('Gold')
df.head()

In [0]:
# reset_index() moves the current index back into a column and restores a default integer index.
df = df.reset_index()
df.head()

In [0]:
# Load census.csv and preview the first rows.
df = pd.read_csv('census.csv')
df.head()

In [0]:
# List the distinct values that occur in the SUMLEV column.
df['SUMLEV'].unique()

In [0]:
# Keep only the rows whose SUMLEV equals 50.
df=df[df['SUMLEV'] == 50]
df.head()

In [0]:
# Narrow the frame to the two name columns plus the yearly births and
# population-estimate columns for 2010 through 2015.
columns_to_keep = [
    'STNAME', 'CTYNAME',
    'BIRTHS2010', 'BIRTHS2011', 'BIRTHS2012',
    'BIRTHS2013', 'BIRTHS2014', 'BIRTHS2015',
    'POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
    'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015',
]
df = df[columns_to_keep]
df.head()

In [0]:
# Passing a list of columns builds a two-level index: STNAME is the outer level, CTYNAME the inner.
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()

In [0]:
# With a two-level index, supply one label per level to select a single row.
df.loc['Michigan', 'Washtenaw County']

In [0]:
# A list of (STNAME, CTYNAME) tuples selects several rows at once.
df.loc[ [('Michigan', 'Washtenaw County'),
         ('Michigan', 'Wayne County')] ]

# Missing Values

In [0]:
# Load log.csv and display it.
df = pd.read_csv('log.csv')
df

In [0]:
# IPython help syntax: display the documentation for the fillna method.
df.fillna?

In [0]:
# Index the log by its 'time' column and put the rows in index order.
df = df.set_index('time').sort_index()
df

In [0]:
# Turn the index back into a column, then index on the ('time', 'user') pair instead.
df = df.reset_index().set_index(['time', 'user'])
df

In [0]:
# Forward-fill: each missing value takes the most recent earlier non-missing value
# in its column. DataFrame.ffill() replaces fillna(method='ffill'), whose `method`
# argument is deprecated in current pandas.
df = df.ffill()
df.head()