### Lecture 14: Pandas - Series

In [1]:
# values, index, how to create a Series, how to get the value of one element in a Series

# Operations on a Series --- find items greater than a value; check whether an item exists

# convert dictionary into a Series

In [4]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# remember the uppercase

In [5]:
# Build a Series from a plain list; no index is given, so the default
# integer labels 0..3 are used.
S1 = Series(data=[100, 200, 300, 400])
S1

0    100
1    200
2    300
3    400
dtype: int64

In [6]:
# A Series with explicit string labels instead of the default integer index.
S2 = Series(
    data=[1000000, 2000000, 3000000, 4000000],
    index=['Sophie', 'Lucas', 'Mom', 'Dad'],
)
S2

Sophie    1000000
Lucas     2000000
Mom       3000000
Dad       4000000
dtype: int64

In [10]:
# the attribute is `values` (plural); `value` is wrong
# however, it is `index` (singular); `indexes` is wrong
S2.values

array([1000000, 2000000, 3000000, 4000000], dtype=int64)

In [12]:
S2.index

Index(['Sophie', 'Lucas', 'Mom', 'Dad'], dtype='object')

In [13]:
S2['Sophie']

1000000

In [14]:
# Operations on a Series --- find items greater than a value; check whether an item exists
'Sohpie' in S2
# False: the label is misspelled here ('Sohpie' instead of 'Sophie')

False

In [15]:
'Sophie' in S2.index

True

In [16]:
# `in` on a Series tests the index labels, not the values, so an existing value still gives False
2000000 in S2

False

In [17]:
S2[S2 > 2000000]

Mom    3000000
Dad    4000000
dtype: int64

In [18]:
'Sophie' in S2

True

In [19]:
# convert to dictionary from series
ww2_dict = S2.to_dict()
ww2_dict
# note: the dictionary is displayed with its keys in alphabetical order, not in the Series' order

{'Dad': 4000000, 'Lucas': 2000000, 'Mom': 3000000, 'Sophie': 1000000}

In [20]:
# convert to series from dictionary
ww2_series = Series(ww2_dict)
ww2_series

Dad       4000000
Lucas     2000000
Mom       3000000
Sophie    1000000
dtype: int64

In [22]:
# NOTE(review): `obj` is not defined anywhere in the visible notebook, so this cell raises NameError.
# Presumably a leftover experiment (perhaps type(ww2_series) was intended) --- confirm, then remove or fix.
obj(ww2_series)

NameError: name 'obj' is not defined

In [23]:
fam = ["Dad", "Mom", "Lucas", "Sophie", "Linus"]

In [24]:
# Re-index ww2_series with the labels in fam: existing labels keep their value,
# the unknown label 'Linus' becomes NaN (which is why the dtype turns into float64).
fam_series = Series(ww2_series, index=fam)
fam_series

Dad       4000000
Mom       3000000
Lucas     2000000
Sophie    1000000
Linus         NaN
dtype: float64

In [25]:
# What if the source Series has labels that are missing from the new index?
# 'Dad', 'Mom' and 'Linus' are not in fam2, while 'EZ' and 'MC' are new labels.
fam2 = ["EZ", "MC", "Lucas", "Sophie"]
fam_series2 = Series(fam_series, index=fam2)
fam_series2

EZ            NaN
MC            NaN
Lucas     2000000
Sophie    1000000
dtype: float64

In [26]:
# so labels of the source series that are not in the new index are dropped; labels that exist only in the new index get NaN

In [27]:
pd.isnull(fam_series2)

EZ         True
MC         True
Lucas     False
Sophie    False
dtype: bool

In [28]:
pd.notnull(fam_series2)

EZ        False
MC        False
Lucas      True
Sophie     True
dtype: bool

In [29]:
# pandas has no `anynull` function; chain isnull() with any() to ask
# whether at least one entry of the Series is missing.
pd.isnull(fam_series2).any()

True

In [30]:
# Series + Series aligns on the index labels; NaN + X = NaN, and a label
# that exists in only one of the two Series also yields NaN
fam_series + fam_series2

Dad           NaN
EZ            NaN
Linus         NaN
Lucas     4000000
MC            NaN
Mom           NaN
Sophie    2000000
dtype: float64

In [34]:
# build fam_series3 from fam_series2 and keep a boolean mask of its missing entries
fam_series3 = Series(fam_series2)
condition = pd.isnull(fam_series3)
condition

EZ         True
MC         True
Lucas     False
Sophie    False
dtype: bool

In [37]:
fam_series3.values

array([      nan,       nan,  2000000.,  1000000.])

In [44]:
# passing the shape [1, 4] gives a 2-D array with one row of four zeros
zero_A = np.zeros([1,4])
zero_A

array([[ 0.,  0.,  0.,  0.]])

In [47]:
# replace missing entries by hand: walk zeros, values and the null mask in parallel
# and take the zero wherever the mask is True, otherwise keep the original value
zeros = np.zeros(4)
fam3_val = fam_series3.values
arr_val = [z_item if cond else fam3_item for z_item, fam3_item, cond in zip(zeros, fam3_val, condition ) ]
arr_val

[0.0, 0.0, 2000000.0, 1000000.0]

In [49]:
# same replacement, with the three inputs written inline inside zip()
# NOTE(review): zeros and fam3_val are assigned here but not used by the comprehension below
zeros = np.zeros(4)
fam3_val = fam_series3.values
arr_val = [z_item if cond else fam3_item for z_item, fam3_item, cond in zip(np.zeros(4), fam_series3.values, pd.isnull(fam_series3) ) ]
arr_val

[0.0, 0.0, 2000000.0, 1000000.0]

In [50]:
# try to see if a Series can be changed with values in this way
# (it cannot: assigning to .values raises AttributeError, so fam_series3 is left unchanged)
zeros = np.zeros(4)
fam3_val = fam_series3.values
fam_series3.values = [z_item if cond else fam3_item for z_item, fam3_item, cond in zip(np.zeros(4), fam_series3.values, pd.isnull(fam_series3) ) ]
fam_series3

AttributeError: can't set attribute

In [51]:
# Series.values cannot be assigned to, so build a new Series instead.
# fillna(0) swaps every NaN for 0 and keeps the index, which replaces the
# manual zip/list-comprehension used in the cells above.
fam_series3_init = fam_series3.fillna(0)
fam_series3_init

EZ              0
MC              0
Lucas     2000000
Sophie    1000000
dtype: float64

In [52]:
# replacing the NaN of fam_series3 with 0 does not change the sum: every label that
# exists in only one of the two Series (or is NaN in fam_series) still gives NaN
fam_series + fam_series3_init

Dad           NaN
EZ            NaN
Linus         NaN
Lucas     4000000
MC            NaN
Mom           NaN
Sophie    2000000
dtype: float64

### Lecture 15: Pandas - Data Frame


In [65]:
# learnings:
# import
# webbrowser
# read_clipboard() to create a DF
# DF.columns
# get a column of a DF; get multiple columns of a DF
# add a new column to a DF
# DF head, tail --- head(n), tail(n)
# DF index  --- ix(n)
# assign values to an entire column/add a new column
# passing values to column using array
# passing values to column using Series --- auto aligned with index
# delete column --- del DF_name['col_name']
# DF & Dictionary --- constructing DF using dictionary
# a full way of constructing DF --- http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.html

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [55]:
import webbrowser as wb

In [56]:
# open the Wikipedia page in the default web browser (wb.open returns True when a browser was launched);
# presumably the table is then copied by hand so the next cell can read it from the clipboard
website = 'http://en.wikipedia.org/wiki/NFL_win-loss_records'
wb.open(website)

True

In [57]:
# read_clipboard() parses whatever text is currently on the system clipboard into a DataFrame.
# NOTE(review): the result depends on what was copied beforehand, so this cell is not
# reproducible on a fresh Restart & Run All --- consider loading the table from a saved file.
nfl_frame = pd.read_clipboard()
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [59]:
nfl_frame.columns

Index(['Rank ', 'Team ', 'Won ', 'Lost ', 'Tied* ', 'Pct. ', 'First Season ',
       'Total Games ', 'Conference'],
      dtype='object')

In [71]:
# keep only three columns; the labels must include the trailing space shown by nfl_frame.columns
nfl_sub = DataFrame(nfl_frame, columns=['Rank ', 'Team ', 'First Season '])
nfl_sub

Unnamed: 0,Rank,Team,First Season
0,1,Dallas Cowboys,1960
1,2,Chicago Bears,1920
2,3,Green Bay Packers,1921
3,4,Miami Dolphins,1966
4,5,Baltimore Ravens,1996


In [68]:
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [73]:
# add a new column 'Stadium': a requested column that is not in nfl_frame comes back empty.
# The result is only displayed, not assigned, so nfl_frame itself is unchanged.
DataFrame(nfl_frame,columns=['Team ','First Season ','Total Games ','Stadium'])

Unnamed: 0,Team,First Season,Total Games,Stadium
0,Dallas Cowboys,1960,894,
1,Chicago Bears,1920,1357,
2,Green Bay Packers,1921,1339,
3,Miami Dolphins,1966,792,
4,Baltimore Ravens,1996,326,


In [72]:
nfl_frame['Team ']

0       Dallas Cowboys 
1        Chicago Bears 
2    Green Bay Packers 
3       Miami Dolphins 
4     Baltimore Ravens 
Name: Team , dtype: object

In [74]:
#passing values to a DF column 'Stadium'
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [75]:
# NOTE(review): "rank " is lowercase but the source column is 'Rank ', so it matches nothing
# and the rank column comes back empty (see the output); "Rank " was presumably intended.
# 'Stadium' is a new, empty column that is filled in by the following cells.
nfl_f2 = DataFrame(nfl_frame, columns=["rank ", "Team ", "Won ", "Stadium"])
nfl_f2

Unnamed: 0,rank,Team,Won,Stadium
0,,Dallas Cowboys,511,
1,,Chicago Bears,752,
2,,Green Bay Packers,741,
3,,Miami Dolphins,443,
4,,Baltimore Ravens,182,


In [76]:
# Fill the whole column with one scalar. Bracket assignment is used instead of
# attribute assignment (nfl_f2.Stadium = ...) because attribute assignment only
# targets a column when that column already exists and its name is a valid identifier.
nfl_f2["Stadium"] = "Nokia's Stadium"
nfl_f2

Unnamed: 0,rank,Team,Won,Stadium
0,,Dallas Cowboys,511,Nokia's Stadium
1,,Chicago Bears,752,Nokia's Stadium
2,,Green Bay Packers,741,Nokia's Stadium
3,,Miami Dolphins,443,Nokia's Stadium
4,,Baltimore Ravens,182,Nokia's Stadium


In [77]:
nfl_f2.index

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [78]:
nfl_f2.columns

Index(['rank ', 'Team ', 'Won ', 'Stadium'], dtype='object')

In [79]:
# change the column names --- the trailing spaces are removed
nfl_f2.columns=['rank', 'Team', 'Won', 'Stadium']
nfl_f2

Unnamed: 0,rank,Team,Won,Stadium
0,,Dallas Cowboys,511,Nokia's Stadium
1,,Chicago Bears,752,Nokia's Stadium
2,,Green Bay Packers,741,Nokia's Stadium
3,,Miami Dolphins,443,Nokia's Stadium
4,,Baltimore Ravens,182,Nokia's Stadium


In [80]:
nfl_f2.columns

Index(['rank', 'Team', 'Won', 'Stadium'], dtype='object')

In [81]:
# passing values to a column using Series
# NOTE(review): this re-binds S1, which held the numeric Series from Lecture 14;
# `stadium` is just a second name for the same Series and is not used in the cells shown here.
S1 = Series(["Dallas", "Chicago", "Unkown", "Miami", "Baltimore"], index=[0,1,2,3,4])
stadium = S1
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [82]:
nfl_f2

Unnamed: 0,rank,Team,Won,Stadium
0,,Dallas Cowboys,511,Nokia's Stadium
1,,Chicago Bears,752,Nokia's Stadium
2,,Green Bay Packers,741,Nokia's Stadium
3,,Miami Dolphins,443,Nokia's Stadium
4,,Baltimore Ravens,182,Nokia's Stadium


In [83]:
# important step: assigning a Series to the column replaces the scalar fill with the Series' values
nfl_f2["Stadium"] = S1
nfl_f2

Unnamed: 0,rank,Team,Won,Stadium
0,,Dallas Cowboys,511,Dallas
1,,Chicago Bears,752,Chicago
2,,Green Bay Packers,741,Unkown
3,,Miami Dolphins,443,Miami
4,,Baltimore Ravens,182,Baltimore


In [85]:
# Stadium names supplied with their labels in shuffled order (4, 1, 0, 3, 2).
S2 = Series(
    data=["Baltimore", "Chicago", "Dallas", "Miami", "San Hose"],
    index=[4, 1, 0, 3, 2],
)
S2

4    Baltimore
1      Chicago
0       Dallas
3        Miami
2     San Hose
dtype: object

In [87]:
# A column assigned from a Series is matched by index label, not by position:
# S2 lists Baltimore first, yet it lands in row 4 because its label is 4.
# NOTE: "Stdium" is a misspelling, so the first line creates an extra column (deleted in the next cell).
nfl_f2["Stdium"] = S2
nfl_f2["Stadium"] = S2
nfl_f2

Unnamed: 0,rank,Team,Won,Stadium,Stdium
0,,Dallas Cowboys,511,Dallas,Dallas
1,,Chicago Bears,752,Chicago,Chicago
2,,Green Bay Packers,741,San Hose,San Hose
3,,Miami Dolphins,443,Miami,Miami
4,,Baltimore Ravens,182,Baltimore,Baltimore


In [88]:
# delete column --- del DF_name['col_name']
del nfl_f2["Stdium"]
nfl_f2

Unnamed: 0,rank,Team,Won,Stadium
0,,Dallas Cowboys,511,Dallas
1,,Chicago Bears,752,Chicago
2,,Green Bay Packers,741,San Hose
3,,Miami Dolphins,443,Miami
4,,Baltimore Ravens,182,Baltimore


In [89]:
# converting between dictionary {key:[values]} and DF:
# every key becomes a column label and its list supplies that column's values
dic = dict(
    rank=[0, 1, 2],
    Team=["Chicago Bears", "Green Bay Packers", "Dallas Cowboys"],
    Score=[752, 741, 511],
)
nfl_f3 = DataFrame(data=dic)
nfl_f3

Unnamed: 0,Score,Team,rank
0,752,Chicago Bears,0
1,741,Green Bay Packers,1
2,511,Dallas Cowboys,2
