##### Think of a dataframe as a spreadsheet
* Import numpy, pandas
* Import Series and DataFrame from Pandas
* Means don't need to use pd.Series and pd.DataFrame
* just use Series and DataFrame on their own

In [1]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

#####Create DataFrame

In [2]:
# Create a dframe of random numbers
# 10 rows by 5 columns
dframe1 = DataFrame(np.random.randn(10, 5))
#Show
dframe1

Unnamed: 0,0,1,2,3,4
0,1.917697,2.309277,1.14645,0.848784,-1.796298
1,1.587544,-0.97838,-0.719227,-0.104783,0.896393
2,-0.901963,0.476837,-2.160731,-0.772172,-0.862739
3,1.60142,0.205594,0.834811,-0.04722,0.206579
4,0.896741,0.551626,-0.446384,0.519411,-1.563575
5,1.340887,0.676124,1.389039,0.690302,-0.672861
6,0.211048,0.838083,-0.988605,-0.465525,0.7786
7,-0.261779,0.543677,-0.412117,-0.040209,2.010473
8,-0.379533,-0.401494,-0.787742,0.844011,0.667548
9,-1.839553,-0.845549,-1.640436,1.114539,-0.843676


In [3]:
# Create a dframe of random numbers
# 10 rows by 5 columns
# With column names A,B,C,D,E
dframe2 = DataFrame(np.random.randn(10, 5),
                    columns=['A', 'B', 'C', 'D', 'E'])
#Show
dframe2

Unnamed: 0,A,B,C,D,E
0,1.098191,-0.275606,0.742009,0.675661,-0.671022
1,0.464326,-0.901078,-1.223377,-0.2649,-0.700515
2,0.399417,1.505876,0.770794,0.38614,-1.036665
3,1.83366,1.926181,1.450215,-0.471642,-1.263143
4,-0.690456,0.525694,-2.926841,0.351934,0.772354
5,0.469535,1.741975,-0.27113,0.884625,0.829159
6,0.547092,0.923668,0.474917,0.413519,-0.224845
7,0.271775,0.716133,0.905484,1.794388,0.302953
8,0.979633,0.316172,-0.421535,-0.31754,-0.266619
9,1.113527,-0.952721,-0.547171,0.259548,1.879415


In [4]:
# Create a dframe with 4 random numbers
# Reshape 2 by 2
# Add column name
# Add index
dframe3 = DataFrame(np.arange(4).reshape(2,2),
                    columns=list('AB'),index=['NYC','LA'])

#Show
dframe3

Unnamed: 0,A,B
NYC,0,1
LA,2,3


In [5]:
# Or use
dframe4 = DataFrame(np.arange(25).reshape(5,5),
                    columns=['A','B','C','D','E'],
                    index=['NYC','LA','SF','CHI','LON'])

#Show
dframe4

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
CHI,15,16,17,18,19
LON,20,21,22,23,24


In [6]:
# Create a dataframe from an array
arr0001 = np.array([[1,2,np.nan],
                    [np.nan,3,4]])
# Show
arr0001

array([[  1.,   2.,  nan],
       [ nan,   3.,   4.]])

In [7]:
# Create data frame
# Pass the matrix arr just created
# Show data frame
dframe0001 = DataFrame(arr0001,
                    index=['A','B'],
                    columns=['One','Two','Three'])
dframe0001

Unnamed: 0,One,Two,Three
A,1.0,2,
B,,3,4.0


In [8]:
# Creating a DataFrame by passing a dict of objects that can be converted to series-like.
dframe01 = pd.DataFrame({'Test #' : 1,
                    'Country' : pd.Series(['USSR','Germany','China','Japan']),
                    'Population' : pd.Series([8700000,4300000,3000000,2100000]),
                    'Rank' : np.array([1,2,3,4],dtype='int32'),
                    'Capital' : pd.Categorical(["Moscow","Berlin","Beijing","Tokyo"]),
                    'Analyst' : 'Manager01' })
#Show
dframe01

Unnamed: 0,Analyst,Capital,Country,Population,Rank,Test #
0,Manager01,Moscow,USSR,8700000,1,1
1,Manager01,Berlin,Germany,4300000,2,1
2,Manager01,Beijing,China,3000000,3,1
3,Manager01,Tokyo,Japan,2100000,4,1


In [9]:
#Show data types
dframe01.dtypes

Analyst         object
Capital       category
Country         object
Population       int64
Rank             int32
Test #           int64
dtype: object

In [10]:
# Creating a DataFrame by passing a dict of objects that can be converted to series-like.
dframe02 = pd.DataFrame({'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
#Show
dframe02

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1,3,test,foo
1,1,2013-01-02,1,3,train,foo
2,1,2013-01-02,1,3,test,foo
3,1,2013-01-02,1,3,train,foo


#####Play With NFL Data

In [11]:
#Let's get some data to play with. How about the NFL?
import webbrowser
website = 'http://en.wikipedia.org/wiki/NFL_win-loss_records'
webbrowser.open(website)

True

In [18]:
#Copy first 5 ranked teams and read to get data
nfl_frame = pd.read_clipboard()

In [32]:
#Show
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


#####Working with DataFrames

In [33]:
# Grab the column names with .columns
nfl_frame.columns

Index([u'Rank', u'Team', u'Won', u'Lost', u'Tied*', u'Pct.', u'First Season',
       u'Total Games', u'Conference'],
      dtype='object')

In [35]:
# Retrieve individual columns
nfl_frame.Team

0       Dallas Cowboys
1        Chicago Bears
2    Green Bay Packers
3       Miami Dolphins
4     Baltimore Ravens
Name: Team, dtype: object

In [24]:
# Or use
DataFrame(nfl_frame,columns=['Team'])

Unnamed: 0,Team
0,Dallas Cowboys
1,Chicago Bears
2,Green Bay Packers
3,Miami Dolphins
4,Baltimore Ravens


In [36]:
# Grab values from multi-worded column name
nfl_frame['First Season']

0    1960
1    1920
2    1921
3    1966
4    1996
Name: First Season, dtype: int64

In [37]:
# Or use
DataFrame(nfl_frame,columns=['First Season'])

Unnamed: 0,First Season
0,1960
1,1920
2,1921
3,1966
4,1996


In [38]:
# Look at some specific data columns
DataFrame(nfl_frame,columns=['Team','First Season','Total Games'])

Unnamed: 0,Team,First Season,Total Games
0,Dallas Cowboys,1960,894
1,Chicago Bears,1920,1357
2,Green Bay Packers,1921,1339
3,Miami Dolphins,1966,792
4,Baltimore Ravens,1996,326


In [39]:
# Retrieve TOP few rows
# Default returns first five rows
nfl_frame.head

<bound method DataFrame.head of    Rank               Team  Won  Lost  Tied*   Pct.  First Season  \
0     1     Dallas Cowboys  510   378      6  0.574          1960   
1     2      Chicago Bears  752   563     42  0.570          1920   
2     3  Green Bay Packers  741   561     37  0.567          1921   
3     4     Miami Dolphins  443   345      4  0.562          1966   
4     5   Baltimore Ravens  182   143      1  0.560          1996   

   Total Games Conference  
0          894   NFC East  
1         1357  NFC North  
2         1339  NFC North  
3          792   AFC East  
4          326  AFC North  >

In [40]:
# Retrieve TOP three rows
nfl_frame.head(3)

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North


In [41]:
# Retrieve LAST three rows
nfl_frame.tail(3)

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [42]:
# Return all values for third row (INDEX 3)
# Use ix

nfl_frame.ix[3]

Rank                         4
Team            Miami Dolphins
Won                        443
Lost                       345
Tied*                        4
Pct.                     0.562
First Season              1966
Total Games                792
Conference            AFC East
Name: 3, dtype: object

In [43]:
# Grab multiple columns
# This creates a new data frame
# Stadium does not exist in the data frame so returns Nan
DataFrame(nfl_frame,
          columns=['Team','First Season','Total Games','Stadium'])

Unnamed: 0,Team,First Season,Total Games,Stadium
0,Dallas Cowboys,1960,894,
1,Chicago Bears,1920,1357,
2,Green Bay Packers,1921,1339,
3,Miami Dolphins,1966,792,
4,Baltimore Ravens,1996,326,


In [45]:
# We can also assign values to entire columns
# Careful with the ' here
nfl_frame['Stadium']="Levi's Stadium"
#Show
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East,Levi's Stadium
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,Levi's Stadium
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,Levi's Stadium
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,Levi's Stadium
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,Levi's Stadium


In [46]:
#Putting numbers for stadiums
nfl_frame["Stadium"] = np.arange(5)

#Show
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East,0
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,1
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,2
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,3
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,4


In [47]:
#Adding a Series to a DataFrame
stadiums = Series(["Levi's Stadium","AT&T Stadium"],index=[4,0])
#Show
stadiums

4    Levi's Stadium
0      AT&T Stadium
dtype: object

In [48]:
#Now input into the nfl DataFrame
nfl_frame['Stadium']=stadiums

#Show
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East,AT&T Stadium
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,Levi's Stadium


In [49]:
#We can also delete columns
del nfl_frame['Stadium']

nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [50]:
#DataFrames can be constructed many ways. Another way is from a dictionary of equal length lists
data = {'City':['SF','LA','NYC'],
        'Population':[837000,3880000,8400000]}

city_frame = DataFrame(data)

#Show
city_frame

Unnamed: 0,City,Population
0,SF,837000
1,LA,3880000
2,NYC,8400000


In [51]:
#For full list of ways to create DataFrames from various sources go to teh documentation for pandas:
website = 'http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.html'
webbrowser.open(website)

True