##### Think of a dataframe as a spreadsheet
* Import numpy, pandas
* Import Series and DataFrame from Pandas
* Means don't need to use pd.Series and pd.DataFrame
* just use Series and DataFrame on their own

In [16]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

#####Create DataFrame

In [30]:
# Create a dframe of random numbers
# 10 rows by 5 columns
dframe1 = DataFrame(np.random.randn(10, 5))
#Show
dframe1

Unnamed: 0,0,1,2,3,4
0,-0.756281,-0.406234,0.771813,-0.260713,0.609076
1,0.15064,0.318368,0.833417,2.651111,0.140913
2,-0.80881,1.167852,-0.239902,0.346369,-1.375619
3,-1.117911,-2.00312,-1.278466,0.22197,0.205984
4,-0.195005,-0.234844,0.807946,1.22465,1.653306
5,0.297969,-0.211083,-1.466187,0.402225,0.615775
6,0.667099,0.134969,0.385097,2.595509,-0.498207
7,-1.10677,-0.947562,1.013593,0.637456,0.542968
8,-1.197276,0.958619,-0.597127,-0.859344,2.431903
9,-0.950228,-0.862622,-0.219051,-1.332754,1.539727


In [31]:
# Create a dframe of random numbers
# 10 rows by 5 columns
# With column names A,B,C,D,E
dframe2 = DataFrame(np.random.randn(10, 5),
                    columns=['A', 'B', 'C', 'D', 'E'])
#Show
dframe2

Unnamed: 0,A,B,C,D,E
0,1.765001,0.189032,-0.352124,-0.098067,-0.28431
1,-1.385644,0.628176,1.046881,-1.283287,0.634779
2,-1.340478,-1.042355,-0.786823,0.806554,0.580185
3,1.628103,0.246677,0.6225,0.36866,2.731107
4,-0.718924,0.001851,-0.485846,0.09567,-0.916543
5,-0.824483,-1.723289,1.195611,-0.11578,-0.126033
6,0.379594,-0.076666,2.07337,1.434528,-1.038052
7,-0.124926,0.500767,-0.806244,-0.362083,0.437959
8,0.576185,-1.257452,-0.103016,-0.278564,-0.520243
9,1.038002,-1.552109,1.186445,-0.082395,0.051788


In [53]:
# Create a dframe with 4 random numbers
# Reshape 2 by 2
# Add column name
# Add index
dframe3 = DataFrame(np.arange(4).reshape(2,2),
                    columns=list('AB'),index=['NYC','LA'])

#Show
dframe3

Unnamed: 0,A,B
NYC,0,1
LA,2,3


In [57]:
# Or use
dframe4 = DataFrame(np.arange(25).reshape(5,5),
                    columns=['A','B','C','D','E'],
                    index=['NYC','LA','SF','CHI','LON'])

#Show
dframe4

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
CHI,15,16,17,18,19
LON,20,21,22,23,24


In [17]:
#Let's get some data to play with. How about the NFL?
import webbrowser
website = 'http://en.wikipedia.org/wiki/NFL_win-loss_records'
webbrowser.open(website)

True

In [18]:
#Copy first 5 ranked teams and read to get data
nfl_frame = pd.read_clipboard()

In [32]:
#Show
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


#####Working with DataFrames

In [33]:
# Grab the column names with .columns
nfl_frame.columns

Index([u'Rank', u'Team', u'Won', u'Lost', u'Tied*', u'Pct.', u'First Season',
       u'Total Games', u'Conference'],
      dtype='object')

In [35]:
# Retrieve individual columns
nfl_frame.Team

0       Dallas Cowboys
1        Chicago Bears
2    Green Bay Packers
3       Miami Dolphins
4     Baltimore Ravens
Name: Team, dtype: object

In [24]:
# Or use
DataFrame(nfl_frame,columns=['Team'])

Unnamed: 0,Team
0,Dallas Cowboys
1,Chicago Bears
2,Green Bay Packers
3,Miami Dolphins
4,Baltimore Ravens


In [36]:
# Grab values from multi-worded column name
nfl_frame['First Season']

0    1960
1    1920
2    1921
3    1966
4    1996
Name: First Season, dtype: int64

In [37]:
# Or use
DataFrame(nfl_frame,columns=['First Season'])

Unnamed: 0,First Season
0,1960
1,1920
2,1921
3,1966
4,1996


In [38]:
# Look at some specific data columns
DataFrame(nfl_frame,columns=['Team','First Season','Total Games'])

Unnamed: 0,Team,First Season,Total Games
0,Dallas Cowboys,1960,894
1,Chicago Bears,1920,1357
2,Green Bay Packers,1921,1339
3,Miami Dolphins,1966,792
4,Baltimore Ravens,1996,326


In [39]:
# Retrieve TOP few rows
# Default returns first five rows
nfl_frame.head

<bound method DataFrame.head of    Rank               Team  Won  Lost  Tied*   Pct.  First Season  \
0     1     Dallas Cowboys  510   378      6  0.574          1960   
1     2      Chicago Bears  752   563     42  0.570          1920   
2     3  Green Bay Packers  741   561     37  0.567          1921   
3     4     Miami Dolphins  443   345      4  0.562          1966   
4     5   Baltimore Ravens  182   143      1  0.560          1996   

   Total Games Conference  
0          894   NFC East  
1         1357  NFC North  
2         1339  NFC North  
3          792   AFC East  
4          326  AFC North  >

In [40]:
# Retrieve TOP three rows
nfl_frame.head(3)

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North


In [41]:
# Retrieve LAST three rows
nfl_frame.tail(3)

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [42]:
# Return all values for third row (INDEX 3)
# Use ix

nfl_frame.ix[3]

Rank                         4
Team            Miami Dolphins
Won                        443
Lost                       345
Tied*                        4
Pct.                     0.562
First Season              1966
Total Games                792
Conference            AFC East
Name: 3, dtype: object

In [43]:
# Grab multiple columns
# This creates a new data frame
# Stadium does not exist in the data frame so returns Nan
DataFrame(nfl_frame,
          columns=['Team','First Season','Total Games','Stadium'])

Unnamed: 0,Team,First Season,Total Games,Stadium
0,Dallas Cowboys,1960,894,
1,Chicago Bears,1920,1357,
2,Green Bay Packers,1921,1339,
3,Miami Dolphins,1966,792,
4,Baltimore Ravens,1996,326,


In [45]:
# We can also assign values to entire columns
# Careful with the ' here
nfl_frame['Stadium']="Levi's Stadium"
#Show
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East,Levi's Stadium
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,Levi's Stadium
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,Levi's Stadium
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,Levi's Stadium
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,Levi's Stadium


In [46]:
#Putting numbers for stadiums
nfl_frame["Stadium"] = np.arange(5)

#Show
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East,0
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,1
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,2
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,3
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,4


In [47]:
#Adding a Series to a DataFrame
stadiums = Series(["Levi's Stadium","AT&T Stadium"],index=[4,0])
#Show
stadiums

4    Levi's Stadium
0      AT&T Stadium
dtype: object

In [48]:
#Now input into the nfl DataFrame
nfl_frame['Stadium']=stadiums

#Show
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East,AT&T Stadium
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,Levi's Stadium


In [49]:
#We can also delete columns
del nfl_frame['Stadium']

nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
0,1,Dallas Cowboys,510,378,6,0.574,1960,894,NFC East
1,2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North
2,3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North
3,4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East
4,5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [50]:
#DataFrames can be constructed many ways. Another way is from a dictionary of equal length lists
data = {'City':['SF','LA','NYC'],
        'Population':[837000,3880000,8400000]}

city_frame = DataFrame(data)

#Show
city_frame

Unnamed: 0,City,Population
0,SF,837000
1,LA,3880000
2,NYC,8400000


In [51]:
#For full list of ways to create DataFrames from various sources go to teh documentation for pandas:
website = 'http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.html'
webbrowser.open(website)

True