In [1]:
import pandas as pd

In [4]:
# Read the Fortune 1000 CSV, using the "Rank" column as the row index.
fortune = pd.read_csv("fortune1000.csv", index_col="Rank")
# Group the companies by their Sector column.
sectors = fortune.groupby(by="Sector")
# Preview the first three rows of the loaded DataFrame.
fortune.head(n=3)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000


In [7]:
# Grouping only makes sense on a column with repeated values: grouping on an
# all-unique column would produce one group per row.
# Both Industry and Sector repeat across companies; here we group on Sector.
# groupby() bundles the rows of each sector into a single GroupBy object.
sectors = fortune.groupby("Sector")
# Only the last expression of a cell is echoed, so the first type needs an
# explicit display() to be shown.
display(type(sectors))   # the GroupBy object -- aggregation methods are called on this
type(fortune)            # the original DataFrame, a separate object from the GroupBy

pandas.core.frame.DataFrame

In [8]:
# Peek at the first three companies of the original DataFrame.
fortune.head(n=3)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000


In [9]:
# Number of groups (distinct sectors) held by the sectors GroupBy object.
sectors.ngroups

21

In [10]:
# Cross-check: the number of distinct (non-missing) Sector values in the
# DataFrame equals the number of groups in the GroupBy object.
fortune["Sector"].nunique(dropna=True)

21

In [16]:
# size() is a GroupBy method: called on `sectors`, it returns the number of rows
# in each group.
# The result is ordered alphabetically by group label, not by count.
sectors.size()

Sector
Aerospace & Defense              20
Apparel                          15
Business Services                51
Chemicals                        30
Energy                          122
Engineering & Construction       26
Financials                      139
Food and Drug Stores             15
Food, Beverages & Tobacco        43
Health Care                      75
Hotels, Resturants & Leisure     25
Household Products               28
Industrials                      46
Materials                        43
Media                            25
Motor Vehicles & Parts           24
Retailing                        80
Technology                      102
Telecommunications               15
Transportation                   36
Wholesalers                      40
dtype: int64

In [14]:
# value_counts() is a Series method, so it is called on the Sector column itself
# rather than on the GroupBy object. It tallies the rows per sector and orders
# the result from the most frequent sector to the least frequent one.
fortune["Sector"].value_counts(sort=True, ascending=False)

Financials                      139
Energy                          122
Technology                      102
Retailing                        80
Health Care                      75
Business Services                51
Industrials                      46
Food, Beverages & Tobacco        43
Materials                        43
Wholesalers                      40
Transportation                   36
Chemicals                        30
Household Products               28
Engineering & Construction       26
Hotels, Resturants & Leisure     25
Media                            25
Motor Vehicles & Parts           24
Aerospace & Defense              20
Apparel                          15
Food and Drug Stores             15
Telecommunications               15
Name: Sector, dtype: int64

In [19]:
# first() / last() return, for every group, the first / last non-null value of
# each column -- when no data is missing that is simply the group's first / last row.
# They give a quick picture of what each group in the GroupBy object contains.
# Only the last expression of a cell is echoed, so first() needs an explicit
# display() to be shown.
display(sectors.first())
sectors.last()

Unnamed: 0_level_0,Company,Industry,Location,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,Delta Tucker Holdings,Aerospace and Defense,"McLean, VA",1923,-133,12000
Apparel,Guess,Apparel,"Los Angeles, CA",2204,82,13500
Business Services,DeVry Education Group,Education,"Downers Grove, IL",1910,140,11770
Chemicals,H.B. Fuller,Chemicals,"St. Paul, MN",2084,87,4425
Energy,Portland General Electric,Utilities: Gas and Electric,"Portland, OR",1898,172,2646
Engineering & Construction,MDC Holdings,Homebuilders,"Denver, CO",1909,66,1225
Financials,New York Community Bancorp,Commercial Banks,"Westbury, NY",1902,-47,3448
Food and Drug Stores,Fred’s,Food and Drug Stores,"Memphis, TN",2151,-7,7103
"Food, Beverages & Tobacco",Alliance One International,Tobacco,"Morrisville, NC",2066,-15,6835
Health Care,Providence Service,Health Care: Pharmacy and Other Services,"Tucson, AZ",1987,84,9072


In [22]:
# The .groups attribute is a dict-like mapping: each key is a group label
# (a sector name) and each value holds the index labels -- here the Rank values --
# of the rows in the original DataFrame that make up that group.
sectors.groups
# Layout of each entry ->  group label : row index labels

{'Aerospace & Defense': Int64Index([ 24,  45,  60,  88, 118, 120, 209, 245, 282, 378, 389, 490, 560,
             605, 785, 788, 836, 903, 958, 987],
            dtype='int64', name='Rank'),
 'Apparel': Int64Index([91, 231, 340, 354, 448, 547, 575, 597, 683, 695, 726, 794, 877,
             882, 917],
            dtype='int64', name='Rank'),
 'Business Services': Int64Index([144, 186, 199, 204, 221, 248, 249, 294, 307, 312, 355, 392, 404,
             440, 467, 468, 481, 485, 492, 503, 545, 626, 635, 652, 677, 694,
             714, 729, 734, 735, 737, 744, 767, 776, 777, 783, 791, 792, 796,
             801, 803, 816, 819, 820, 869, 870, 886, 939, 951, 952, 993],
            dtype='int64', name='Rank'),
 'Chemicals': Int64Index([ 56, 101, 182, 189, 206, 253, 262, 277, 288, 296, 316, 538, 549,
             555, 566, 580, 613, 624, 654, 668, 717, 720, 724, 758, 761, 829,
             865, 898, 934, 949],
            dtype='int64', name='Rank'),
 'Energy': Int64Index([  2,  14,  30,  32,

In [26]:
# get_group() pulls out the rows that belong to a single group as a DataFrame.
# Only the last expression of a cell is echoed, so the first two lookups are
# passed to display() (trimmed with head() to keep the output short).
display(sectors.get_group("Energy").head())
display(sectors.get_group("Technology").head())
sectors.get_group("Apparel")

Unnamed: 0_level_0,Company,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
91,Nike,Apparel,"Beaverton, OR",30601,3273,62600
231,VF,Apparel,"Greensboro, NC",12377,1232,64000
340,PVH,Apparel,"New York, NY",8020,572,26200
354,Ralph Lauren,Apparel,"New York, NY",7620,702,20000
448,Hanesbrands,Apparel,"Winston-Salem, NC",5732,429,65300
547,Levi Strauss,Apparel,"San Francisco, CA",4495,209,12500
575,Coach,Apparel,"New York, NY",4192,402,12950
597,Under Armour,Apparel,"Baltimore, MD",3963,233,9600
683,Fossil Group,Apparel,"Richardson, TX",3229,221,15100
695,Skechers U.S.A.,Apparel,"Manhattan Beach, CA",3159,232,6400


### Methods on the GroupBy Object and DataFrame Columns

In [35]:
# max() / min() are computed column by column within each group; they do not pick
# out a single row. For text columns the comparison is alphabetical, so max() gives
# the value closest to the end of the alphabet and min() the one closest to the start.
# Only the last expression of a cell is echoed, so the earlier results are passed
# to display() to make them visible.
display(sectors.max())
display(sectors.min())
# sum() and mean() are only meaningful for the numeric columns; numeric_only=True
# restricts them to those columns explicitly (newer pandas releases raise a
# TypeError from mean() when text columns are included).
display(sectors.sum(numeric_only=True))
sectors.mean(numeric_only=True)

Unnamed: 0_level_0,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aerospace & Defense,17897.0,1437.1,48402.85
Apparel,6397.866667,549.066667,23093.133333
Business Services,5337.156863,553.470588,26687.254902
Chemicals,8129.9,754.266667,15455.033333
Energy,12441.057377,-602.02459,9745.303279
Engineering & Construction,5922.423077,204.0,15642.615385
Financials,15950.784173,1872.007194,24172.28777
Food and Drug Stores,32251.266667,1117.266667,93026.533333
"Food, Beverages & Tobacco",12929.465116,1195.744186,28177.488372
Health Care,21529.426667,1414.853333,35710.52


In [33]:
# Cross-check of the per-group sum: pull out the Apparel group on its own and
# total its Revenue column. This is the value a per-sector sum() reports for
# Revenue in the Apparel row.
sectors.get_group("Apparel").loc[:, "Revenue"].sum()

95968

In [38]:
# Selecting a column on the GroupBy object before aggregating limits the
# calculation to that column. The result is a Series indexed by sector that
# holds the total Revenue of each group.
sectors["Revenue"].sum()

Sector
Aerospace & Defense              357940
Apparel                           95968
Business Services                272195
Chemicals                        243897
Energy                          1517809
Engineering & Construction       153983
Financials                      2217159
Food and Drug Stores             483769
Food, Beverages & Tobacco        555967
Health Care                     1614707
Hotels, Resturants & Leisure     169546
Household Products               234737
Industrials                      497581
Materials                        259145
Media                            220764
Motor Vehicles & Parts           482540
Retailing                       1465076
Technology                      1377600
Telecommunications               461834
Transportation                   408508
Wholesalers                      444800
Name: Revenue, dtype: int64

In [41]:
# Only the last expression of a cell is echoed, so the first two results are
# passed to display() to make all three aggregations visible.
display(sectors["Employees"].sum())   # total number of employees in each sector
display(sectors["Profits"].max())     # profit of the most profitable company in each sector
sectors["Employees"].mean()           # average number of employees per company in each sector

Sector
Aerospace & Defense             48402.850000
Apparel                         23093.133333
Business Services               26687.254902
Chemicals                       15455.033333
Energy                           9745.303279
Engineering & Construction      15642.615385
Financials                      24172.287770
Food and Drug Stores            93026.533333
Food, Beverages & Tobacco       28177.488372
Health Care                     35710.520000
Hotels, Resturants & Leisure    99369.800000
Household Products              23072.785714
Industrials                     33591.934783
Materials                       14840.069767
Media                           22012.560000
Motor Vehicles & Parts          45106.666667
Retailing                       77845.362500
Technology                      35087.735294
Telecommunications              55497.866667
Transportation                  42688.694444
Wholesalers                     13139.925000
Name: Employees, dtype: float64