# Python Data Visualization Tools

In [1]:
import numpy as np      ## Library with linear algebra functions/matrix analysis 
import matplotlib.pyplot    ## Library with graph and other plotting functions
import pandas as pd     ## Library with data set visualization and analysis functions
import seaborn         ## Library helping your visualizations look better

## Basic Pandas Operations

In [2]:
## Load both CSV files as pandas DataFrames, casting the playerID column to str
batting = pd.read_csv('Batting_Players.csv').astype({'playerID': str})
players = pd.read_csv('People.csv').astype({'playerID': str})
## Confirm that the loaded object is a DataFrame
type(batting)

pandas.core.frame.DataFrame

In [5]:
## Preview the first five rows of the batting data
batting.head(n=5)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abbotje01,2000,1,CHA,AL,80,215,31,59,15,...,29,2,1,21,38,1,2,2,1,2
1,abbotku01,2000,1,NYN,NL,79,157,22,34,7,...,12,1,1,14,51,2,1,0,1,2
2,abbotpa01,2000,1,SEA,AL,35,5,1,2,1,...,0,0,0,0,1,0,0,1,0,0
3,abreubo01,2000,1,PHI,NL,154,576,103,182,42,...,79,28,8,100,116,9,1,0,3,12
4,aceveju01,2000,1,MIL,NL,62,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [6]:
## Column labels of each DataFrame
for frame in (batting, players):
    print(frame.columns)
## Distinct values found in a single column of each DataFrame
for frame, column in ((batting, 'yearID'), (players, 'weight')):
    print(frame[column].unique())

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'G', 'AB', 'R', 'H',
       '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH',
       'SF', 'GIDP'],
      dtype='object')
Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID'],
      dtype='object')
[2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
 2014 2015 2016 2017]
[215. 180. 190. 184. 220. 192. 170. 175. 169. 200. 165. 185. 195.  nan
 210. 160. 255. 205. 245. 202. 134. 140. 235. 172. 168. 154. 182. 225.
 148. 260. 151. 158. 188. 187. 173. 250. 178. 270. 230. 150. 163. 208.
 166. 164. 197. 240. 162. 183. 155. 125. 227. 176. 275. 153. 193. 145.
 196. 174. 149. 130. 194. 223. 181. 152. 147. 186. 157. 156. 198. 207.
 

In [7]:
## Grabbing a column: .loc[row labels, column label]; the slice :5 is label-based and includes row 5
## print() is needed here because a cell only displays its final expression
print(batting.loc[:5, 'playerID'])
## Grabbing multiple columns: pass the column names as a list
players.loc[:5, ['nameFirst', 'nameLast', 'finalGame']]

Unnamed: 0,nameFirst,nameLast,finalGame
0,David,Aardsma,2015-08-23
1,Hank,Aaron,1976-10-03
2,Tommie,Aaron,1971-09-26
3,Don,Aase,1990-10-03
4,Andy,Abad,2006-04-13
5,Fernando,Abad,2017-10-01


In [8]:
## Column grouping capabilities: group by a single key or by several keys
## numeric_only=True restricts the aggregation to numeric columns, so the text
## columns (teamID, lgID) are skipped instead of breaking the aggregation
vals = batting.groupby(['playerID']).sum(numeric_only=True)
## A groupby needs an aggregation (sum, mean, ...) to produce values for the other columns
## NOTE: despite its name, batting_sums holds per-player, per-year means
batting_sums = batting.groupby(['playerID', 'yearID']).mean(numeric_only=True).reset_index()
batting_sums.loc[:5, ['playerID', 'yearID', 'G', 'AB', 'H']]

Unnamed: 0,playerID,yearID,G,AB,H
0,aardsda01,2004,11.0,0.0,0.0
1,aardsda01,2006,45.0,2.0,0.0
2,aardsda01,2007,25.0,0.0,0.0
3,aardsda01,2008,47.0,1.0,0.0
4,aardsda01,2009,73.0,0.0,0.0
5,aardsda01,2010,53.0,0.0,0.0


In [17]:
## Number of columns, then number of rows (the length of a single column)
print(len(batting_sums.columns), len(batting_sums['playerID']), sep='\n')

20
22970


In [28]:
## Adding columns together
## assign() returns a new DataFrame, so `batting` itself does not gain the
## 'Vals' column (a plain `batting1 = batting` would only alias the same object)
batting1 = batting.assign(Vals=batting['BB'] + batting['RBI'])
batting1.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,Vals
0,abbotje01,2000,1,CHA,AL,80,215,31,59,15,...,2,1,21,38,1,2,2,1,2,50
1,abbotku01,2000,1,NYN,NL,79,157,22,34,7,...,1,1,14,51,2,1,0,1,2,26
2,abbotpa01,2000,1,SEA,AL,35,5,1,2,1,...,0,0,0,1,0,0,1,0,0,0
3,abreubo01,2000,1,PHI,NL,154,576,103,182,42,...,28,8,100,116,9,1,0,3,12,179
4,aceveju01,2000,1,MIL,NL,62,1,1,0,0,...,0,0,1,1,0,0,0,0,0,1


In [27]:
## Subsetting DataFrame based on column values:
## Note: Always create separate data frame when subsetting original dataframe.
## Subsets keep original indices, reset indices to make life A LOT EASIER.
batting1 = batting[batting['teamID'] == 'BOS'].reset_index(drop=True)
players1 = players[players['birthYear'] >= 1990].reset_index(drop=True)
## Note: sometimes a DataFrame is turned into a Series (e.g. when a single column is selected).
## If you are having problems with your code, change your data object back into a DataFrame like so:
##     batting1 = pd.DataFrame(batting1)
batting1.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,alcanis01,2000,1,BOS,AL,21,45,9,13,1,...,7,0,0,3,7,0,0,0,0,0
1,alexama02,2000,1,BOS,AL,101,194,30,41,4,...,19,2,0,13,41,0,0,2,0,0
2,arrojro01,2000,2,BOS,AL,13,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,beckro01,2000,1,BOS,AL,34,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,berryse01,2000,2,BOS,AL,1,4,0,0,0,...,0,0,0,0,2,0,0,0,0,0


In [20]:
## Sorting values: most at-bats first, renumber the rows, then show rows 0 through 5
(
    batting1
    .sort_values(by='AB', ascending=False)
    .reset_index(drop=True)
    .loc[:5, ['playerID', 'AB', 'H']]
)

Unnamed: 0,playerID,AB,H
0,bettsmo01,672,214
1,ellsbja01,660,212
2,garcino01,658,198
3,pedrodu01,653,213
4,bogaexa01,652,192
5,pedrodu01,641,193


## Problem 1

In [25]:
## Count players born in the USA, players not born in the USA, and all players
born_in_usa = players['birthCountry'] == 'USA'
print(born_in_usa.sum())
print((~born_in_usa).sum())
print(len(players))

16879
2491
19370


## Problem 2

In [26]:
## Peek at the first five birthCountry values
players.loc[:, 'birthCountry'].head(n=5)

0    USA
1    USA
2    USA
3    USA
4    USA
Name: birthCountry, dtype: object

## Problem 3