# Pandas DataFrame 
- Creating dataframes
- Reading data from flat files and from the web
- Data manipulations

# DataFrames
- A DataFrame is a group of Series (one Series per column)

In [25]:
# Import only the display helpers this notebook uses instead of `import *`,
# so it is clear where `display` and `HTML` come from.
from IPython.display import HTML, display

# Inject CSS that sets the notebook container to 100% width so wide tables fit.
display(HTML("<style>.container { width:100% !important; }</style>"))

from io import StringIO

import numpy as np
import pandas as pd

## Example of creating a dataframe

In [26]:
# Build a two-column frame from a dict of equal-length lists.
# No index is given, so rows are labelled 0, 1, 2.
df = pd.DataFrame(
    {
        'x': [2, 8, 9],
        'y': [3, 7, 1],
    }
)
df

Unnamed: 0,x,y
0,2,3
1,8,7
2,9,1


In [27]:
# Rebind `df` to a new, smaller frame (two rows, same column names).
df = pd.DataFrame(
    {
        'x': [4, 7],
        'y': [7, 8],
    }
)
df

Unnamed: 0,x,y
0,4,7
1,7,8


# Dataframe with non-default index

In [28]:
# Same construction, but with explicit row labels instead of 0..n-1.
df = pd.DataFrame(
    {'x': [3, 2, 5], 'y': ['Hello', 'Hi', 'Bye']},
    index=['First', 'Second', 'Third'],
)
df

Unnamed: 0,x,y
First,3,Hello
Second,2,Hi
Third,5,Bye


## Another example

In [29]:
# Example frame used by the slicing and file I/O cells below:
# one row per person, one column per attribute.
person = pd.DataFrame(
    dict(
        Name=['Zoe', 'Max', 'Sarah'],
        Year_Grad=[2010, 2015, 2007],
        School=['Stanford', 'UCLA', 'UC Davis'],
        Major=['Engineering', 'Finance', 'Accounting'],
    )
)
person

Unnamed: 0,Name,Year_Grad,School,Major
0,Zoe,2010,Stanford,Engineering
1,Max,2015,UCLA,Finance
2,Sarah,2007,UC Davis,Accounting


## Slicing and dicing

In [30]:
# Positional slice: rows from 0 onward and columns from 0 onward, i.e. the whole frame.
person.iloc[0:, 0:]

Unnamed: 0,Name,Year_Grad,School,Major
0,Zoe,2010,Stanford,Engineering
1,Max,2015,UCLA,Finance
2,Sarah,2007,UC Davis,Accounting


## Slicing by the index/column

In [31]:
# Positional lookup: row at position 1, column at position 2.
person.iloc[1, 2]

'UCLA'

In [32]:
# Label lookup: row labelled 1, column labelled 'School' (same cell as the positional lookup above).
person.loc[1, 'School']

'UCLA'

In [33]:
# Label lookup with lists: several rows and several columns at once.
person.loc[[1, 2], ['School', 'Major']]

Unnamed: 0,School,Major
1,UCLA,Finance
2,UC Davis,Accounting


In [34]:
# A single row label with all columns comes back as a Series named after the row.
person.loc[1, :]

Name             Max
Year_Grad       2015
School          UCLA
Major        Finance
Name: 1, dtype: object

## Saving dataframes to flat files

In [35]:
!pip install openpyxl



In [36]:
# Write the same frame to three flat files, keeping the index as the first column.
# Excel workbook with a named sheet (presumably why openpyxl was installed above).
person.to_excel('Person.xlsx', sheet_name='My Data', index=True)
# Comma-separated text.
person.to_csv('Person.csv', index=True)
# Same writer, but with '#' as the field separator.
person.to_csv('PersonText.txt', index=True, sep='#')
# Shell escape: list the three most recently modified files in the working directory.
! ls -ltr | tail -3

-rw-r--r--  1 gerislayer  staff     5101 Mar 21 18:43 Person.xlsx
-rw-r--r--  1 gerislayer  staff      118 Mar 21 18:43 Person.csv
-rw-r--r--  1 gerislayer  staff      118 Mar 21 18:43 PersonText.txt


## Reading from xlsx, csv, txt

In [37]:
# Read the workbook back. The index was saved without a header, so it shows up
# as a column called 'Unnamed: 0'; use that column as the index again.
df1 = pd.read_excel(
    'Person.xlsx',
    sheet_name=0,  # first sheet, selected by position
    index_col='Unnamed: 0',
)
df1

Unnamed: 0,Name,Year_Grad,School,Major
0,Zoe,2010,Stanford,Engineering
1,Max,2015,UCLA,Finance
2,Sarah,2007,UC Davis,Accounting


In [38]:
# Read the CSV back, restoring the saved index from its 'Unnamed: 0' column.
df2 = pd.read_csv(
    'Person.csv',
    index_col='Unnamed: 0',
)
df2

Unnamed: 0,Name,Year_Grad,School,Major
0,Zoe,2010,Stanford,Engineering
1,Max,2015,UCLA,Finance
2,Sarah,2007,UC Davis,Accounting


In [39]:
# read_table is given sep=',' here so that it can parse the comma-separated file.
df3 = pd.read_table(
    filepath_or_buffer='Person.csv',
    index_col='Unnamed: 0',
    sep=',',
)
df3

Unnamed: 0,Name,Year_Grad,School,Major
0,Zoe,2010,Stanford,Engineering
1,Max,2015,UCLA,Finance
2,Sarah,2007,UC Davis,Accounting


# Read specific columns

In [40]:
# Load only the listed columns from the workbook.
pd.read_excel(
    'Person.xlsx',
    usecols=['Name', 'Year_Grad'],
)

Unnamed: 0,Name,Year_Grad
0,Zoe,2010
1,Max,2015
2,Sarah,2007


# Skip certain rows

In [41]:
# Skip the first line of the sheet (the header row) and read the rest with no
# header, so the columns are numbered 0..4.
pd.read_excel(
    'Person.xlsx',
    skiprows=1,
    header=None,
)

Unnamed: 0,0,1,2,3,4
0,0,Zoe,2010,Stanford,Engineering
1,1,Max,2015,UCLA,Finance
2,2,Sarah,2007,UC Davis,Accounting


## Convert dataframe to dict

In [42]:
# Nested mapping {column -> {index label -> value}}; 'dict' is the default orientation.
df3.to_dict(orient='dict')

{'Name': {0: 'Zoe', 1: 'Max', 2: 'Sarah'},
 'Year_Grad': {0: 2010, 1: 2015, 2: 2007},
 'School': {0: 'Stanford', 1: 'UCLA', 2: 'UC Davis'},
 'Major': {0: 'Engineering', 1: 'Finance', 2: 'Accounting'}}

## Convert dataframe to JSON

In [43]:
# With no path argument, to_json returns the JSON text as a Python string.
jsn = df3.to_json()
jsn

'{"Name":{"0":"Zoe","1":"Max","2":"Sarah"},"Year_Grad":{"0":2010,"1":2015,"2":2007},"School":{"0":"Stanford","1":"UCLA","2":"UC Davis"},"Major":{"0":"Engineering","1":"Finance","2":"Accounting"}}'

## dictionary to dataframe

In [44]:
# Round trip: frame -> nested dict -> frame.
d = df3.to_dict()
# Outer keys become columns, inner keys become the index.
ddd = pd.DataFrame.from_dict(d)
ddd

Unnamed: 0,Name,Year_Grad,School,Major
0,Zoe,2010,Stanford,Engineering
1,Max,2015,UCLA,Finance
2,Sarah,2007,UC Davis,Accounting


## JSON string to dataframe

In [45]:
# Round trip: frame -> JSON string -> frame.
jsn = df3.to_json()
# Wrap the string in StringIO: passing a literal JSON string straight to
# read_json is deprecated in recent pandas and emits a FutureWarning.
jjj = pd.read_json(StringIO(jsn))
jjj

  jjj = pd.read_json(jsn)


Unnamed: 0,Name,Year_Grad,School,Major
0,Zoe,2010,Stanford,Engineering
1,Max,2015,UCLA,Finance
2,Sarah,2007,UC Davis,Accounting


## Read data from web

In [46]:
import webbrowser as wb
# Source page for the web-scraping example; `web` is reused by the read_html cell below.
web = 'https://en.wikipedia.org/wiki/List_of_NBA_champions'
# Open the page in the default browser so the table can be inspected by eye.
wb.open(web)

True

In [27]:
!pip install lxml

Collecting lxml
  Downloading lxml-5.3.1-cp39-cp39-macosx_10_9_x86_64.whl.metadata (3.7 kB)
Downloading lxml-5.3.1-cp39-cp39-macosx_10_9_x86_64.whl (4.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m31m17.6 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-5.3.1


In [47]:
# read_html parses the tables on the page into a list of DataFrames; keep the one at position 1.
# NOTE(review): the positional index depends on the page layout at fetch time —
# confirm it still points at the champions table.
nba_chmps = pd.read_html(web)[1] # Please run this with Google Chrome
nba_chmps.head(2)

Unnamed: 0,Year,Western champion,Coach,Result,Eastern champion,Coach.1,Finals MVP[a],Ref
0,Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA)
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16]


In [48]:
# Last two rows, to check where the table ends.
nba_chmps.tail(2)

Unnamed: 0,Year,Western champion,Coach,Result,Eastern champion,Coach.1,Finals MVP[a],Ref
78,2023,"Denver Nuggets (1) (1, 1–0)",Michael Malone,4–1,"Miami Heat (8) (7, 3–4)",Erik Spoelstra,Nikola Jokić,[102]
79,2024,"Dallas Mavericks (5) (3, 1–2)",Jason Kidd,1–4,"Boston Celtics (1) (23, 18–5)",Joe Mazzulla,Jaylen Brown,[103]


### Find the header of the data frame

In [49]:
# Column labels as a plain Python list.
list(nba_chmps.columns)

['Year',
 'Western champion',
 'Coach',
 'Result',
 'Eastern champion',
 'Coach.1',
 'Finals MVP[a]',
 'Ref']

### Find the dimensionality of the dataframe

In [50]:
# (number of rows, number of columns)
nba_chmps.shape

(80, 8)

# Show the entire dataframe

In [51]:
# Rich display of the frame; with the default display options the middle rows
# are elided (shown as "...").
display(nba_chmps)

Unnamed: 0,Year,Western champion,Coach,Result,Eastern champion,Coach.1,Finals MVP[a],Ref
0,Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA)
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16]
2,1948,"Baltimore Bullets† (2) (1, 1–0)",Buddy Jeannette,4–2,"Philadelphia Warriors (1) (2, 1–1)",Eddie Gottlieb,,[17]
3,1949,"Minneapolis Lakers (2) (1, 1–0)",John Kundla,4–2,"Washington Capitols (1) (1, 0–1)",Red Auerbach,,[18]
4,National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA)
...,...,...,...,...,...,...,...,...
75,2020[h],"Los Angeles Lakers (1) (32, 17–15)",Frank Vogel,4–2,"Miami Heat (5) (6, 3–3)",Erik Spoelstra,LeBron James,[99]
76,2021,"Phoenix Suns (2) (3, 0–3)",Monty Williams,2–4,"Milwaukee Bucks (3) (3, 2–1)",Mike Budenholzer,Giannis Antetokounmpo,[100]
77,2022,"Golden State Warriors (3) (12, 7–5)",Steve Kerr,4–2,"Boston Celtics (2) (22, 17–5)",Ime Udoka,Stephen Curry,[101]
78,2023,"Denver Nuggets (1) (1, 1–0)",Michael Malone,4–1,"Miami Heat (8) (7, 3–4)",Erik Spoelstra,Nikola Jokić,[102]


## Add a column to a dataFrame

In [52]:
# Assigning a scalar to a new column label fills every row with that value.
nba_chmps['MVP'] = 'Fans'
display(nba_chmps.head(2))

Unnamed: 0,Year,Western champion,Coach,Result,Eastern champion,Coach.1,Finals MVP[a],Ref,MVP
0,Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Fans
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16],Fans


## Rename column/s

In [53]:
# Rename via a {old: new} mapping and rebind the result instead of mutating with
# inplace=True; the assignment form keeps the cell chainable and explicit.
nba_chmps = nba_chmps.rename(columns={'MVP': 'Basketball_Fans'})

In [54]:
# Confirm the renamed column is present.
display(nba_chmps.head(2))

Unnamed: 0,Year,Western champion,Coach,Result,Eastern champion,Coach.1,Finals MVP[a],Ref,Basketball_Fans
0,Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Fans
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16],Fans


In [55]:
# Remove the demo column again; rebind the returned frame rather than using inplace=True.
nba_chmps = nba_chmps.drop(columns=['Basketball_Fans'])
nba_chmps.head(1)

Unnamed: 0,Year,Western champion,Coach,Result,Eastern champion,Coach.1,Finals MVP[a],Ref
0,Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA)


## Let's clean headers of a dataframe

In [56]:
# Replace spaces in every column label with underscores (literal replacement,
# not a regex) using the vectorised string accessor on the column Index.
nba_chmps.columns = nba_chmps.columns.str.replace(' ', '_', regex=False)
nba_chmps.head(2)

Unnamed: 0,Year,Western_champion,Coach,Result,Eastern_champion,Coach.1,Finals_MVP[a],Ref
0,Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA)
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16]


In [57]:
# The table has two 'Coach' columns: 'Coach' follows the Western champion and
# 'Coach.1' follows the Eastern champion. Give both explicit names, rebinding
# the returned frame rather than mutating with inplace=True.
nba_chmps = nba_chmps.rename(
    columns={
        'Coach': 'Western_Coach',
        'Coach.1': 'Eastern_Coach',
    }
)

nba_chmps.head(2)

Unnamed: 0,Year,Western_champion,Western_Coach,Result,Eastern_champion,Eastern_Coach,Finals_MVP[a],Ref
0,Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA)
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16]


## Two ways to call a column

In [58]:
# Select one column (a Series) by label, then take its first five entries.
nba_chmps['Western_Coach'][0:5]

0    Basketball Association of America (BAA)
1                               Harold Olsen
2                            Buddy Jeannette
3                                John Kundla
4      National Basketball Association (NBA)
Name: Western_Coach, dtype: object

In [59]:
# Option 1: attribute access (works here because 'Year' is a valid Python identifier)
print(nba_chmps.Year[0:5])

# Option 2: bracket access by column label
print(nba_chmps['Year'][0:5])

0    Basketball Association of America (BAA)
1                                       1947
2                                       1948
3                                       1949
4      National Basketball Association (NBA)
Name: Year, dtype: object
0    Basketball Association of America (BAA)
1                                       1947
2                                       1948
3                                       1949
4      National Basketball Association (NBA)
Name: Year, dtype: object


## Find the records where Gregg Popovich or Steve Kerr made it to the finals

In [60]:
# Boolean Series: True where the Western coach is one of the listed names.
nba_chmps.Western_Coach.isin(['Gregg Popovich', 'Steve Kerr'])

0     False
1     False
2     False
3     False
4     False
      ...  
75    False
76    False
77     True
78    False
79    False
Name: Western_Coach, Length: 80, dtype: bool

In [61]:
# Keep rows where either coach column holds one of the two names:
# test both columns at once, then collapse across columns with any().
nba_chmps[
    nba_chmps[['Western_Coach', 'Eastern_Coach']]
    .isin(['Gregg Popovich', 'Steve Kerr'])
    .any(axis=1)
]

Unnamed: 0,Year,Western_champion,Western_Coach,Result,Eastern_champion,Eastern_Coach,Finals_MVP[a],Ref
54,1999[f],"San Antonio Spurs (1) (1, 1–0)",Gregg Popovich,4–1,"New York Knicks (8) (8, 2–6)",Jeff Van Gundy,Tim Duncan,[75]
58,2003,"San Antonio Spurs (1) (2, 2–0)",Gregg Popovich,4–2,"New Jersey Nets (2) (2, 0–2)",Byron Scott,Tim Duncan,[79]
60,2005,"San Antonio Spurs (2) (3, 3–0)",Gregg Popovich,4–3,"Detroit Pistons (2) (7, 3–4)",Larry Brown,Tim Duncan,[81]
62,2007,"San Antonio Spurs (3) (4, 4–0)",Gregg Popovich,4–0,"Cleveland Cavaliers (2) (1, 0–1)",Mike Brown,Tony Parker,[83]
68,2013,"San Antonio Spurs (2) (5, 4–1)",Gregg Popovich,3–4,"Miami Heat (1) (4, 3–1)",Erik Spoelstra,LeBron James,[91]
69,2014,"San Antonio Spurs (1) (6, 5–1)",Gregg Popovich,4–1,"Miami Heat (2) (5, 3–2)",Erik Spoelstra,Kawhi Leonard,[92]
70,2015,"Golden State Warriors (1) (7, 4–3)",Steve Kerr,4–2,"Cleveland Cavaliers (2) (2, 0–2)",David Blatt,Andre Iguodala,[93]
71,2016,"Golden State Warriors (1) (8, 4–4)",Steve Kerr,3–4,"Cleveland Cavaliers (1) (3, 1–2)",Tyronn Lue,LeBron James,[94]
72,2017,"Golden State Warriors (1) (9, 5–4)",Steve Kerr,4–1,"Cleveland Cavaliers (2) (4, 1–3)",Tyronn Lue,Kevin Durant,[95]
73,2018,"Golden State Warriors (2) (10, 6–4)",Steve Kerr,4–0,"Cleveland Cavaliers (4) (5, 1–4)",Tyronn Lue,Kevin Durant,[96]


## Task:
1. Add a column: If the western team won, then "Western" else "Eastern"
2. Find how many championships went Eastern/Western

In [62]:
# Toy example of the labelling rule used in the next cell.
R = ['4-2', '3-4', '4-0', '4-1', '2-4']
# A score whose first character is '4' is labelled 'W'; anything else is 'E'.
list(map(lambda score: 'W' if score.startswith('4') else 'E', R))

['W', 'E', 'W', 'W', 'E']

In [63]:
# Label each row from the first character of Result: a leading '4' is labelled
# 'Western', anything else 'Eastern'. This uses the vectorised .str accessor
# and np.where instead of a Python-level list comprehension; na=False makes a
# missing Result fall into the 'Eastern' branch instead of raising.
# NOTE: the league banner rows (BAA / NBA) do not start with '4', so they are
# labelled 'Eastern' here; they are still in the frame at this point.
nba_chmps['Champ'] = np.where(
    nba_chmps.Result.str.startswith('4', na=False),
    'Western',
    'Eastern',
)
nba_chmps.head()

Unnamed: 0,Year,Western_champion,Western_Coach,Result,Eastern_champion,Eastern_Coach,Finals_MVP[a],Ref,Champ
0,Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Basketball Association of America (BAA),Eastern
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16],Eastern
2,1948,"Baltimore Bullets† (2) (1, 1–0)",Buddy Jeannette,4–2,"Philadelphia Warriors (1) (2, 1–1)",Eddie Gottlieb,,[17],Western
3,1949,"Minneapolis Lakers (2) (1, 1–0)",John Kundla,4–2,"Washington Capitols (1) (1, 0–1)",Red Auerbach,,[18],Western
4,National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),National Basketball Association (NBA),Eastern


# How can we eliminate the mistaken rows (the league-name banner rows that are not real seasons)?

In [46]:
# Keep only rows whose Year text begins with '1' or '2'; this drops the
# league-name banner rows, whose Year cell holds the banner text.
nba_chmps = nba_chmps.loc[
    nba_chmps.Year.str.startswith('1')
    | nba_chmps.Year.str.startswith('2')
]
nba_chmps.head()

Unnamed: 0,Year,Western_champion,Western_Coach,Result,Eastern_champion,Eastern_Coach,Finals_MVP[a],Ref,Champ
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16],Eastern
2,1948,"Baltimore Bullets† (2) (1, 1–0)",Buddy Jeannette,4–2,"Philadelphia Warriors (1) (2, 1–1)",Eddie Gottlieb,,[17],Western
3,1949,"Minneapolis Lakers (2) (1, 1–0)",John Kundla,4–2,"Washington Capitols (1) (1, 0–1)",Red Auerbach,,[18],Western
5,1950,"Minneapolis Lakers (1) [b] (2, 2–0)",John Kundla,4–2,"Syracuse Nationals (1) (1, 0–1)",Al Cervi,,[22][23],Western
6,1951,"Rochester Royals (2) (1, 1–0)",Les Harrison,4–3,"New York Knicks (3) (1, 0–1)",Joe Lapchick,,[24],Western


# Count the "Western" vs "Eastern"

In [47]:
# Number of rows per distinct Champ label, most frequent first.
nba_chmps.Champ.value_counts()

Champ
Eastern    41
Western    37
Name: count, dtype: int64

# Unique Values

In [48]:
# Distinct Champ labels, returned as an array.
nba_chmps.Champ.unique()

array(['Eastern', 'Western'], dtype=object)

## To find who won the NBA championship in ['1974', '1987', '1995', '2015']

In [49]:
# Print the winning team for each requested season.
years = ['1974', '1987', '1995', '2015']
for year in years:
    # Match on the leading year digits so footnoted labels such as '1999[f]'
    # are found as well as plain ones.
    finals = nba_chmps[nba_chmps.Year.str.startswith(year)]
    if finals.empty:
        # Guard against a year with no matching row instead of failing with an
        # IndexError on an empty selection.
        print(year, ':\t', 'no Finals row found')
        continue

    series = finals.iloc[0]
    # A Result beginning with '4' is treated as a Western win, otherwise Eastern.
    if series['Result'][0] == '4':
        champion = series['Western_champion']
    else:
        champion = series['Eastern_champion']
    print(year, ':\t', champion)

1974 :	 Boston Celtics (1) (13, 12–1)
1987 :	 Los Angeles Lakers (1) (21, 10–11)
1995 :	 Houston Rockets (6) (4, 2–2)
2015 :	 Golden State Warriors (1) (7, 4–3)


In [50]:
# Full rows for the same seasons: isin keeps rows whose Year exactly equals one of `years`
# (defined in the previous cell).
nba_chmps[nba_chmps.Year.isin(years)]

Unnamed: 0,Year,Western_champion,Western_Coach,Result,Eastern_champion,Eastern_Coach,Finals_MVP[a],Ref,Champ
29,1974,"Milwaukee Bucks (1) (2, 1–1)",Larry Costello,3–4,"Boston Celtics (1) (13, 12–1)",Tom Heinsohn,John Havlicek,[47],Eastern
42,1987,"Los Angeles Lakers (1) (21, 10–11)",Pat Riley,4–2,"Boston Celtics (1) (19, 16–3)",K. C. Jones,Magic Johnson,[62],Western
50,1995,"Houston Rockets (6) (4, 2–2)",Rudy Tomjanovich,4–0,"Orlando Magic (1) (1, 0–1)",Brian Hill,Hakeem Olajuwon,[70],Western
70,2015,"Golden State Warriors (1) (7, 4–3)",Steve Kerr,4–2,"Cleveland Cavaliers (2) (2, 0–2)",David Blatt,Andre Iguodala,[93],Western


## How to select several columns

In [51]:
# Reminder of the available columns before selecting a subset.
nba_chmps.head(2)

Unnamed: 0,Year,Western_champion,Western_Coach,Result,Eastern_champion,Eastern_Coach,Finals_MVP[a],Ref,Champ
1,1947,"Chicago Stags (1) (1, 0–1)",Harold Olsen,1–4,"Philadelphia Warriors (2) (1, 1–0)",Eddie Gottlieb,,[16],Eastern
2,1948,"Baltimore Bullets† (2) (1, 1–0)",Buddy Jeannette,4–2,"Philadelphia Warriors (1) (2, 1–1)",Eddie Gottlieb,,[17],Western


In [52]:
# A list of labels inside the brackets selects several columns and returns a DataFrame.
nba_chmps[['Year', 'Western_champion', 'Eastern_champion']].head(3)

Unnamed: 0,Year,Western_champion,Eastern_champion
1,1947,"Chicago Stags (1) (1, 0–1)","Philadelphia Warriors (2) (1, 1–0)"
2,1948,"Baltimore Bullets† (2) (1, 1–0)","Philadelphia Warriors (1) (2, 1–1)"
3,1949,"Minneapolis Lakers (2) (1, 1–0)","Washington Capitols (1) (1, 0–1)"


In [53]:
# Write the frame as an HTML table to Test.html in the working directory.
nba_chmps.to_html('Test.html')


In [54]:
# Shell escape that opens the exported file with the system's default handler.
# NOTE(review): `open` is presumably the macOS launcher here — other platforms need a different command.
! open Test.html