# Pandas Basics

In [2]:
import numpy as np
import pandas as pd
pd.__version__

'2.0.1'

### Pandas Series

In [3]:
# Build a Series from a plain list; with no index given, pandas labels the values 0..3.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [5]:
# keys is a method, not an attribute: call it to get the index (it is an alias for data.index).
data.keys()

RangeIndex(start=0, stop=4, step=1)

In [6]:
data[0]

0.25

In [7]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [8]:
# Same four values, this time labelled with an explicit string index instead of 0..3.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)

In [9]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [10]:
# keys is a method, not an attribute: call it to get the index labels (alias for data.index).
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [11]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [12]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [13]:
data['c']

0.75

In [14]:
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [15]:
data[['d','a','b']]

d    1.00
a    0.25
b    0.50
dtype: float64

In [16]:
# Population per state; when the dict is turned into a Series its keys become the index.
population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}

population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [17]:
population['New York']

19651127

### Pandas DataFrame

In [18]:
# Area per state, keyed by the same state names as the population Series.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}
area = pd.Series(data=area_dict)

# Each Series becomes one column of the frame; the state names form the row index.
states = pd.DataFrame(dict(population=population, area=area))

In [19]:
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [20]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [21]:
states.columns

Index(['population', 'area'], dtype='object')

### Initialization with NumPy

In [22]:
# Draw the 4x2 table of uniform [0, 1) values from a seeded Generator so that
# re-running the notebook reproduces exactly the same numbers.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.random((4, 2)), columns=['A', 'B'])
df

Unnamed: 0,A,B
0,0.039265,0.000837
1,0.092869,0.858402
2,0.566751,0.047084
3,0.275215,0.632028


In [23]:
df['A']

0    0.039265
1    0.092869
2    0.566751
3    0.275215
Name: A, dtype: float64

In [24]:
df['B']

0    0.000837
1    0.858402
2    0.047084
3    0.632028
Name: B, dtype: float64

In [25]:
df.A

0    0.039265
1    0.092869
2    0.566751
3    0.275215
Name: A, dtype: float64

In [26]:
df.B

0    0.000837
1    0.858402
2    0.047084
3    0.632028
Name: B, dtype: float64

In [27]:
# Random 4x2 table with explicit row labels a..d; the generator is seeded so the
# values are the same on every run.
df = pd.DataFrame(
    np.random.default_rng(seed=7).random((4, 2)),
    columns=['A', 'B'],
    index=list('abcd'),
)

In [28]:
df

Unnamed: 0,A,B
a,0.827474,0.611427
b,0.159086,0.653477
c,0.058909,0.645425
d,0.548712,0.510113


### Initialization with a dictionary

In [29]:
# Column-oriented mapping: one key per column name, one equal-length list of values per key.
data = dict(
    ColA=[1, 2, 3, 4],
    ColB=[5, 6, 7, 8],
    ColC=['A', 'B', 'C', 'D'],
)
data

{'ColA': [1, 2, 3, 4], 'ColB': [5, 6, 7, 8], 'ColC': ['A', 'B', 'C', 'D']}

In [30]:
# Turn the column dict into a frame: keys become column names, list positions become rows.
df = pd.DataFrame(data=data)
df

Unnamed: 0,ColA,ColB,ColC
0,1,5,A
1,2,6,B
2,3,7,C
3,4,8,D


In [31]:
# One list per column; the three lists are aligned by position (one entry per country).
data = {
    'Country': ['USA', 'China', 'Japan'],
    'Population': [328_200_000, 1_397_710_000, 126_150_000],
    'GDP per capita': [62606, 10262, 40162],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Country,Population,GDP per capita
0,USA,328200000,62606
1,China,1397710000,10262
2,Japan,126150000,40162


In [32]:
# Total GDP per row: population multiplied element-wise by GDP per capita.
df['GDP'] = df['Population'].mul(df['GDP per capita'])

In [33]:
df

Unnamed: 0,Country,Population,GDP per capita,GDP
0,USA,328200000,62606,20547289200000
1,China,1397710000,10262,14343300020000
2,Japan,126150000,40162,5066436300000


In [34]:
# Add a column from a plain list; it must have exactly one value per existing row.
df['X'] = list(range(10, 40, 10))
df

Unnamed: 0,Country,Population,GDP per capita,GDP,X
0,USA,328200000,62606,20547289200000,10
1,China,1397710000,10262,14343300020000,20
2,Japan,126150000,40162,5066436300000,30


### Data indexing and selection

In [35]:
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [36]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [37]:
# Derived column: population divided element-wise by area, one value per state.
states['density'] = states['population'].div(states['area'])
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [39]:
states.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [40]:
states.T['California']

population    3.833252e+07
area          4.239670e+05
density       9.041393e+01
Name: California, dtype: float64

In [41]:
states.T['New York']

population    1.965113e+07
area          1.412970e+05
density       1.390767e+02
Name: New York, dtype: float64

In [42]:
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [47]:
# Positional slice: rows at positions 1 and 2, columns at positions 0 and 1 (slice end is excluded)
states.iloc[1:3,:2] 

Unnamed: 0,population,area
Texas,26448193,695662
New York,19651127,141297


In [52]:
states.loc['Texas':'Florida', :'area'] # Label-based slices of rows and columns (end labels are included)

Unnamed: 0,population,area
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312


In [53]:
states.loc[states['density'] > 100, ['population', 'density']]

Unnamed: 0,population,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [54]:
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [55]:
# Positional assignment: overwrite the cell at row 0, column 2 (California's density in the frame above).
states.iloc[0,2] = 90

In [56]:
states

Unnamed: 0,population,area,density
California,38332521,423967,90.0
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [57]:
# Small people table: one list per column, entries aligned by position.
data = {
    'Name': ['John', 'Sarah', 'Michael', 'Elizabeth'],
    'Age': [25, 30, 40, 35],
    'Gender': ['Male', 'Female', 'Male', 'Female'],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Gender
0,John,25,Male
1,Sarah,30,Female
2,Michael,40,Male
3,Elizabeth,35,Female


In [58]:
# The boolean mask selects the rows named Sarah; 'Age' picks the column.
# A bare last expression is displayed by the notebook, so print() is not needed.
df.loc[df['Name'] == 'Sarah', 'Age']

1    30
Name: Age, dtype: int64


In [59]:
# The boolean mask selects the rows named Michael; 'Gender' picks the column.
# A bare last expression is displayed by the notebook, so print() is not needed.
df.loc[df['Name'] == 'Michael', 'Gender']

2    Male
Name: Gender, dtype: object


In [60]:
# Assign through .loc with a boolean mask so only Elizabeth's Age cell is overwritten.
df.loc[df['Name'].eq('Elizabeth'), 'Age'] = 36
print(df)

        Name  Age  Gender
0       John   25    Male
1      Sarah   30  Female
2    Michael   40    Male
3  Elizabeth   36  Female


In [61]:
df.loc[0:2,['Name','Age']]

Unnamed: 0,Name,Age
0,John,25
1,Sarah,30
2,Michael,40


In [62]:
df.loc[df['Age']<40]

Unnamed: 0,Name,Age,Gender
0,John,25,Male
1,Sarah,30,Female
3,Elizabeth,36,Female


In [63]:
# Country-level table; the GDP column is rescaled by a later cell.
df = pd.DataFrame(
    data={
        'country': ['USA', 'China', 'Japan', 'Germany', 'India'],
        'population': [328_200_000, 1_393_000_000, 126_500_000, 83_020_000, 1_353_000_000],
        'GDP': [21_300_000, 14_200_000, 5_085_000, 4_040_000, 2_999_000],
    }
)

In [64]:
# Scale the GDP column by one million (the result becomes float).
# Caution: the column is overwritten from its own values, so running this cell
# twice scales it twice; rebuild df first before re-running.
df['GDP'] = df['GDP'].mul(1e6)

In [65]:
# Per-capita figure: GDP divided element-wise by population.
df['GDP per capita'] = df['GDP'].div(df['population'])
df

Unnamed: 0,country,population,GDP,GDP per capita
0,USA,328200000,21300000000000.0,64899.451554
1,China,1393000000,14200000000000.0,10193.826274
2,Japan,126500000,5085000000000.0,40197.628458
3,Germany,83020000,4040000000000.0,48662.972778
4,India,1353000000,2999000000000.0,2216.555802


In [66]:
# idxmax() returns the row label of the largest value; compute it once and reuse it
# for both the lookup and the printed message.
max_idx = df['GDP per capita'].idxmax()
max_country = df.loc[max_idx, 'country']
print(f'idxmax: {max_idx}')
max_country

idxmax: 0


'USA'

In [67]:
df.loc[df['GDP per capita'] > 40000,'population'].mean()

179240000.0

In [68]:
# Rank by GDP per capita (highest first), keep the top three rows, then keep only
# the two columns of interest. The original row labels are preserved.
top_countries = (
    df.sort_values(by='GDP per capita', ascending=False)
    .head(3)
    [['country', 'population']]
)
top_countries

Unnamed: 0,country,population
0,USA,328200000
3,Germany,83020000
2,Japan,126500000
