In [1]:
import numpy as np
import pandas as pd

In [2]:
x = pd.Series([10, 20, 30, 40, 50])

In [3]:
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
x.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
x.dtype

dtype('int64')

In [6]:
x.values

array([10, 20, 30, 40, 50], dtype=int64)

In [7]:
data = [450, 650, 870]
index = [2010, 2011, 2012]
sales = pd.Series(data, index=index)
sales.index.name = "Year"
sales.name = "Total sales per year"
sales

Year
2010    450
2011    650
2012    870
Name: Total sales per year, dtype: int64

In [8]:
sales.index

Int64Index([2010, 2011, 2012], dtype='int64', name='Year')

In [9]:
sales[2010]

450

In [10]:
sales > 500

Year
2010    False
2011     True
2012     True
Name: Total sales per year, dtype: bool

In [11]:
sales[[False, True, True]]

Year
2011    650
2012    870
Name: Total sales per year, dtype: int64

In [12]:
sales[sales > 500]

Year
2011    650
2012    870
Name: Total sales per year, dtype: int64

In [13]:
sales.iloc[0]  # index locations / order

450

In [14]:
sales[sales > 500].index.values

array([2011, 2012], dtype=int64)

In [15]:
list(sales[sales > 500].values)

[650, 870]

In [16]:
650 in sales

False

In [17]:
2011 in sales

True

In [18]:
650 in sales.values

True

In [19]:
sales.to_dict()

{2010: 450, 2011: 650, 2012: 870}

In [20]:
sales_dict = {
    "Don": 534,
    "Mike": 453,
    "Edwin": 412,
}
sales_ser = pd.Series(sales_dict)
sales_ser

Don      534
Mike     453
Edwin    412
dtype: int64

In [21]:
new_sales = pd.Series(sales_dict, index=['Don', 'Mike', 'Sally', 'Edwin', 'Lucy'])
new_sales

Don      534.0
Mike     453.0
Sally      NaN
Edwin    412.0
Lucy       NaN
dtype: float64

## DataFrames
- Two-dimensional
- Size-mutable
- Heterogenous

In [22]:
list_dicts = [
    {'Name': 'Tom', 'Sales': 300},
    {'Name': 'Greg'},
    {'Name': 'Simone', 'Sales': 747},
    {'Name': 'Paula', 'Sales': 534, 'Performance': 'Outstanding'}
]

df_list_dicts = pd.DataFrame(list_dicts)
df_list_dicts

Unnamed: 0,Name,Sales,Performance
0,Tom,300.0,
1,Greg,,
2,Simone,747.0,
3,Paula,534.0,Outstanding


In [23]:
df_list_dicts['Sales']

0    300.0
1      NaN
2    747.0
3    534.0
Name: Sales, dtype: float64

In [24]:
df_list_dicts.drop("Performance", axis=1)

Unnamed: 0,Name,Sales
0,Tom,300.0
1,Greg,
2,Simone,747.0
3,Paula,534.0


In [25]:
df_list_dicts.drop(1)

Unnamed: 0,Name,Sales,Performance
0,Tom,300.0,
2,Simone,747.0,
3,Paula,534.0,Outstanding


In [88]:
east = pd.Series([1000,1200,3400],index=['Q1','Q2','Q3'])
west = pd.Series([1100,1300,2400,3500],index=['Q1','Q2','Q3','Q4'])
df_region = pd.DataFrame({'East':east, 'West': west})
df_region['North'] = [2000,3000,2500,4000]
df_region['South'] = [1500,2000,1500,4000]

In [89]:
west
east

Q1    1000
Q2    1200
Q3    3400
dtype: int64

In [90]:
df_region

Unnamed: 0,East,West,North,South
Q1,1000.0,1100,2000,1500
Q2,1200.0,1300,3000,2000
Q3,3400.0,2400,2500,1500
Q4,,3500,4000,4000


In [91]:
df_region['years'] = ['2016', '2017', '2018', '2019']
df_region

Unnamed: 0,East,West,North,South,years
Q1,1000.0,1100,2000,1500,2016
Q2,1200.0,1300,3000,2000,2017
Q3,3400.0,2400,2500,1500,2018
Q4,,3500,4000,4000,2019


In [92]:
df_region.set_index('years', inplace = True)

In [93]:
df_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


In [94]:
new_region = df_region.reindex(['2018', ' 2019', ' 2020', ' 2021'])
new_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,,,
2020,,,,
2021,,,,


In [95]:
re_indexed = new_region.reindex(columns=['North', 'South', 'New'])
re_indexed

Unnamed: 0_level_0,North,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,2500.0,1500.0,
2019,,,
2020,,,
2021,,,


In [37]:
re_indexed.fillna(0)

Unnamed: 0_level_0,North,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,2500.0,1500.0,0.0
2019,0.0,0.0,0.0
2020,0.0,0.0,0.0
2021,0.0,0.0,0.0


In [38]:
new_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,,,
2020,,,,
2021,,,,


In [39]:
new_region.fillna(method='ffill')

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,3400.0,2400.0,2500.0,1500.0
2020,3400.0,2400.0,2500.0,1500.0
2021,3400.0,2400.0,2500.0,1500.0


In [50]:
new_region.loc['2022'] = [6400, 7500, 5300, 600]
new_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,,,
2020,,,,
2021,,,,
2022,6400.0,7500.0,5300.0,600.0


In [53]:
new_region.interpolate().round()

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,4150.0,3675.0,3200.0,1275.0
2020,4900.0,4950.0,3900.0,1050.0
2021,5650.0,6225.0,4600.0,825.0
2022,6400.0,7500.0,5300.0,600.0


In [52]:
new_region.fillna(new_region.mean())

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,4900.0,4950.0,3900.0,1050.0
2020,4900.0,4950.0,3900.0,1050.0
2021,4900.0,4950.0,3900.0,1050.0
2022,6400.0,7500.0,5300.0,600.0


In [54]:
new_region.dropna()

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2022,6400.0,7500.0,5300.0,600.0


In [55]:
new_region.dropna(axis=1, thresh=3)

2018
2019
2020
2021
2022


In [56]:
new_region['na'] = np.nan
new_region

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,3400.0,2400.0,2500.0,1500.0,
2019,,,,,
2020,,,,,
2021,,,,,
2022,6400.0,7500.0,5300.0,600.0,


In [57]:
new_region.dropna(axis=0, how='any')

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [58]:
new_region.dropna(axis=0, how='all')

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,3400.0,2400.0,2500.0,1500.0,
2022,6400.0,7500.0,5300.0,600.0,


In [59]:
new_region.loc['2017'] = [3400, 2400, 2500, 1500, np.nan]
new_region

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,3400.0,2400.0,2500.0,1500.0,
2019,,,,,
2020,,,,,
2021,,,,,
2022,6400.0,7500.0,5300.0,600.0,
2017,3400.0,2400.0,2500.0,1500.0,


In [60]:
new_region.duplicated()

years
2018     False
 2019    False
 2020     True
 2021     True
2022     False
2017      True
dtype: bool

In [61]:
new_region.sort_index()

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019,,,,,
2020,,,,,
2021,,,,,
2017,3400.0,2400.0,2500.0,1500.0,
2018,3400.0,2400.0,2500.0,1500.0,
2022,6400.0,7500.0,5300.0,600.0,


In [62]:
new_region.drop_duplicates()

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,3400.0,2400.0,2500.0,1500.0,
2019,,,,,
2022,6400.0,7500.0,5300.0,600.0,


In [63]:
new_region

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,3400.0,2400.0,2500.0,1500.0,
2019,,,,,
2020,,,,,
2021,,,,,
2022,6400.0,7500.0,5300.0,600.0,
2017,3400.0,2400.0,2500.0,1500.0,


In [64]:
new_region.West

years
2018     2400.0
 2019       NaN
 2020       NaN
 2021       NaN
2022     7500.0
2017     2400.0
Name: West, dtype: float64

In [65]:
new_region.loc['2022']

East     6400.0
West     7500.0
North    5300.0
South     600.0
na          NaN
Name: 2022, dtype: float64

In [66]:
new_region.iloc[2]

East    NaN
West    NaN
North   NaN
South   NaN
na      NaN
Name:  2020, dtype: float64

In [67]:
new_region

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,3400.0,2400.0,2500.0,1500.0,
2019,,,,,
2020,,,,,
2021,,,,,
2022,6400.0,7500.0,5300.0,600.0,
2017,3400.0,2400.0,2500.0,1500.0,


In [68]:
new_region.iloc[1,2]

nan

In [71]:
new_region.sort_index(inplace = True)

In [72]:
new_region

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019,,,,,
2020,,,,,
2021,,,,,
2017,3400.0,2400.0,2500.0,1500.0,
2018,3400.0,2400.0,2500.0,1500.0,
2022,6400.0,7500.0,5300.0,600.0,


In [73]:
new_region[new_region.North >= 4000]

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022,6400.0,7500.0,5300.0,600.0,


In [75]:
new_region['East'] + new_region['South']

years
 2019       NaN
 2020       NaN
 2021       NaN
2017     4900.0
2018     4900.0
2022     7000.0
dtype: float64

In [76]:
new_region['East'].add(new_region['South'], fill_value=0)

years
 2019       NaN
 2020       NaN
 2021       NaN
2017     4900.0
2018     4900.0
2022     7000.0
dtype: float64

In [78]:
new_region['Total'] = new_region.drop('Total', axis=1).sum(axis=1)

KeyError: "['Total'] not found in axis"

In [79]:
new_region.sort_index(ascending=0)

Unnamed: 0_level_0,East,West,North,South,na
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022,6400.0,7500.0,5300.0,600.0,
2018,3400.0,2400.0,2500.0,1500.0,
2017,3400.0,2400.0,2500.0,1500.0,
2021,,,,,
2020,,,,,
2019,,,,,


In [80]:
new_region.sort_values(by=['Total'], ascending=False)

KeyError: 'Total'

In [81]:
new_region['South'].rank(ascending=0)

years
 2019    NaN
 2020    NaN
 2021    NaN
2017     1.5
2018     1.5
2022     3.0
Name: South, dtype: float64

In [84]:
new_region.describe()

Unnamed: 0,East,West,North,South,na
count,3.0,3.0,3.0,3.0,0.0
mean,4400.0,4100.0,3433.333333,1200.0,
std,1732.050808,2944.486373,1616.580754,519.615242,
min,3400.0,2400.0,2500.0,600.0,
25%,3400.0,2400.0,2500.0,1050.0,
50%,3400.0,2400.0,2500.0,1500.0,
75%,4900.0,4950.0,3900.0,1500.0,
max,6400.0,7500.0,5300.0,1500.0,


In [86]:
new_region.mean()

East     4400.000000
West     4100.000000
North    3433.333333
South    1200.000000
na               NaN
dtype: float64

In [87]:
new_region.sum(axis=1)

years
 2019        0.0
 2020        0.0
 2021        0.0
2017      9800.0
2018      9800.0
2022     19800.0
dtype: float64

In [96]:
new_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,,,
2020,,,,
2021,,,,
