In [None]:
# Data
# 1. Tabular: i) RDBMS tables  ii) JSON documents (-> Python dictionaries)
# 2. Heterogeneous: columns may hold different types
# Pandas is built on top of NumPy

In [1]:
# Data sources
#  i) Time series (streaming data): network (HTTP/WebSocket/...), message brokers (Kafka/RabbitMQ/...)
# ii) DataFrame (static data): files (JSON/CSV/...), databases, ...

In [2]:
# Pandas
# 1. Series (time series): a one-dimensional array of values with an index

In [3]:
import pandas as pd
import numpy as np
# Print the library versions so the outputs below can be tied to a specific environment.
print("pandas version: ", pd.__version__)
print("numpy version: ", np.__version__)

pandas version:  1.1.5
numpy version:  1.19.5


In [12]:
# A Series built from a plain list gets a default integer index (0, 1, 2, ...).
ts1 = pd.Series(data=[8, 24, 16, 4, 42])

In [5]:
ts1

0     8
1    24
2    16
3     4
4    42
dtype: int64

In [6]:
ts1.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
ts1.values

array([ 8, 24, 16,  4, 42])

In [8]:
type(ts1.values)

numpy.ndarray

In [9]:
ts1[2]

16

In [10]:
# Integer slice: the stop value is excluded, so only entries 1 and 2 are returned.
ts1[1:3]

1    24
2    16
dtype: int64

In [13]:
# Same values as ts1, but addressed by explicit string labels instead of 0..4.
ts2 = pd.Series(data=[8, 24, 16, 4, 42], index=list('abcde'))

In [15]:
ts2.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [17]:
ts2

a     8
b    24
c    16
d     4
e    42
dtype: int64

In [16]:
ts2['b']

24

In [18]:
# Label slice: unlike an integer slice, the end label 'd' is included in the result.
ts2['a':'d']

a     8
b    24
c    16
d     4
dtype: int64

In [19]:
# Select several entries at once with a list; the result follows the order of the list.
ts1[[0,3,2]]

0     8
3     4
2    16
dtype: int64

In [21]:
# Keep only the entries greater than 16 (boolean mask), then cube them element-wise.
ts1[ts1.gt(16)].pow(3)

1    13824
4    74088
dtype: int64

In [22]:
# A NumPy function applied to a Series returns a Series that keeps the index labels.
np.sqrt(ts1[ts1>16])

1    4.898979
4    6.480741
dtype: float64

In [23]:
# For a plain Python list, `in` checks the stored values.
names = ['jack', 'kate', 'james']
'jack' in names

True

In [24]:
'ben' in names

False

In [25]:
# For a Series, `in` checks the index labels ('a'..'e' here), not the values.
'b' in ts2

True

In [27]:
'g' in ts2

False

In [28]:
ts2.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [29]:
# Three ways to create a Series:
#   1. from a list             -> default integer index
#   2. from a list plus labels -> explicit index
#   3. from a dict             -> keys become the index, values become the data
ts1 = pd.Series(data=[8, 24, 16, 4, 42])
ts2 = pd.Series(data=[8, 24, 16, 4, 42], index=list('abcde'))
population = {
    "turkey": 80_000_000,
    'france': 70_000_000,
    'italy': 60_000_000,
}
ts3 = pd.Series(data=population)

In [31]:
ts3

turkey    80000000
france    70000000
italy     60000000
dtype: int64

In [32]:
ts3.index

Index(['turkey', 'france', 'italy'], dtype='object')

In [33]:
ts3.values

array([80000000, 70000000, 60000000])

In [34]:
ts3['turkey']

80000000

In [35]:
"turkey" in ts3 

True

In [36]:
"japan" in ts3

False

In [None]:
# 2. DataFrame: tabular data, like a table in a relational database

In [37]:
# Column-oriented input: each key becomes a column, each list holds that column's rows.
world = dict(
    country=['turkey', 'france', 'italy'],
    population=[80_000_000, 70_000_000, 60_000_000],
    surface_area=[120_000_000, 130_000_000, 90_000_000],
    capital=['ankara', 'paris', 'rome'],
)
df1 = pd.DataFrame(data=world)

In [38]:
df1

Unnamed: 0,country,population,surface_area,capital
0,turkey,80000000,120000000,ankara
1,france,70000000,130000000,paris
2,italy,60000000,90000000,rome


In [39]:
df1.head(2)

Unnamed: 0,country,population,surface_area,capital
0,turkey,80000000,120000000,ankara
1,france,70000000,130000000,paris


In [40]:
df1.tail(1)

Unnamed: 0,country,population,surface_area,capital
2,italy,60000000,90000000,rome


In [43]:
# columns= picks which keys of `world` become columns, in the given order.
df2 = pd.DataFrame(world, columns=["country","population"])

In [44]:
df2

Unnamed: 0,country,population
0,turkey,80000000
1,france,70000000
2,italy,60000000


In [45]:
df2.columns

Index(['country', 'population'], dtype='object')

In [46]:
df2.index

RangeIndex(start=0, stop=3, step=1)

In [49]:
df1['population']

0    80000000
1    70000000
2    60000000
Name: population, dtype: int64

In [50]:
# Attribute-style access returns the same column as df1['population'].
df1.population

0    80000000
1    70000000
2    60000000
Name: population, dtype: int64

In [51]:
df1

Unnamed: 0,country,population,surface_area,capital
0,turkey,80000000,120000000,ankara
1,france,70000000,130000000,paris
2,italy,60000000,90000000,rome


In [57]:
# One continent per row label of df1 (0, 1, 2).
continents = pd.Series(
    data=['Asia', 'Europe', 'Europe'],
    index=[0, 1, 2],
)

In [58]:
# Add a new column from a Series whose index (0, 1, 2) matches df1's row labels.
df1['continent'] = continents

In [59]:
df1

Unnamed: 0,country,population,surface_area,capital,continent
0,turkey,80000000,120000000,ankara,Asia
1,france,70000000,130000000,paris,Europe
2,italy,60000000,90000000,rome,Europe


In [60]:
# How to remove a column from DataFrame
# del df1['continent']

In [61]:
# Boolean flag column: True for every row whose country is not 'turkey'.
df1['rich'] = df1['country'].ne('turkey')

In [62]:
df1

Unnamed: 0,country,population,surface_area,capital,continent,rich
0,turkey,80000000,120000000,ankara,Asia,False
1,france,70000000,130000000,paris,Europe,True
2,italy,60000000,90000000,rome,Europe,True


In [64]:
# Sample order book: one row per symbol with its unit price and quantity.
# NOTE(review): 'goggl' looks like a typo for 'googl' -- confirm before changing,
# since every recorded output below shows the current spelling.
orders = dict(
    symbol=['orcl', 'msft', 'ibm', 'goggl'],
    price=[123.34, 98.23, 64.93, 143.56],
    quantity=[120, 130, 90, 200],
)
df3 = pd.DataFrame(data=orders)

In [65]:
df3

Unnamed: 0,symbol,price,quantity
0,orcl,123.34,120
1,msft,98.23,130
2,ibm,64.93,90
3,goggl,143.56,200


In [66]:
# Vectorized, element-wise product: each row's price times that row's quantity.
df3['volume'] = df3['price'].mul(df3['quantity'])

In [67]:
df3

Unnamed: 0,symbol,price,quantity,volume
0,orcl,123.34,120,14800.8
1,msft,98.23,130,12769.9
2,ibm,64.93,90,5843.7
3,goggl,143.56,200,28712.0


In [68]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   symbol    4 non-null      object 
 1   price     4 non-null      float64
 2   quantity  4 non-null      int64  
 3   volume    4 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 256.0+ bytes


In [69]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   country       3 non-null      object
 1   population    3 non-null      int64 
 2   surface_area  3 non-null      int64 
 3   capital       3 non-null      object
 4   continent     3 non-null      object
 5   rich          3 non-null      bool  
dtypes: bool(1), int64(2), object(3)
memory usage: 251.0+ bytes


In [70]:
df1.describe()

Unnamed: 0,population,surface_area
count,3.0,3.0
mean,70000000.0,113333300.0
std,10000000.0,20816660.0
min,60000000.0,90000000.0
25%,65000000.0,105000000.0
50%,70000000.0,120000000.0
75%,75000000.0,125000000.0
max,80000000.0,130000000.0


In [71]:
df3.describe()

Unnamed: 0,price,quantity,volume
count,4.0,4.0,4.0
mean,107.515,135.0,15531.6
std,33.908534,46.547467,9587.133845
min,64.93,90.0,5843.7
25%,89.905,112.5,11038.35
50%,110.785,125.0,13785.35
75%,128.395,147.5,18278.6
max,143.56,200.0,28712.0


In [72]:
# Covariance matrix of the numeric columns only. Selecting them explicitly avoids
# relying on pandas silently dropping the text column 'symbol' (older versions do,
# newer versions raise instead), so the cell runs the same way on both.
df3.select_dtypes(include='number').cov()

Unnamed: 0,price,quantity,volume
price,1149.7887,1356.1,300574.7
quantity,1356.1,2166.666667,439150.7
volume,300574.738,439150.666667,91913140.0


In [73]:
df3

Unnamed: 0,symbol,price,quantity,volume
0,orcl,123.34,120,14800.8
1,msft,98.23,130,12769.9
2,ibm,64.93,90,5843.7
3,goggl,143.56,200,28712.0


In [76]:
# Transpose: rows become columns and columns become rows; df3 itself is unchanged.
df4 = df3.transpose()

In [77]:
df4

Unnamed: 0,0,1,2,3
symbol,orcl,msft,ibm,goggl
price,123.34,98.23,64.93,143.56
quantity,120,130,90,200
volume,14800.8,12769.9,5843.7,28712


In [75]:
df3

Unnamed: 0,symbol,price,quantity,volume
0,orcl,123.34,120,14800.8
1,msft,98.23,130,12769.9
2,ibm,64.93,90,5843.7
3,goggl,143.56,200,28712.0


In [79]:
type(df3['price'].values)

numpy.ndarray

In [80]:
'symbol' in df3.columns

True

In [81]:
3 in df3.index

True

In [82]:
df3.index

RangeIndex(start=0, stop=4, step=1)

In [83]:
10 in df3.index

False

In [85]:
# Series with gaps in its integer labels (only the even labels 0, 2, 4, 6 exist).
ts4 = pd.Series(
    data=['blue', 'purple', 'yellow', 'green'],
    index=list(range(0, 8, 2)),
)

In [86]:
ts4

0      blue
2    purple
4    yellow
6     green
dtype: object

In [89]:
# Reindex onto labels 0..6; the labels missing from ts4 (1, 3, 5) take the value
# of the closest preceding label (forward fill).
ts4.reindex([0,1,2,3,4,5,6], method='ffill') # forward fill

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
6     green
dtype: object

In [90]:
# 3x3 frame holding 0..8, with row labels 'a', 'c', 'd' (note: no 'b').
df5 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['turkey', 'france', 'italy'],
)

In [91]:
df5

Unnamed: 0,turkey,france,italy
a,0,1,2
c,3,4,5
d,6,7,8


In [92]:
# Reindex the rows: the new label 'b' has no data, so its row is filled with NaN
# and the integer columns are shown as floats.
df6 = df5.reindex(index=['a', 'b', 'c', 'd'])

In [93]:
df6

Unnamed: 0,turkey,france,italy
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [94]:
# Missing entries are shown as NaN; float('nan') and np.nan are both a float NaN.
# The expression is evaluated so the cell actually produces the `nan` output.
np.nan

nan

In [96]:
# Reindex the columns: 'france' is left out and the unknown column 'japan' is all NaN.
df7 = df6.reindex(columns=['turkey', 'italy', 'japan'])

In [97]:
df7

Unnamed: 0,turkey,italy,japan
a,0.0,2.0,
b,,,
c,3.0,5.0,
d,6.0,8.0,


In [101]:
# Remove the 'france' column without mutating in place. errors='ignore' keeps the
# cell re-runnable: a second run would otherwise raise KeyError because the column
# is already gone.
df6 = df6.drop(columns='france', errors='ignore')

In [102]:
df6

Unnamed: 0,turkey,italy
a,0.0,2.0
b,,
c,3.0,5.0
d,6.0,8.0
