# 웹 프로젝트 + 빅데이터  

## numpy + pandas
- 처리, 분석  

## Series + DataFrame  
- numpy : ndarray  
- Series : Row, Column, 1차원 자료구조  
- DataFrame : 2차원 표

In [3]:
import numpy as np
import pandas as pd

dic = {'red':[1,2,3], 'blue':[4,5,6], 'black':[7,8,9]}

df = pd.DataFrame(dic)
df.index = ['one', 'two', 'three']
df

Unnamed: 0,red,blue,black
one,1,4,7
two,2,5,8
three,3,6,9


---

## dict 안에 dict를 정의했을 때 DF 형태

In [7]:
nestdict = {'red':{2012: 22, 2013: 33},
            'white':{2011: 13, 2012: 22, 2013: 16},
            'blue': {2011: 17, 2012: 27, 2013: 18}}
frame2 = pd.DataFrame(nestdict)
frame2.T

Unnamed: 0,2012,2013,2011
red,22.0,33.0,
white,22.0,16.0,13.0
blue,27.0,18.0,17.0


In [8]:
frame2

Unnamed: 0,red,white,blue
2012,22.0,22,27
2013,33.0,16,18
2011,,13,17


In [9]:
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [10]:
ser.index

Index(['red', 'blue', 'yellow', 'white', 'green'], dtype='object')

In [11]:
ser.values

array([5, 0, 3, 8, 4], dtype=int64)

## Series 내 최소값의 index

In [12]:
ser.idxmin()

'blue'

## Series에서 Label은 key가 아니기 때문에 중복될 수 있음

## index에 중복이 있는지 검사

### serd.index.is_unique  
  ㄴ False  
### frame.index.is_unique  
  ㄴTrue

---

### index들이 중복된 것, 중복되지 않는 것이 동시에 존재할 때 reindex  
- 중복된 index는 값을 그대로 가져옴, 지정하지 않은 index는 값을 지움

In [15]:
ser = pd.Series([2,5,7,4], index=['one','two','three','four'])
ser

one      2
two      5
three    7
four     4
dtype: int64

In [17]:
ser.reindex(['three','four','five','one'])

three    7.0
four     4.0
five     NaN
one      2.0
dtype: float64

---

## method = 'ffill' : 없는 index에 이전 값으로 채워줌  
## method = 'bfill' : 없는 index에 뒤의 값으로 채워줌

In [19]:
ser3 = pd.Series([1,5,6,3],index=[0,3,5,6])
ser3

0    1
3    5
5    6
6    3
dtype: int64

In [21]:
ser3.reindex(range(6))

0    1.0
1    NaN
2    NaN
3    5.0
4    NaN
5    6.0
dtype: float64

In [20]:
ser3.reindex(range(6),method='ffill')

0    1
1    1
2    1
3    5
4    5
5    6
dtype: int64

In [22]:
ser3.reindex(range(6),method='bfill')

0    1
1    5
2    5
3    5
4    6
5    6
dtype: int64

In [23]:
ser = pd.Series(np.arange(4.), index=['red','blue','yellow','white'])
ser

red       0.0
blue      1.0
yellow    2.0
white     3.0
dtype: float64

In [24]:
ser.drop('yellow')

red      0.0
blue     1.0
white    3.0
dtype: float64

In [25]:
ser

red       0.0
blue      1.0
yellow    2.0
white     3.0
dtype: float64

In [26]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
index=['red','blue','yellow','white'],
columns=['ball','pen','pencil','paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [27]:
frame.drop(['blue','yellow'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
white,12,13,14,15


In [28]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [34]:
# frame.drop('paper', axis=1)
del frame['paper']
frame

Unnamed: 0,ball,pen,pencil
red,0,1,2
blue,4,5,6
yellow,8,9,10
white,12,13,14


In [36]:
s1 = pd.Series([3,2,5,1],['white','yellow','green','blue'])
s2 = pd.Series([1,4,7,2,1],['white','yellow','black','blue','brown'])
s1

white     3
yellow    2
green     5
blue      1
dtype: int64

In [37]:
s2

white     1
yellow    4
black     7
blue      2
brown     1
dtype: int64

In [38]:
s1+s2

black     NaN
blue      3.0
brown     NaN
green     NaN
white     4.0
yellow    6.0
dtype: float64

---

## Series와 DF 간의 연산도 가능

In [39]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
index=['red','blue','yellow','white'],
columns=['ball','pen','pencil','paper'])

frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [40]:
ser = pd.Series(np.arange(4), index=['ball','pen','pencil','paper'])

ser

ball      0
pen       1
pencil    2
paper     3
dtype: int32

In [41]:
frame - ser

Unnamed: 0,ball,pen,pencil,paper
red,0,0,0,0
blue,4,4,4,4
yellow,8,8,8,8
white,12,12,12,12


## sqrt(제곱근) 는 자료구조의 원소단위로 적용됨

In [42]:
np.sqrt(frame)

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,1.414214,1.732051
blue,2.0,2.236068,2.44949,2.645751
yellow,2.828427,3.0,3.162278,3.316625
white,3.464102,3.605551,3.741657,3.872983


In [43]:
frame.apply(lambda x: x.max() - x.min())

ball      12
pen       12
pencil    12
paper     12
dtype: int64

In [44]:
def f(x):   # 함수가 숫자 하나만 리턴할 필요는 없음
    return pd.Series([x.min(), x.max()], index=['min','max'])

In [47]:
frame.apply(f)    # 한개의 컬럼당 한개의 시리즈가 나오므로, 그걸 합친게 df
                  # 따라서 df 형식으로 리턴됨

Unnamed: 0,ball,pen,pencil,paper
min,0,1,2,3
max,12,13,14,15


In [52]:
def f2(x):
    return pd.Series([x.mean(), x.std()], index=['mean', 'std'])

In [54]:
frame.apply(lambda s: pd.Series([s.mean(), s.std()], index=['mean','std']), axis=1)

Unnamed: 0,mean,std
red,1.5,1.290994
blue,5.5,1.290994
yellow,9.5,1.290994
white,13.5,1.290994


---

# 통계 함수

### 컬럼들의 합계

In [63]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [56]:
frame.sum()

ball      24
pen       28
pencil    32
paper     36
dtype: int64

In [62]:
frame.sum(axis=1)

red        6
blue      22
yellow    38
white     54
dtype: int64

In [64]:
frame.describe()

Unnamed: 0,ball,pen,pencil,paper
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


In [67]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [66]:
frame.sort_index()

Unnamed: 0,ball,pen,pencil,paper
blue,4,5,6,7
red,0,1,2,3
white,12,13,14,15
yellow,8,9,10,11


## 'pen'의 값을 가지고 솔팅

In [69]:
frame.sort_values(by=('pen'))

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [70]:
frame.sort_values(by=['pen','pencil'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [75]:
ran = pd.Series(np.random.randint(1,20,10))
ran

0     7
1     4
2     6
3    13
4     9
5    14
6    16
7    17
8    17
9     6
dtype: int32

## ascending=False : 내림차순

In [78]:
ran.rank(ascending=False)

0     7.0
1    10.0
2     8.5
3     5.0
4     6.0
5     4.0
6     3.0
7     1.5
8     1.5
9     8.5
dtype: float64

# 상관계수 : -1 ~ 0 ~ 1
- 양의 상관계수(0~1) : 서로 상관관계가 존재하며 그 크기는 1, 0  
                       상관관계가 존재하는지, 그 크기는 얼마인지 확인 가능
- 음의 상관계수(-1~0) :  
- 공분산 : 

In [79]:
seq2 = pd.Series([3,4,3,4,5,4,3,2],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq = pd.Series([1,2,3,4,4,3,2,1],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq.corr(seq2)
# seq.cov(seq2)

0.7745966692414835

In [80]:
frame2 = pd.DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]],
index=['red','blue','yellow','white'],
columns=['ball','pen','pencil','paper'])

frame2

Unnamed: 0,ball,pen,pencil,paper
red,1,4,3,6
blue,4,5,6,1
yellow,3,3,1,5
white,4,1,6,4


In [86]:
ser = pd.Series([0,1,2,3,9],
index=['red','blue','yellow','white','green'])

ser

red       0
blue      1
yellow    2
white     3
green     9
dtype: int64

### 상관관계는 두 집단간의 관계인데 아래는 어떤 집단과의 상관관계일까

In [81]:
frame2.corr()   # corr()

Unnamed: 0,ball,pen,pencil,paper
ball,1.0,-0.276026,0.57735,-0.763763
pen,-0.276026,1.0,-0.079682,-0.361403
pencil,0.57735,-0.079682,1.0,-0.692935
paper,-0.763763,-0.361403,-0.692935,1.0


In [83]:
frame2.cov()

Unnamed: 0,ball,pen,pencil,paper
ball,2.0,-0.666667,2.0,-2.333333
pen,-0.666667,2.916667,-0.333333,-1.333333
pencil,2.0,-0.333333,6.0,-3.666667
paper,-2.333333,-1.333333,-3.666667,4.666667


### 서로 다른 두 집단간의 상관관계

In [87]:
frame2.corrwith(ser)

ball      0.730297
pen      -0.831522
pencil    0.210819
paper    -0.119523
dtype: float64

In [88]:
frame2.corrwith(frame)

ball      0.730297
pen      -0.831522
pencil    0.210819
paper    -0.119523
dtype: float64

In [89]:
ser = pd.Series([0,1,2,np.NaN,9],
index=['red','blue','yellow','white','green'])

ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

In [90]:
ser['white'] = None

ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

## NaN 값 필터링

In [None]:
>>> ser.dropna()
>>> ser[ser.notnull()]   # True인 것만 가져오므로 False는 빠짐

In [91]:
ser.dropna()   # Signature: ser.dropna(axis=0, inplace=False, how=None)

red       0.0
blue      1.0
yellow    2.0
green     9.0
dtype: float64

In [92]:
ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

In [93]:
frame3 = pd.DataFrame([[6,np.nan,6],[np.nan,np.nan,np.nan],[2,np.nan,5]],
index = ['blue','green','red'],
columns = ['ball','mug','pen'])

frame3

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
green,,,
red,2.0,,5.0


In [94]:
frame3.dropna()

Unnamed: 0,ball,mug,pen


## NaN 에 0을 집어넣는 것

In [95]:
frame3.fillna(0)

Unnamed: 0,ball,mug,pen
blue,6.0,0.0,6.0
green,0.0,0.0,0.0
red,2.0,0.0,5.0


In [96]:
mser = pd.Series(np.random.rand(8),
index=[['white','white','white','blue','blue','red','red','red'],
        ['up','down','right','up','down','up','down','left']])

mser   # list를 가지고 2차원으로 표현한 것

white  up       0.412810
       down     0.310874
       right    0.621820
blue   up       0.968139
       down     0.083589
red    up       0.089414
       down     0.371283
       left     0.581391
dtype: float64

In [100]:
frame4 = pd.DataFrame(mser)

frame4

Unnamed: 0,Unnamed: 1,0
white,up,0.41281
white,down,0.310874
white,right,0.62182
blue,up,0.968139
blue,down,0.083589
red,up,0.089414
red,down,0.371283
red,left,0.581391


In [115]:
mser2 = pd.Series(np.random.rand(8),
index=[['A','A','B','B','B','C','C','C'],
        ['1','2','3','4','5','6','7','8']])

mser2   # list를 가지고 2차원으로 표현한 것
# mser2.columns = ['value']

A  1    0.069407
   2    0.461899
B  3    0.722559
   4    0.504170
   5    0.663376
C  6    0.579803
   7    0.899834
   8    0.421205
dtype: float64

In [116]:
mser2['A','2']

0.4618987453796397

In [119]:
mdf = pd.DataFrame(mser2)

mdf

Unnamed: 0,Unnamed: 1,0
A,1,0.069407
A,2,0.461899
B,3,0.722559
B,4,0.50417
B,5,0.663376
C,6,0.579803
C,7,0.899834
C,8,0.421205


In [120]:
mdf.stack()

A  1  0    0.069407
   2  0    0.461899
B  3  0    0.722559
   4  0    0.504170
   5  0    0.663376
C  6  0    0.579803
   7  0    0.899834
   8  0    0.421205
dtype: float64

In [121]:
mdf.unstack()

Unnamed: 0_level_0,0,0,0,0,0,0,0,0
Unnamed: 0_level_1,1,2,3,4,5,6,7,8
A,0.069407,0.461899,,,,,,
B,,,0.722559,0.50417,0.663376,,,
C,,,,,,0.579803,0.899834,0.421205


In [122]:
mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),
index=[['white','white','red','red'], ['up','down','up','down']],
columns=[['pen','pen','paper','paper'],[1,2,1,2]])

mframe

Unnamed: 0_level_0,Unnamed: 1_level_0,pen,pen,paper,paper
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,1,2
white,up,-0.962733,0.433513,-0.004481,-0.938615
white,down,-0.158383,0.046841,0.527477,1.535149
red,up,1.123148,-0.23895,0.238925,-1.146422
red,down,-1.53033,-0.356533,-0.868015,0.005376


In [123]:
mframe.columns.names = ['objects', 'id']
mframe.index.names = ['colors', 'status']

mframe

Unnamed: 0_level_0,objects,pen,pen,paper,paper
Unnamed: 0_level_1,id,1,2,1,2
colors,status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
white,up,-0.962733,0.433513,-0.004481,-0.938615
white,down,-0.158383,0.046841,0.527477,1.535149
red,up,1.123148,-0.23895,0.238925,-1.146422
red,down,-1.53033,-0.356533,-0.868015,0.005376


In [124]:
mframe.swaplevel('colors', 'status')

Unnamed: 0_level_0,objects,pen,pen,paper,paper
Unnamed: 0_level_1,id,1,2,1,2
status,colors,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
up,white,-0.962733,0.433513,-0.004481,-0.938615
down,white,-0.158383,0.046841,0.527477,1.535149
up,red,1.123148,-0.23895,0.238925,-1.146422
down,red,-1.53033,-0.356533,-0.868015,0.005376


In [125]:
mframe.sum(level='colors')

  mframe.sum(level='colors')


objects,pen,pen,paper,paper
id,1,2,1,2
colors,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
white,-1.121116,0.480354,0.522996,0.596535
red,-0.407182,-0.595484,-0.62909,-1.141046


In [126]:
mframe.sum(level='id', axis=1)

  mframe.sum(level='id', axis=1)


Unnamed: 0_level_0,id,1,2
colors,status,Unnamed: 2_level_1,Unnamed: 3_level_1
white,up,-0.967214,-0.505102
white,down,0.369094,1.581991
red,up,1.362073,-1.385372
red,down,-2.398345,-0.351158


# CHAPTER 5
# pandas: Reading and Writing Data

In [None]:
pd.read_table('ch05_01.csv',sep=',')

In [None]:
pd.read_csv('ch05_02.csv')  # 컬럼이 없으면 없다고 선언해줘야 함
                            # 없으면 첫째 행 데이터가 자동으로 컬럼으로 불러옴

In [None]:
pd.read_csv('ch05_02.csv', header=None)

### 컬럼명을 지정

In [None]:
pd.read_csv('ch05_02.csv', names=['white','red','blue','green','animal'])

In [128]:
df = pd.DataFrame(np.random.random((8,3)),
            index=[['A','A','B','B','B','C','C','C'],
                  [1,2,3,4,5,6,7,8]],
            columns=['v1','v2','v3'])
df

Unnamed: 0,Unnamed: 1,v1,v2,v3
A,1,0.116123,0.948583,0.271323
A,2,0.002953,0.57014,0.857224
B,3,0.734984,0.703688,0.414484
B,4,0.915859,0.722659,0.627775
B,5,0.254289,0.677008,0.296131
C,6,0.464438,0.011452,0.056978
C,7,0.900194,0.182992,0.13935
C,8,0.617988,0.150234,0.784215


In [143]:
df.index.names = ['Team', 'Times']

df

Unnamed: 0_level_0,Unnamed: 1_level_0,v1,v2,v3
Team,Times,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,1,0.116123,0.948583,0.271323
A,2,0.002953,0.57014,0.857224
B,3,0.734984,0.703688,0.414484
B,4,0.915859,0.722659,0.627775
B,5,0.254289,0.677008,0.296131
C,6,0.464438,0.011452,0.056978
C,7,0.900194,0.182992,0.13935
C,8,0.617988,0.150234,0.784215


## CSV 파일로 저장

In [144]:
df.to_csv('double_idx_test.csv')
print('double_idx_test.csv 파일저장 성공')

double_idx_test.csv 파일저장 성공


In [147]:
pd.read_csv('double_idx_test.csv', index_col=[0,1,2,3,4])

Team,Times,v1,v2,v3
A,1,0.116123,0.948583,0.271323
A,2,0.002953,0.57014,0.857224
B,3,0.734984,0.703688,0.414484
B,4,0.915859,0.722659,0.627775
B,5,0.254289,0.677008,0.296131
C,6,0.464438,0.011452,0.056978
C,7,0.900194,0.182992,0.13935
C,8,0.617988,0.150234,0.784215


# Regular Expression

- . : Single character, except newline
- \d : Digit
- \D : Non-digit character
- \s : Whitespace character
- \S : Non-whitespace character
- \n : New line character
- \t : Tab character
- \uxxxx : Unicode character specified by the hexadecimal number xxxx

In [149]:
pd.read_table('double_idx_test.csv',sep='\s+', engine='python')   # + : 하나 이상이다

Unnamed: 0,"Team,Times,v1,v2,v3"
0,"A,1,0.11612275516129655,0.9485825250898812,0.2..."
1,"A,2,0.0029529275999804483,0.570140421954832,0...."
2,"B,3,0.7349835767226052,0.7036878896224618,0.41..."
3,"B,4,0.9158585628235137,0.7226592621715626,0.62..."
4,"B,5,0.2542890334964031,0.6770082825159356,0.29..."
5,"C,6,0.46443840191784547,0.011451617865440467,0..."
6,"C,7,0.900193650603526,0.18299173022455828,0.13..."
7,"C,8,0.6179883452850797,0.1502341690026674,0.78..."


In [152]:
df = pd.DataFrame(np.arange(16).reshape(4,-1))  # 이렇게만 해도 컬럼, 인덱스가 자동으로 붙음

df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [153]:
df.to_csv('simple_df.csv')

In [155]:
pd.read_csv('simple_df.csv', index_col=[0])

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [156]:
df.to_csv('noindex.csv', index = False)

In [157]:
pd.read_csv('noindex.csv')

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


---
### 특정 행은 건너뛰기

In [None]:
pd.read_table('ch05_06.txt',sep=',',skiprows=[0,1,3,6])

In [None]:
pd.read_csv('ch05_02.csv',skiprows=[2],nrows=3,header=None)

# rows=3 : 읽어올 행 수
# skiprows=[2] : 스킵할 행

> out = pd.Series()  
> i = 0  
> pieces = pd.read_csv('ch05_01.csv',chunksize=3)   # 3개씩 쪼개서 데이터프레임이 됨  
> for piece in pieces:  
... out.set_value(i,piece['white'].sum())   # 해당 라인은 이 교제 코드 중 돌아가지 않는 코드  
... i = i + 1

In [176]:
pd.read_csv('ch05_01.csv')

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [177]:
out = pd.Series([-1])
i = 0
pieces = pd.read_csv('ch05_01.csv', chunksize=3)
for piece in pieces:
    out[i] = piece['white'].sum()   # 해당 라인은 이 교제 코드 중 돌아가지 않는 코드
    i = i + 1
    
# for piece in pieces:
#     out = pd.concat([out.pd.Series(piece['white'].sum())], ignore_index=True)

out

0    6
1    6
dtype: int64

# Writing Data in HTML

In [178]:
frame = pd.DataFrame(np.arange(4).reshape(2,2))

print(frame.to_html())

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
      <th>1</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0</td>
      <td>1</td>
    </tr>
    <tr>
      <th>1</th>
      <td>2</td>
      <td>3</td>
    </tr>
  </tbody>
</table>


In [None]:
>>> s = ['<HTML>']
>>> s.append('<HEAD><TITLE>My DataFrame</TITLE></HEAD>')
>>> s.append('<BODY>')
>>> s.append(frame.to_html())
>>> s.append('</BODY></HTML>')
>>> html = ".join(s) : 문자열을 문자열로 만드는 것

In [181]:
frame = pd.DataFrame(np.random.random((4,4)),
                    index = ['white','black','red','blue'],
                    columns = ['up','down','right','left'])

frame

Unnamed: 0,up,down,right,left
white,0.464835,0.629464,0.802692,0.717255
black,0.82477,0.956418,0.829987,0.733803
red,0.727412,0.14046,0.640644,0.56376
blue,0.910057,0.909608,0.925293,0.899405


In [184]:
s = []
s.append('<html>')
s.append('<head><title>DF to HTML</title></head>')
s.append('<body>')
s.append(frame.to_html())
s.append('</body>')
s.append('</html>')
html = ''.join(s)    # 원소 하나하나를  ''로 연결해서 문자열로 만드는 것

print(html)

<html><head><title>DF to HTML</title></head><body><table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>up</th>
      <th>down</th>
      <th>right</th>
      <th>left</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>white</th>
      <td>0.464835</td>
      <td>0.629464</td>
      <td>0.802692</td>
      <td>0.717255</td>
    </tr>
    <tr>
      <th>black</th>
      <td>0.824770</td>
      <td>0.956418</td>
      <td>0.829987</td>
      <td>0.733803</td>
    </tr>
    <tr>
      <th>red</th>
      <td>0.727412</td>
      <td>0.140460</td>
      <td>0.640644</td>
      <td>0.563760</td>
    </tr>
    <tr>
      <th>blue</th>
      <td>0.910057</td>
      <td>0.909608</td>
      <td>0.925293</td>
      <td>0.899405</td>
    </tr>
  </tbody>
</table></body></html>


In [185]:
with open('df_to_html.html', 'w', encoding='UTF-8') as fout:
    fout.write(html)
    
print('파일에 html 쓰기 성공')

파일에 html 쓰기 성공


In [186]:
web_frames = pd.read_html('df_to_html.html')
web_frames[0]   # pd.read_html은 배열로 리턴됨, Unnamed:0의 0이 아님

Unnamed: 0.1,Unnamed: 0,up,down,right,left
0,white,0.464835,0.629464,0.802692,0.717255
1,black,0.82477,0.956418,0.829987,0.733803
2,red,0.727412,0.14046,0.640644,0.56376
3,blue,0.910057,0.909608,0.925293,0.899405
