# <font color=blue>빅데이터 분석 Cheat Sheet - 100가지 핵심기법</font>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

### 데이터 로딩

In [11]:
df = pd.read_csv('titanic.csv')

In [4]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [6]:
df.drop(df.index, inplace=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [12]:
# Keep an independent working copy of the full Titanic table.
# `df[:]` only slices the original frame; later cells add and overwrite columns
# on `df_titanic`, so take an explicit copy that cannot alias `df`.
df_titanic = df.copy()
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [13]:
# Reload only the columns needed for the data mart, using the passenger name as
# the row index. The index column is named by label rather than by position so
# the call keeps working if the `usecols` list is edited.
df = pd.read_csv('titanic.csv', index_col='Name',
                 usecols=['PassengerId','Name','Sex','Age',
                          'Fare','Survived','Pclass'])
df

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,Fare
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,7.2500
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,71.2833
"Heikkinen, Miss. Laina",3,1,3,female,26.0,7.9250
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,53.1000
"Allen, Mr. William Henry",5,0,3,male,35.0,8.0500
...,...,...,...,...,...,...
"Montvila, Rev. Juozas",887,0,2,male,27.0,13.0000
"Graham, Miss. Margaret Edith",888,1,1,female,19.0,30.0000
"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,23.4500
"Behr, Mr. Karl Howell",890,1,1,male,26.0,30.0000


In [None]:
df.to_csv('titanic_datamart.csv')

In [None]:
df = pd.read_excel('titanic_excel.xlsx', sheet_name='타이타닉 샘플', header=2, index_col=0)
df

In [None]:
df.to_excel('titanic_excel_오늘.xlsx', sheet_name='지금 막 만든 따끈따끈 한 탭')

In [None]:
lst_html = pd.read_html('https://finance.naver.com', encoding='euc-kr') # euckr, cp949, utf-8

In [None]:
lst_html[3]

In [None]:
# Print every table parsed from the page, each followed by a divider line.
for table in lst_html:
    print(table)
    print('-'*80)

### 시리즈 (Series)

In [None]:
s = pd.Series({'첫번째':'홍길동', '두번째':'성춘향', '세번째':'이몽룡'})
s

In [None]:
s = pd.Series( [34,17,16], index=['첫번째','두번째','세번째'] )
s

In [None]:
s.index

In [None]:
s.values

In [None]:
s.dtypes

### 데이터프레임 (DataFrame)

In [None]:
df = pd.DataFrame( {'이름':['홍길동','성춘향','이몽룡'], 
                    '성별':['남','여','남'], 
                    '나이':[34,17,16]})
df

In [25]:
df = pd.DataFrame([['홍길동','남',34],
                   ['성춘향','여',17],
                   ['이몽룡','남',16]], 
                  columns=['이름','성별','나이'], 
                  index=['첫번째','두번째','세번째'])
df_chosun = df.copy()
df_chosun

Unnamed: 0,이름,성별,나이
첫번째,홍길동,남,34
두번째,성춘향,여,17
세번째,이몽룡,남,16


In [None]:
df.index

In [None]:
df.values

In [None]:
df.dtypes

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.count()

In [None]:
df.head(2)

In [None]:
df.tail(3)

### 데이터프레임 인덱서(Indexer) loc[ ] 와 iloc[ ] 을 활용한 데이터 추출

In [None]:
df_titanic.head(15)

In [None]:
df_titanic[0:10:2]

In [None]:
df_titanic.loc[[0,2,4,6,8],]

In [None]:
df_titanic_sub = df_titanic.loc[[0,2,4,6,8]]
df_titanic_sub

In [None]:
df_titanic.query('index in @df_titanic_sub.index').head(10)

### 데이터프레임 인덱서 loc[ ], iloc[ ] 를 활용한 행조건, 열조건 추출

In [None]:
df_titanic.loc[   [1,3,5,7,9]   ,   ['Name','Sex']   ]

In [None]:
df_titanic.loc[ :, ['Name','Sex']]

In [None]:
df_titanic.loc[0:10, ['Name','Sex']]

In [None]:
df_titanic.iloc[0:5, [3,4]] # iloc 은 행,열 모두 인덱스로만 접근함

In [None]:
df_titanic.iloc[0:5,3:5]

In [None]:
df_titanic['Name']

In [None]:
df_titanic['Name'].to_frame()

In [None]:
df_titanic[['Name']]

In [None]:
df_titanic[['Name','Sex']]

### 인덱싱

In [None]:
df_chosun

In [None]:
df_chosun.set_index('이름')
df_chosun

In [None]:
df_chosun.rename({'첫번째':'첫째', '세번째':'세째'})

In [None]:
df_chosun

In [None]:
df_chosun.rename({'첫번째':'첫째', '두번째':'둘째', '세번째':'세째'}, inplace=True)
df_chosun

In [None]:
df_chosun.rename({'이름':'성명', '나이':'연령'}, axis=1, inplace=True)
df_chosun

In [None]:
df_chosun.loc['첫째']

In [None]:
df_chosun

In [None]:
df_chosun.reset_index()

In [None]:
df_chosun.reset_index(drop=True, inplace=True)
df_chosun

### 결측치 확인 및 처리

In [None]:
df_titanic.isna().sum()

In [None]:
df_titanic.iloc[0:5]['Cabin'].isna().sum()

In [None]:
df_titanic

In [None]:
df_titanic.dropna()

In [None]:
df_titanic['Age'].isna().sum()

In [None]:
df_titanic.dropna(how='all', axis=0, subset=['Age','Cabin'])

In [None]:
# Turn literal 'nan' strings into real missing values.
# Assign the result back: an in-place call on `df_titanic['Age']` acts on a
# temporary selection and is not guaranteed to update `df_titanic`
# (it never does under pandas Copy-on-Write).
df_titanic['Age'] = df_titanic['Age'].replace('nan', np.nan)

In [None]:
df_titanic['Age'].isnull().sum()

In [None]:
df_titanic['Age'].fillna(round(df_titanic['Age'].mean()))

In [None]:
df_titanic['Cabin'].isnull().sum()

In [None]:
df_titanic['Cabin']

In [None]:
# Forward-fill: each missing Cabin takes the last non-missing value above it.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df_titanic['Cabin'] = df_titanic['Cabin'].ffill()

In [None]:
df_titanic['Cabin'].isnull().sum()

In [None]:
# Backward-fill: each remaining missing Cabin (e.g. leading rows that forward-fill
# could not reach) takes the next non-missing value below it.
# `.bfill()` replaces the deprecated `fillna(method='bfill')` spelling.
df_titanic['Cabin'] = df_titanic['Cabin'].bfill()

In [None]:
df_titanic['Cabin'].isnull().sum()

### 데이터 타입 변경

In [None]:
df_titanic['Age']

In [None]:
df_titanic['Age'] = df_titanic['Age'].fillna(-1)

In [None]:
df_titanic['Age'] = df_titanic['Age'].astype(int)

In [None]:
df = pd.read_csv('temperature.csv', encoding='cp949')

In [None]:
df

In [None]:
df.info()

In [None]:
df['일시'].head()

In [None]:
# pd.to_datetime(df['일시'], format="%Y-%m-%d")

In [None]:
df['일시'] = pd.to_datetime(df['일시'], format="%Y-%m-%d")
df['일시'].head()

In [None]:
df['일시'] = df['일시'].dt.strftime('%Y년 %m월 %d일')
df['일시'].head()

In [None]:
df['일시'] = pd.to_datetime(df['일시'], format="%Y년 %m월 %d일")
df['일시']

### 날짜 핸들링

In [None]:
pd.Timestamp

In [None]:
pd.Timestamp.now()

In [None]:
pd.Timestamp.now() + pd.Timedelta(days=365)

In [None]:
df['일시']

In [None]:
df['일시'] + pd.Timedelta(days=3650*2)

In [None]:
from pandas.tseries.offsets import DateOffset

In [None]:
df['일시'] + DateOffset(years=2, months=5, days=30)

In [None]:
pd.DataFrame({'날짜':pd.date_range(start='2024-01-01', periods=100, freq='D')}) #M, MS, A, AS

In [None]:
pd.DataFrame(pd.date_range(start='2024-01-01', periods=365, freq='D'), columns=['날짜']) #M, MS, A, AS

In [None]:
pd.DataFrame(pd.date_range(start='2024-01-01', end='2024-12-31', freq='D'), columns=['날짜']) #M, MS, A, AS

In [None]:
df.head(10)

In [None]:
df['3일전'] = df['평균기온(℃)'].shift(3)
df

In [None]:
df['3일전'] - df['평균기온(℃)']

In [None]:
df['7일평균'] = df['평균기온(℃)'].rolling(7).mean()
df.tail(10)

In [None]:
for i in range(1844,1848) :
    print (df.iloc[i:i+7]['평균기온(℃)'].sum()/7)

In [None]:
df['평균기온(℃)'] - df['7일평균']

### 정렬

In [None]:
df_titanic.sort_values(['Age','Survived'], ascending=[False,True])

In [None]:
df_titanic.sort_values(['Survived','Pclass'], ascending=[True,False]).reset_index()

### 인덱서(Indexer) 추출 vs. 쿼리(Query) 추출

In [None]:
df_titanic['Pclass']==1

In [None]:
df_titanic[df_titanic['Pclass']==1]

In [None]:
df_titanic.query('Pclass==1')

In [None]:
df_titanic[(df_titanic['Pclass']==1) & (df_titanic['Survived']==1)]

In [None]:
df_titanic.query('Pclass==1 and Survived==1')

In [None]:
df_titanic.query('PassengerId in [10,20,30]')

In [None]:
df_titanic.query('PassengerId.isin([10,20,30])')

In [None]:
df_titanic[ (df_titanic['Age']>=20) 
           & (df_titanic['Age']<30) 
           & (df_titanic['Sex']=='female')]

In [None]:
df_titanic.query('Age >= 20 and Age < 30 and Sex=="female"')

### 조건 함수 필터링 (Conditional Function Filtering)

In [14]:
def chkAge(x):
    """Return True when the age ``x`` is below 20, otherwise False.

    A NaN age compares as not-less-than 20, so missing ages yield False.
    The comparison result is wrapped in ``bool`` so a plain Python bool is
    returned whatever numeric type ``x`` has.
    """
    return bool(x < 20)

In [15]:
df_titanic[df_titanic['Age'].apply(chkAge) & 
           df_titanic['Sex'].apply(lambda x : True if x == 'female' else False)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
22,23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
852,853,0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,,C
853,854,1,1,"Lines, Miss. Mary Conover",female,16.0,0,1,PC 17592,39.4000,D28,S
855,856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.3500,,S
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C


In [16]:
def grade(x):
    """Return 1 if the row ``x`` has ``Pclass == 1`` and ``SibSp == 1``, else 0.

    ``x`` is anything indexable by the keys ``'Pclass'`` and ``'SibSp'``
    (the notebook passes DataFrame rows via ``apply(..., axis=1)``).
    """
    first_class_with_one_sibsp = x['Pclass'] == 1 and x['SibSp'] == 1
    return 1 if first_class_with_one_sibsp else 0

In [17]:
df_titanic['SocialLevel'] = df_titanic.apply(grade, axis=1)
df_titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SocialLevel
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


In [18]:
df_titanic['SocialLevel'] = df_titanic.apply(lambda x : 1 if x['Pclass'] == 1 and 
                                                              x['SibSp']  == 1 else 0, axis=1)
df_titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SocialLevel
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


### 문자열 조작

In [19]:
df_titanic['Sex'].map({'male':'남', 'female':'여'})

0      남
1      여
2      여
3      여
4      남
      ..
886    남
887    여
888    여
889    남
890    남
Name: Sex, Length: 891, dtype: object

In [20]:
df_titanic['Name'].to_frame()

Unnamed: 0,Name
0,"Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"
...,...
886,"Montvila, Rev. Juozas"
887,"Graham, Miss. Margaret Edith"
888,"Johnston, Miss. Catherine Helen ""Carrie"""
889,"Behr, Mr. Karl Howell"


In [22]:
pd.DataFrame(df_titanic['Name'].str.split(expand=True, n=4))

Unnamed: 0,0,1,2,3,4
0,"Braund,",Mr.,Owen,Harris,
1,"Cumings,",Mrs.,John,Bradley,(Florence Briggs Thayer)
2,"Heikkinen,",Miss.,Laina,,
3,"Futrelle,",Mrs.,Jacques,Heath,(Lily May Peel)
4,"Allen,",Mr.,William,Henry,
...,...,...,...,...,...
886,"Montvila,",Rev.,Juozas,,
887,"Graham,",Miss.,Margaret,Edith,
888,"Johnston,",Miss.,Catherine,Helen,"""Carrie"""
889,"Behr,",Mr.,Karl,Howell,


In [23]:
pd.DataFrame(df_titanic['Name'].str.upper())

Unnamed: 0,Name
0,"BRAUND, MR. OWEN HARRIS"
1,"CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH..."
2,"HEIKKINEN, MISS. LAINA"
3,"FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)"
4,"ALLEN, MR. WILLIAM HENRY"
...,...
886,"MONTVILA, REV. JUOZAS"
887,"GRAHAM, MISS. MARGARET EDITH"
888,"JOHNSTON, MISS. CATHERINE HELEN ""CARRIE"""
889,"BEHR, MR. KARL HOWELL"


In [24]:
pd.DataFrame(df_titanic['Name'].str.lower())

Unnamed: 0,Name
0,"braund, mr. owen harris"
1,"cumings, mrs. john bradley (florence briggs th..."
2,"heikkinen, miss. laina"
3,"futrelle, mrs. jacques heath (lily may peel)"
4,"allen, mr. william henry"
...,...
886,"montvila, rev. juozas"
887,"graham, miss. margaret edith"
888,"johnston, miss. catherine helen ""carrie"""
889,"behr, mr. karl howell"


### 데이터프레임 결합

In [34]:
df_chosun

Unnamed: 0_level_0,이름,성별,나이
번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,홍길동,남,34
1,성춘향,여,17
2,이몽룡,남,16


In [33]:
df_chosun.index.name='번호'

In [31]:
df_chosun.reset_index(drop=True, inplace=True)

In [29]:
df_fruit = pd.DataFrame([[0,'망고',34],
                         [0,'바나나',10],
                         [1,'체리',3],
                         [2,'수박',2],
                         [0,'망고',10],
                         [3,'참외',1],
                         [1,'사과',53],
                         [4,'귤',50],
                         [2,'복숭아',25]], columns=['번호','과일','수량'])
df_fruit

Unnamed: 0,번호,과일,수량
0,0,망고,34
1,0,바나나,10
2,1,체리,3
3,2,수박,2
4,0,망고,10
5,3,참외,1
6,1,사과,53
7,4,귤,50
8,2,복숭아,25


In [35]:
pd.merge(df_chosun, df_fruit, on='번호', how='inner')

Unnamed: 0,번호,이름,성별,나이,과일,수량
0,0,홍길동,남,34,망고,34
1,0,홍길동,남,34,바나나,10
2,0,홍길동,남,34,망고,10
3,1,성춘향,여,17,체리,3
4,1,성춘향,여,17,사과,53
5,2,이몽룡,남,16,수박,2
6,2,이몽룡,남,16,복숭아,25


In [36]:
pd.merge(df_chosun, df_fruit, on='번호', how='outer')

Unnamed: 0,번호,이름,성별,나이,과일,수량
0,0,홍길동,남,34.0,망고,34
1,0,홍길동,남,34.0,바나나,10
2,0,홍길동,남,34.0,망고,10
3,1,성춘향,여,17.0,체리,3
4,1,성춘향,여,17.0,사과,53
5,2,이몽룡,남,16.0,수박,2
6,2,이몽룡,남,16.0,복숭아,25
7,3,,,,참외,1
8,4,,,,귤,50


In [37]:
pd.merge(df_chosun, df_fruit, on='번호', how='left')

Unnamed: 0,번호,이름,성별,나이,과일,수량
0,0,홍길동,남,34,망고,34
1,0,홍길동,남,34,바나나,10
2,0,홍길동,남,34,망고,10
3,1,성춘향,여,17,체리,3
4,1,성춘향,여,17,사과,53
5,2,이몽룡,남,16,수박,2
6,2,이몽룡,남,16,복숭아,25


In [38]:
pd.merge(df_chosun, df_fruit, on='번호', how='right')

Unnamed: 0,번호,이름,성별,나이,과일,수량
0,0,홍길동,남,34.0,망고,34
1,0,홍길동,남,34.0,바나나,10
2,1,성춘향,여,17.0,체리,3
3,2,이몽룡,남,16.0,수박,2
4,0,홍길동,남,34.0,망고,10
5,3,,,,참외,1
6,1,성춘향,여,17.0,사과,53
7,4,,,,귤,50
8,2,이몽룡,남,16.0,복숭아,25


In [41]:
pd.merge(df_chosun, df_fruit, left_on=['번호','나이'], right_on=['번호','수량'], how='inner')

Unnamed: 0,번호,이름,성별,나이,과일,수량
0,0,홍길동,남,34,망고,34


In [39]:
df_chosun

Unnamed: 0_level_0,이름,성별,나이
번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,홍길동,남,34
1,성춘향,여,17
2,이몽룡,남,16


In [40]:
df_fruit

Unnamed: 0,번호,과일,수량
0,0,망고,34
1,0,바나나,10
2,1,체리,3
3,2,수박,2
4,0,망고,10
5,3,참외,1
6,1,사과,53
7,4,귤,50
8,2,복숭아,25


### 부분합 (Partial Aggregation)

In [None]:
# Sum PassengerId and Survived per passenger class.
# Several columns must be selected with a list (double brackets); the bare
# `['a', 'b']` form is a tuple key, which pandas 2.0+ rejects on a GroupBy.
df_titanic.groupby('Pclass')[['PassengerId','Survived']].sum()

In [None]:
df_titanic['PassengerId'] = df_titanic['PassengerId'].astype(str)

In [None]:
# Same per-class sum, run again after PassengerId was cast to str.
# Several columns must be selected with a list (double brackets); the bare
# `['a', 'b']` form is a tuple key, which pandas 2.0+ rejects on a GroupBy.
df_titanic.groupby('Pclass')[['PassengerId','Survived']].sum()

In [None]:
# Mean of every numeric column per (Pclass, Sex) group.
# `numeric_only=True` is required on pandas 2.0+, where string columns such as
# Name or Ticket otherwise make `.mean()` raise instead of being skipped.
df_titanic.groupby(['Pclass','Sex']).mean(numeric_only=True)

In [None]:
# Max / mean / min of Age and Fare per (Survived, Sex) group.
# String aliases select pandas' own reductions and avoid the FutureWarning that
# recent pandas emits when NumPy callables (np.max, np.mean, np.min) are passed.
df_titanic.groupby(['Survived', 'Sex'])[['Age','Fare']].aggregate(['max', 'mean', 'min'])

In [None]:
# Oldest age per (Survived, Sex) group, shown as a one-column DataFrame.
# The 'max' alias avoids the FutureWarning recent pandas emits for np.max.
pd.DataFrame(df_titanic.groupby(['Survived', 'Sex'])['Age'].aggregate('max'))

In [None]:
df = pd.DataFrame({'부서': np.random.choice(['HR', 'Finance', 'IT', 'Sales'], 100),
                   '날짜': pd.date_range(start='2024-01-01', periods=100, freq='D'),
                   '매출': np.random.randint(1000, 5000, 100)})
df

In [None]:
# Max / mean / min of sales per (department, date) group.
# String aliases avoid the FutureWarning recent pandas emits for NumPy callables.
df.groupby(['부서','날짜'])['매출'].aggregate(['max', 'mean', 'min'])

### 교차표 (Crosstab)

In [None]:
pd.crosstab(df_titanic['Sex'],df_titanic['Survived'])

In [None]:
pd.crosstab(df_titanic['Sex'],df_titanic['Survived'], normalize='all')

In [None]:
pd.crosstab(df_titanic['Sex'],df_titanic['Survived'], normalize='all', margins=True)

In [None]:
pd.crosstab(df_titanic['Sex'],df_titanic['Survived'], normalize='index', margins=True)

In [None]:
pd.crosstab(df_titanic['Sex'],df_titanic['Survived'], normalize='columns', margins=True)

In [None]:
pd.crosstab(index=[df_titanic['Sex'],df_titanic['Pclass']], 
            columns=[df_titanic['Embarked'],df_titanic['Survived']], 
            normalize='all', margins=True)

### 피봇 테이블 (Pivot Table)

In [None]:
pd.pivot_table(df_titanic, index=['Survived','Sex'], columns='Pclass', values='Age', aggfunc='mean')

In [None]:
pd.pivot_table(df_titanic, index=['Survived','Sex'], columns='Pclass', values=['Age','Fare'], aggfunc='min', margins=True)

In [None]:
df_titanic_pivot = pd.pivot_table(df_titanic,index=['Sex','Pclass'], 
                                  columns=['Survived','Embarked'], 
                                  values='Fare', aggfunc='mean', margins=True)
df_titanic_pivot

In [None]:
df_titanic_pivot = df_titanic_pivot.stack(0)
df_titanic_pivot

In [None]:
df_titanic_pivot = df_titanic_pivot.unstack(1)
df_titanic_pivot

In [None]:
df_chosun

In [None]:
pd.melt(df_chosun, id_vars=['성명'])

In [None]:
pd.melt(df_chosun, id_vars=['성명'], var_name='항목', value_name='값')

### Matplotlib 와 Seaborn 을 활용한 시각화

In [None]:
sns.set_style('whitegrid')
sns.set_palette('GnBu_r')

#### 막대그래프

In [None]:
import platform
# Pick a font that contains Hangul glyphs so Korean titles and labels render.
# NOTE(review): Linux is not handled here; a Korean font (e.g. NanumGothic)
# would have to be installed and selected separately.
system_name = platform.system()
if system_name == 'Windows' :
    plt.rc('font',family='Malgun Gothic')
    plt.rcParams['font.size']=9
    plt.rcParams['figure.figsize']=(10,5)
elif system_name == 'Darwin' :
    plt.rc('font',family='AppleGothic')
# Korean fonts commonly lack the Unicode minus sign; draw negative tick labels
# with an ASCII hyphen instead so they do not show up as empty boxes.
plt.rcParams['axes.unicode_minus']=False

In [None]:
plt.figure(figsize=(5,3))
sns.barplot(data=df_titanic, x='Pclass', y='Survived', order=[1,2,3], hue='Sex', hue_order=['female','male'])
plt.grid(False)
plt.ylim(0,1)
plt.title('타이타닉 등급별 생존률')
plt.ylabel('생존율')
plt.xlabel('등급')
plt.legend(loc='upper right')
plt.xticks(fontsize=8, rotation=0)
plt.show()

#### 산점도와 회귀선

In [None]:
df = sns.load_dataset('penguins')
sns.set_palette('Set1')
sns.scatterplot(data=df, x='bill_length_mm', y='bill_depth_mm')
plt.show()

In [None]:
sns.scatterplot(data=df, x='bill_length_mm', y='bill_depth_mm', hue='sex')
plt.show()

In [None]:
sns.scatterplot(data=df, x='bill_length_mm', y='bill_depth_mm', hue='sex', style='island')
plt.show()

In [None]:
sns.relplot(data=df, x='bill_length_mm', y='bill_depth_mm', hue='sex', col='island', kind='scatter')
plt.show()

In [None]:
sns.lmplot(data=df, x='bill_length_mm', y='bill_depth_mm', hue='sex')
plt.show()

#### 히스토그램

In [None]:
sns.set_palette('Set2')
sns.displot(data=df, x='flipper_length_mm')
plt.show()

In [None]:
sns.displot(data=df, x='flipper_length_mm', kde=True)
plt.show()

In [None]:
sns.set_palette('Set3')
sns.displot(data=df, x='flipper_length_mm', kind='kde')
plt.show()

In [None]:
sns.displot(data=df, x='flipper_length_mm', hue='species', kind='kde')
plt.show()

In [None]:
sns.displot(data=df, x='flipper_length_mm', hue='species', kind='kde', col='sex')
plt.show()

#### 상자그림 (Box Plot)

In [None]:
sns.set_palette('rainbow_r')
sns.boxplot(data=df, x='body_mass_g')
plt.show()

In [None]:
sns.boxplot(data=df, x='body_mass_g', y='species', hue='sex')
plt.show()

#### 연관그래프 (Pair Plot)

In [None]:
sns.pairplot(data=df)
plt.show()

In [None]:
sns.pairplot(data=df, hue='species')
plt.show()

#### 선그래프 (Line Plot)

In [None]:
sns.set_palette('winter')
df = sns.load_dataset('flights')
df_may = df.query("month == 'May'")
sns.lineplot(data=df_may, x='year', y='passengers')
plt.show()

In [None]:
sns.lineplot(data=df, x='year', y='passengers', hue='month')
plt.show()

#### 히트맵 (Heatmap)

In [None]:
df = df_titanic[['Survived','Age', 'Fare', 'SibSp', 'Pclass']].corr()
sns.heatmap(data=df, annot=True, fmt='.2f', cmap='YlOrBr')
plt.show()

In [None]:
df = pd.pivot_table(data=df_titanic, index='Sex', columns='Pclass', values='Survived', aggfunc='mean')
sns.heatmap(data=df, annot=True, fmt='.2f', cmap='Purples')
plt.show()

### 플롯리 익스프레스

In [None]:
import plotly.express as px

#### 플롯리 산점도 (Plotly Scatter Plot)

In [None]:
df = sns.load_dataset('penguins')
fig = px.scatter(data_frame=df, x='bill_length_mm', y='bill_depth_mm', 
                 color_discrete_sequence=px.colors.qualitative.Set2, 
                 template='plotly_white')
fig.show()

In [None]:
fig.write_image('plotly_scatter.png')

In [None]:
fig.write_image('plotly_scatter.jpg')

In [None]:
fig.write_image('plotly_scatter.pdf')

In [None]:
fig.write_html('plotly_scatter.html')

### 지도 (Folium)

In [None]:
import folium

In [None]:
fig = folium.Figure(width=700, height=500)
map = folium.Map(location=[37.510781008592716, 127.09607026177875], zoom_start=16).add_to(fig)
map

In [None]:
folium.Marker([37.510781008592716, 127.09607026177875], tooltip='롯데월드').add_to(map)
map

In [None]:
folium.Marker([37.510781008592716, 127.09607026177875], 
              tooltip='롯데월드',
              icon=folium.Icon(color='red', icon='star')).add_to(map)
map

In [None]:
# Same starred marker, now with an HTML popup that embeds an image in an <iframe>.
# The closing tag must be a complete `</iframe>`; the original string ended with
# `</iframe` and produced malformed popup HTML.
folium.Marker([37.510781008592716, 127.09607026177875], 
              tooltip='롯데월드',
              icon=folium.Icon(color='red', icon='star'),
              popup='<iframe src="https://www.lottehotel.com/content/dam/lotte-hotel/lotte/world/promotion/package/5592-1440-pkg-LTWO.jpg.thumb.768.768.jpg" width="300" height="200"></iframe>').add_to(map)
map

In [None]:
map.save('folium.html')

### 크롤링 (Crawling)

In [None]:
import requests
from bs4 import BeautifulSoup as bs

In [None]:
# Fetch the Naver blog search results for `keyword` and tabulate the first ten hits.
keyword = '용인'
url = f'https://search.naver.com/search.naver?ssc=tab.blog.all&sm=tab_jum&query={keyword}'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error page instead of parsing it.
res = requests.get(url, timeout=10)
res.raise_for_status()
soup = bs(res.text, 'html.parser')
# NOTE(review): these CSS class names presumably mirror Naver's result markup at
# the time of writing; if the page layout changes the lists come back empty.
title = [i.text for i in soup.find_all('a', class_='title_link')][:10]
date  = [i.text for i in soup.find_all('span', class_='sub')][:10]
content = [i.text for i in soup.find_all('a', class_='dsc_link')][:10]
# The three lists must have equal length, otherwise the DataFrame constructor raises.
df = pd.DataFrame({'title':title, 'date':date, 'content':content})
df

In [None]:
# Left-align every cell when the table is rendered.
# NOTE: `df.style` produces a pandas Styler, so from here on `df` is a Styler
# rather than a DataFrame; re-run the crawling cell to get the DataFrame back.
df = df.style.set_properties(**{'text-align': 'left'})
df