# 데이터프레임과 시리즈

In [1]:
import pandas as pd

In [3]:
# A Series built from values of different types (a str and an int) is stored
# with the generic object dtype.
s = pd.Series(data=['banana', 42])
s

0    banana
1        42
dtype: object

In [4]:
# Give the Series explicit string labels instead of the default 0..n-1 index.
s = pd.Series(
    data=['Wes Mckinney', 'Creator of Pandas'],
    index=['Person', 'Who'],
)
s

Person         Wes Mckinney
Who       Creator of Pandas
dtype: object

In [20]:
# Build a DataFrame from a dict: each key becomes a column name and each list
# supplies that column's values, one entry per row.
scientists = pd.DataFrame(
    data={
        'Name': ['Rosaline Franklin', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    }
)
scientists

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [8]:
from collections import OrderedDict

In [19]:
# Same frame as before, but built from an OrderedDict of (column, values)
# pairs so the column order is spelled out explicitly.
scientists = pd.DataFrame(
    data=OrderedDict(
        (
            ('Name', ['Rosaline Franklin', 'William Gosset']),
            ('Occupation', ['Chemist', 'Statistician']),
            ('Born', ['1920-07-25', '1876-06-13']),
            ('Died', ['1958-04-16', '1937-10-16']),
            ('Age', [37, 61]),
        )
    )
)
scientists

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [25]:
# Build the frame with the scientists' names as row labels (index=) and an
# explicit column order (columns=) that differs from the dict's key order.
scientists = pd.DataFrame(
    data={
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    },
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Age', 'Born', 'Died'],
)
scientists

Unnamed: 0,Occupation,Age,Born,Died
Rosaline Franklin,Chemist,37,1920-07-25,1958-04-16
William Gosset,Statistician,61,1876-06-13,1937-10-16


In [28]:
# Select a single row by its index label; the result comes back as a Series.
# NOTE: despite the name `first_row`, 'William Gosset' is the second label in
# the index this frame was built with.
first_row = scientists.loc['William Gosset'] 
print(type(first_row))  # the row mixes strings and an int, so the Series dtype is object rather than an integer dtype
first_row

<class 'pandas.core.series.Series'>


Occupation    Statistician
Age                     61
Born            1876-06-13
Died            1937-10-16
Name: William Gosset, dtype: object

In [34]:
# The index of a row Series holds the labels of its values
# (here, the column names of the DataFrame the row came from).
first_row.index
# first_row.index[0] would pick out the first label

Index(['Occupation', 'Age', 'Born', 'Died'], dtype='object')

In [36]:
first_row.keys()  # keys() returns the same labels as the .index attribute
# first_row.keys()[0] would pick out the first label

Index(['Occupation', 'Age', 'Born', 'Died'], dtype='object')

## 기초 통계 메서드

In [44]:
# Pull the 'Age' column out of the DataFrame as a Series.
ages = scientists['Age']
type(ages); ages  # bare expressions in the middle of a cell: evaluated, but nothing is displayed

print(ages)
# Basic descriptive statistics. A notebook cell only echoes its last
# expression, so of the four calls below only std() shows up in the output;
# wrap the others in print() to see their values as well.
ages.mean()
ages.min()
ages.max()
ages.std()

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


16.97056274847714

# 시리즈 메서드

In [46]:
# Load the scientists data set from a CSV file and pull out the 'Age' column
# as a Series.
scientists = pd.read_csv('../data/pandas/scientists.csv')
ages = scientists['Age']

In [47]:
# Print the largest age and the mean age, one value per line.
print(ages.max(), ages.mean(), sep='\n')

90
59.125


# 불린 추출

In [49]:
# Boolean extraction on a Series.
# First show the True/False mask (is each age above the mean?), followed by a
# blank line; then use the same condition to keep only the matching values.
print(ages > ages.mean(), end='\n\n')
print(ages.loc[ages > ages.mean()])

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


In [50]:
# A Series behaves like a vector: arithmetic between two Series of the same
# length is applied element by element.
print(ages.add(ages))
print(ages.mul(ages))

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64
0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


In [52]:
# Series and broadcasting: an operation between a vector and a scalar applies
# the scalar to every element.
print(ages + 100, ages * 100, sep='\n')

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64
0    3700
1    6100
2    9000
3    6600
4    5600
5    4500
6    4100
7    7700
Name: Age, dtype: int64


In [55]:
# Arithmetic between vectors of different lengths: only labels present in
# both operands are computed; every other label becomes a missing value (NaN).
print(pd.Series([1, 100]), end='\n\n')
print(ages + pd.Series([1, 100]), end='\n\n')

0      1
1    100
dtype: int64

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64



In [59]:
# Sort the Series by its index labels in descending order (ascending=False);
# sort_index returns the sorted result, which the cell displays.
ages.sort_index(ascending=False)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64

In [60]:
# Boolean extraction on a DataFrame: keep only the rows whose 'Age' is above
# the mean of the 'Age' column.
scientists.loc[scientists['Age'] > scientists['Age'].mean()]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [61]:
# Boolean indexing with an explicit list: one flag per row. Rows flagged
# False (positions 2 and 5 here) are left out of the result.
scientists.loc[[True, True, False, True, True, False, True, True]]

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [65]:
# DataFrame broadcasting: the scalar operation is applied to every cell.
scientists * 2  # numbers are doubled; strings are repeated twice

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


# 시리즈와 데이터프레임의 데이터 처리

In [70]:
# Show the dtype of the two date columns, one per line.
print(scientists['Born'].dtype, scientists['Died'].dtype, sep='\n')

object
object


In [79]:
# Change the data type: parse the 'Born' and 'Died' columns as dates written
# in year-month-day form. The results are kept in separate variables.
born_datetime = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
died_datetime = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')

print(born_datetime)
print(died_datetime)

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]
0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]


In [74]:
# Display the whole DataFrame.
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [87]:
# Add new columns by assigning to column names that do not exist yet.
# born_dt / died_dt are plain copies of the existing 'Born' / 'Died' columns.
scientists['born_dt'] = scientists['Born']
scientists['died_dt'] = scientists['Died']
# born_date / died_date hold the born_datetime / died_datetime Series.
scientists['born_date'] = born_datetime
scientists['died_date'] = died_datetime

scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,born_date,died_date
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,1777-04-30,1855-02-23


In [91]:
# Dropping rows and columns from a DataFrame.
# drop(..., axis=0) / drop(index=...)   removes rows by index label.
# drop(..., axis=1) / drop(columns=...) removes columns by column label.
# drop returns a new frame; `scientists` itself is left untouched.
print(scientists.columns)
scientists_dropped = scientists.drop(columns=['Age'])
print(scientists_dropped.columns)

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt',
       'born_date', 'died_date'],
      dtype='object')
Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt', 'born_date',
       'died_date'],
      dtype='object')


In [96]:
# Drop the row labelled 0 and compare the index before and after.
print(scientists.index)
scientists_dropped = scientists.drop(index=[0])
print(scientists_dropped.index)

RangeIndex(start=0, stop=8, step=1)
Int64Index([1, 2, 3, 4, 5, 6, 7], dtype='int64')


# 데이터 저장하고 불러오기

In [None]:
# Pickle: stores an object by serializing it to a binary form (the original note adds: small file size).
# The binary file is not human-readable; it can only be written and read back programmatically.
# Only unpickle files from sources you trust.
# Relevant methods: to_pickle(), read_pickle()

In [101]:
# Select the 'Name' column as a Series and display it.
names = scientists['Name']
names

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

In [103]:
# Serialize the Series to a pickle file.
# NOTE: the file name spells "seriese"; the read_pickle call that loads this
# file uses the same spelling, so the two paths must be changed together.
names.to_pickle('files/scientists_names_seriese.pickle')

In [105]:
# Load the pickled Series back from disk and display it.
scientists_names_from_pickle = pd.read_pickle('files/scientists_names_seriese.pickle')
scientists_names_from_pickle

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

In [107]:
# Round-trip the whole DataFrame through a pickle file.
scientists.to_pickle('files/scientists_df.pickle')

scientists_from_pickle = pd.read_pickle('files/scientists_df.pickle')
# Display the re-loaded frame (not the in-memory original) so that the cell
# output actually verifies what was read back from disk.
scientists_from_pickle

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [108]:
# CSV: values separated by commas.
# TSV: values separated by tabs (sep='\t').
# The last call passes index=False so the row index is not written to the file.
names.to_csv(path_or_buf='files/scientist_names_series.csv')
scientists.to_csv(path_or_buf='files/scientists_df.tsv', sep='\t')
scientists.to_csv(path_or_buf='files/scientists_df.csv', index=False)

In [110]:
# Excel export uses the openpyxl package (the original note says the .xls format triggers a warning).
# List the installed packages - presumably to check whether openpyxl is available.
!pip list

Package                       Version
----------------------------- -------------------
alabaster                     0.7.12
appdirs                       1.4.4
argh                          0.26.2
astroid                       2.5
async-generator               1.10
atomicwrites                  1.4.0
attrs                         20.3.0
autopep8                      1.5.6
Babel                         2.9.0
backcall                      0.2.0
bcrypt                        3.2.0
beautifulsoup4                4.9.3
black                         19.10b0
bleach                        3.3.0
brotlipy                      0.7.0
bs4                           0.0.1
certifi                       2020.12.5
cffi                          1.14.5
chardet                       4.0.0
click                         7.1.2
cloudpickle                   1.6.0
colorama                      0.4.4
cryptography                  3.4.7
cx-Oracle                     8.1.0
cycler                        0.10.0
deco

In [113]:
# Convert the Series into a one-column DataFrame and display it.
names_df = names.to_frame()
names_df

Unnamed: 0,Name
0,Rosaline Franklin
1,William Gosset
2,Florence Nightingale
3,Marie Curie
4,Rachel Carson
5,John Snow
6,Alan Turing
7,Johann Gauss


In [116]:
# NOTE(review): openpyxl is not referenced by name below; presumably it is
# imported to confirm that the .xlsx writer engine is installed - confirm.
import openpyxl
# Write the one-column DataFrame to an Excel workbook.
names_df.to_excel('files/scientist_names_series_df.xlsx')