# 데이터 프레임 & 시리즈

---
1. 데이터 생성

In [1]:
import pandas as pd

# 시리즈 생성
s=pd.Series(['Age',26])
print(s)
d=pd.Series(['Name','John'])
print(d)
f=pd.Series(['John','26'],index=['Name','Age'])
print(f)

0    Age
1     26
dtype: object
0    Name
1    John
dtype: object
Name    John
Age       26
dtype: object


In [2]:
# 데이터프레임 생성
info=pd.DataFrame({
    'Name' : ['John','Amy'],
    'Age' : [26,30],
    'Occupation' : ['Engineer','Artist'],
    'Born' : ['1996-04-04','1992-08-12']
})
print(info)
info.index=info['Name']
print(info)

   Name  Age Occupation        Born
0  John   26   Engineer  1996-04-04
1   Amy   30     Artist  1992-08-12
      Name  Age Occupation        Born
Name                                  
John  John   26   Engineer  1996-04-04
Amy    Amy   30     Artist  1992-08-12


---
2. 시리즈 다루기 (1)

(1) index, values, keys

In [3]:
info=pd.DataFrame({
    'Name' : ['John','Amy'],
    'Age' : [26,30],
    'Occupation' : ['Engineer','Artist'],
    'Born' : ['1996-04-04','1992-08-12']
},
index=info['Name'],
columns=['Name','Age','Occupation','Born'])

print(info.loc['John'])
print('\nindex : ',info.index)
print('\nvalue : ',info.values)
print('\nkeys  : ',info.keys())

Name                John
Age                   26
Occupation      Engineer
Born          1996-04-04
Name: John, dtype: object

index :  Index(['John', 'Amy'], dtype='object', name='Name')

value :  [['John' 26 'Engineer' '1996-04-04']
 ['Amy' 30 'Artist' '1992-08-12']]

keys  :  Index(['Name', 'Age', 'Occupation', 'Born'], dtype='object')


(2) mean, min, max, std

In [4]:
ages=info['Age']
print('mean : ',ages.mean())  # 평균
print('min : ',ages.min())    # 최소
print('max : ',ages.max())    # 최대
print('std : ',ages.std())    # 표준편차

mean :  28.0
min :  26
max :  30
std :  2.8284271247461903


---
2. 시리즈 다루기 (2)

In [5]:
s=pd.read_csv('data/scientists.csv')
s

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [6]:
ages=s['Age']
print('mean',ages.mean())
print('max',ages.max())

mean 59.125
max 90


In [7]:
# 평균보다 나이가 많은 사람만 추출
print([ages>ages.mean()])
print(ages[ages>ages.mean()])
print(s.loc[ages[ages>ages.mean()].index,['Name','Age']])

[0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool]
1    61
2    90
3    66
7    77
Name: Age, dtype: int64
                   Name  Age
1        William Gosset   61
2  Florence Nightingale   90
3           Marie Curie   66
7          Johann Gauss   77


In [8]:
#평균 연령보다 높은 행만 출력

print(s[s['Age']>s['Age'].mean()])

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [9]:
# 시리즈와 브로드캐스팅
print(ages+ages,ages*ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64 0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


In [10]:
# sort_index
rev_ages=ages.sort_index(ascending=False)   #ascending=False : index의 역순
rev_ages

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64

In [11]:
### 
print(ages+rev_ages)     # 동일한 인덱스끼리 연산!

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


---
4. 데이터프레임 다루기

In [12]:
s=pd.read_csv('data/scientists.csv')
# 불린 추출
print(s[s['Age']>s['Age'].mean()])    # age열에서 평균 age보다 높은 행만 출력

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


---
5. 데이터 처리하기

In [13]:
s=pd.read_csv('data/scientists.csv')
print(s['Born'].dtype)
print(s['Died'].dtype)

object
object


In [14]:
# object 형을 datetime으로 변경
born=pd.to_datetime(s['Born'],format='%Y-%m-%d')
print(born)
die=pd.to_datetime(s['Died'],format='%Y-%m-%d')
print(die)
# 원래 데이터 프레임에 추가
s['born_dt'],s['died_dt']=born,die
print(s.head())
#datetime끼리 계산
s['die-born']=s['died_dt']-s['born_dt']
print(s.head())

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]
0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]
                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist 1920-07-25   
1        William Gosset  1876-06-13  1937-10-16   61  Statistician 1876-06-13   
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse 1820-05-12   
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist 1867-11-07   
4         Rachel Carson  1907-05-27  1964-04-14   56     Biologist 1907-05-27   

     died_dt  
0 1958-04-16  
1 1937-10-16  
2 1910-08-13  
3 1934-07-04  
4 1964-04-14  
                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-2

In [15]:
# 데이터 섞기 random.shuffle
import random

print(s['Age'])
random.seed(42)
random.shuffle(s['Age'])
print(s['Age'])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
0    66
1    56
2    41
3    77
4    90
5    45
6    37
7    61
Name: Age, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i], x[j] = x[j], x[i]


In [16]:
#열 삭제 drop
deled_s=s.drop(['Age'],axis=1)
deled_s

Unnamed: 0,Name,Born,Died,Occupation,born_dt,died_dt,die-born
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30,1855-02-23,28422 days


---
6. 데이터 저장하고 불러오기

In [17]:
# 데이터 저장 pickle
names=s['Name']
names.to_pickle('./out.pickle')
load_s=pd.read_pickle('./out.pickle')
load_s

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

In [18]:
# csv로 저장
names.to_csv('./out.csv')

In [19]:
# tsv로 저장
names.to_csv('./out.tsv',sep='\t')