In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../data/gapminder.tsv', sep='\t')

# 시리즈 만들기
* 딕셔너리와 비슷하지만, 데이터를 다루는데 특화

In [3]:
s = pd.Series(['apple', 33])
s

0    apple
1       33
dtype: object

In [8]:
# 인덱스 디포트 0, 1, 2, 3, 4
list_data = ['2022-07-15', 3.14, 'sesac', 100, True]
s = pd.Series(list_data)
print(s)
print('\n')
print(type(s))

0    2022-07-15
1          3.14
2         sesac
3           100
4          True
dtype: object


<class 'pandas.core.series.Series'>


In [12]:
idx = s.index
val = s.values
print(idx)
print('\n', '---------------------------------------')
print(val)

RangeIndex(start=0, stop=5, step=1)

 ---------------------------------------
['2022-07-15' 3.14 'sesac' 100 True]


In [14]:
dict_data = {'a': 1, 'b': 2, "c": 3} # 딕셔너리를 시리즈로 만들기
s = pd.Series(dict_data)
print(s)
print(type(s))

a    1
b    2
c    3
dtype: int64
<class 'pandas.core.series.Series'>


In [15]:
s = pd.Series(['Jane', 'student'], index=['person', 'job'])
print(s)

person       Jane
job       student
dtype: object


# 딕셔너리로 데이터프레임 생성

In [24]:
scientists = pd.DataFrame(
    {
        'Name': ['Rosaline Franklin', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61]
    })
scientists

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [25]:
scientists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
Name          2 non-null object
Occupation    2 non-null object
Born          2 non-null object
Died          2 non-null object
Age           2 non-null int64
dtypes: int64(1), object(4)
memory usage: 208.0+ bytes


In [33]:
scientists = pd.DataFrame( # 인덱스 추가 및 컬럼 순서 변경
    {
        'Name': ['Rosaline Franklin', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61]
    },
        index=['Rosaline Franklin', 'William Gosset'],
        columns=['Occupation', 'Age', 'Died']
    )
scientists

Unnamed: 0,Occupation,Age,Died
Rosaline Franklin,Chemist,37,1958-04-16
William Gosset,Statistician,61,1937-10-16


In [27]:
scientists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, Rosaline Franklin to William Gosset
Data columns (total 3 columns):
Occupation    2 non-null object
Age           2 non-null int64
Died          2 non-null object
dtypes: int64(1), object(2)
memory usage: 64.0+ bytes


## 딕셔너리는 데이터의 순서를 보장하지 않음
* 순서가 보장된 딕셔너리를 전달하려면 **OrderedDict** 클래스 사용

In [57]:
from collections import OrderedDict
scientists = pd.DataFrame([

                            ('Name', ['Rosaline Franklin', 'William Gosset']),
                            ('Occupation', ['Chemist', 'Statistician']),
                            ('Born', ['1920-07-25', '1876-06-13']),
                            ('Died', ['1958-04-16', '1937-10-16']),
                            ('Age', [37, 61])
                          
                            ])
scientists

Unnamed: 0,0,1
0,Name,"[Rosaline Franklin, William Gosset]"
1,Occupation,"[Chemist, Statistician]"
2,Born,"[1920-07-25, 1876-06-13]"
3,Died,"[1958-04-16, 1937-10-16]"
4,Age,"[37, 61]"


In [58]:
type(scientists)

pandas.core.frame.DataFrame

# 데이터프레임에서 시리즈 선택하기

In [59]:
scientists = pd.DataFrame( # 인덱스 추가 및 컬럼 순서 변경
    data = {
        'Name': ['Rosaline Franklin', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61]
    },
        index=['Rosaline Franklin', 'William Gosset'],
        columns=['Occupation', 'Age', 'Died']
    )
scientists

Unnamed: 0,Occupation,Age,Died
Rosaline Franklin,Chemist,37,1958-04-16
William Gosset,Statistician,61,1937-10-16


In [60]:
type(scientists)

pandas.core.frame.DataFrame

In [61]:
first_row = scientists.loc['William Gosset']
print(type(first_row))

<class 'pandas.core.series.Series'>


# 문제풀이

In [4]:

df = pd.DataFrame(
{
    'Name': ['Bob', 'Jessica', 'Mary', 'John', 'Mel'],
    'Births': [968, 155, 77, 578, 973]
})

df

Unnamed: 0,Name,Births
0,Bob,968
1,Jessica,155
2,Mary,77
3,John,578
4,Mel,973


In [5]:
df['Name']

0        Bob
1    Jessica
2       Mary
3       John
4        Mel
Name: Name, dtype: object

In [7]:
df.loc[0:2]

Unnamed: 0,Name,Births
0,Bob,968
1,Jessica,155
2,Mary,77


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
Name      5 non-null object
Births    5 non-null int64
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


In [23]:
df[df['Births'] >= 100]

Unnamed: 0,Name,Births
0,Bob,968
1,Jessica,155
3,John,578
4,Mel,973


In [24]:
df['Births'].mean()

550.2

In [25]:
friend_df = pd.read_csv('../data/friend_list.csv')
friend_df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [28]:
friend_txt_df = pd.read_csv('../data/friend_list.csv')
friend_txt_df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [30]:
friend_tab_df = pd.read_csv('../data/friend_list_tab.txt')
friend_tab_df.head()

Unnamed: 0,name\tage\tjob
0,John\t20\tstudent
1,Jenny\t30\tdeveloper
2,Nate\t30\tteacher
3,Julia\t40\tdentist
4,Brian\t45\tmanager


In [34]:
# 헤드없이 출력
friend_no_head_df = pd.read_csv('../data/friend_list_no_head.csv')
friend_no_head_df

Unnamed: 0,John,20,student
0,Jenny,30,developer
1,Nate,30,teacher
2,Julia,40,dentist
3,Brian,45,manager
4,Chris,25,intern
