# Pandas는 무엇인가요?

* 데이터 분석 및 가공에 사용되는 파이썬 라이브러리

In [26]:
import pandas as pd

pd.__version__

'1.3.5'

In [2]:
# Load the friend list from CSV into a DataFrame. (Author's note: pandas
# offers a DataFrame type with the same kind of features as R's data frame.)
data_frame = pd.read_csv('data/friend_list.csv')
data_frame

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


## 데이터프레임 (Dataframe)

* 가로축과 세로축이 있는 엑셀과 유사한 데이터 구조
* 가로축은 row(행), 세로축은 column(열)
* 데이터베이스의 테이블 구조

In [3]:
#데이터프레임이 가지고 있는 함수의 예제
data_frame.head(3) #default : 5

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## 시리즈(Series)

* 데이터 프레임의 컬럼(열)은 모두 시리즈
* 단순히 파이썬 리스트를 간직한 오브젝트
* 리스트를 파라미터로 주면 바로 시리즈가 생성
* 데이터 가공 및 분석이 파이썬 리스트보다 훨씬 쉽다

In [4]:
type(data_frame.job) #Pandas의 공식적인 자료형

pandas.core.series.Series

In [5]:
#시리즈의 함수 예제
data_frame.job = data_frame.job.str.upper()
data_frame.head()

Unnamed: 0,name,age,job
0,John,20,STUDENT
1,Jenny,30,DEVELOPER
2,Nate,30,TEACHER
3,Julia,40,DENTIST
4,Brian,45,MANAGER


In [6]:
# Build two Series from plain Python lists through the public constructor
# pd.Series (pd.core.series is an internal module path, not a stable API).
s1 = pd.Series(['one', 'two', 'three'])
s2 = pd.Series([1, 2, 3])

# Each keyword of dict(...) becomes a column name and each Series its values.
pd.DataFrame(data=dict(word=s1, num=s2))

Unnamed: 0,word,num
0,one,1
1,two,2
2,three,3


In [7]:
# Split the columns on an explicit separator.
# sep (alias: delimiter) is the character that separates fields; '\t' is a tab.
df = pd.read_csv('data/friend_list_tab.txt', sep='\t')
df.head(5)

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [8]:
# Reading a file that has no header row.
# header=None tells read_csv that the first line is data, not column names;
# the output below shows the columns labelled 0, 1, 2 as a result.
df = pd.read_csv('data/friend_list_no_head.csv',header = None)
df.head()

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [9]:
df.columns = ['name', 'age', 'job']
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


# 데이터프레임 파이썬 코드로 생성하기

In [10]:
friend_dict_list = [{'name':'Jone', 'age':20, 'job':'student'},
                    {'name':'Jenny', 'age':30, 'job':'developer'},
                    {'name':'Nate', 'age':25, 'job':'teacher'}]

df = pd.DataFrame(friend_dict_list)
df.head()

Unnamed: 0,name,age,job
0,Jone,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [11]:
df = df[['name', 'age', 'job']]
df.head()

Unnamed: 0,name,age,job
0,Jone,20,student
1,Jenny,30,developer
2,Nate,25,teacher


## OrderedDict로 데이터프레임 생성하기

* OrderedDict 자료구조로 데이터프레임을 생성하면, 컬럼의 순서가 뒤바뀌지 않음.

In [12]:
from collections import OrderedDict

In [13]:
friend_ordered_dict = OrderedDict([('name', ['John', 'Jenny', 'Nate']),
                                   ('age', [20, 30, 25]),
                                   ('job',['student', 'developer', 'teacher'])])

df = pd.DataFrame.from_dict(friend_ordered_dict)
df.tail(2) #하위 2개 보이기

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,25,teacher


### list로 데이터프레임 생성하기 

In [14]:
friend_list = [['john', 20, 'student'],
               ['Jenny', 30, 'developer'],
               ['Nate', 25, 'teacher']]

column_name = ['name', 'age', 'job']

df = pd.DataFrame.from_records(friend_list, columns = column_name)
df.head()

Unnamed: 0,name,age,job
0,john,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [15]:
friend_dict = {'name': ['John', 'Jenny', 'Nate'],
                       'age' : [20, 30, 25],
                       'job': ['student', 'developer', 'teacher']
                      } 
                              

df = pd.DataFrame.from_dict(friend_dict)
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


## 파일형태로 보관하기

In [16]:
df.to_csv('data/friend_list_from_df.csv')

In [17]:
df.to_csv('data/friend_list_from_df.txt')

In [18]:
df.to_csv('data/friend_list_from_df_header_index.csv', header = False, index = False) #헤더 인덱스 없음

In [19]:
friend_dict = {'name': ['John', 'Jenny', 'Nate'],
                       'age' : [20, None , 25],
                       'job': ['student', 'developer', 'teacher']
                      } 
                              

df = pd.DataFrame.from_dict(friend_dict)
df.head()

Unnamed: 0,name,age,job
0,John,20.0,student
1,Jenny,,developer
2,Nate,25.0,teacher


In [20]:
df.to_csv('data/friend_dict_from_df.csv')

In [21]:
df.to_csv('data/friend_dict_from_df_narep.csv',na_rep='-') #'-'로 채우기

# 데이터 접근 방법

- 인덱스로 row 선택하기

In [22]:
friend_dict = {'name': ['John', 'Jenny', 'Nate'],
                       'age' : [20, None , 25],
                       'job': ['student', 'developer', 'teacher']
                      } 
                              

df = pd.DataFrame.from_dict(friend_dict)
df.head()

Unnamed: 0,name,age,job
0,John,20.0,student
1,Jenny,,developer
2,Nate,25.0,teacher


In [47]:
friend_dict = {'name': ['John', 'Jenny', 'Nate'],
                       'age' : [20, 30 , 25],
                       'job': ['student', 'developer', 'teacher']
                      } 
                              

df = pd.DataFrame.from_dict(friend_dict)
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [24]:
df[1:3] #순차적인 접근

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,25,teacher


In [25]:
# With a comma, .loc treats the arguments as (row label, column label).
# This frame's columns are 'name', 'age' and 'job', so there is no column
# labelled 2 and the lookup fails with KeyError: 2 (shown below).
df.loc[0, 2]

KeyError: 2

In [None]:
# Selecting non-consecutive rows:
# pass the wanted row labels as a list inside .loc[].
df.loc[[0, 2]]

In [None]:
# A slice goes directly inside .loc[]; wrapping it in a list (`[[0:2]]`)
# is a SyntaxError. Label-based slicing with .loc includes both endpoints,
# so this selects the rows labelled 0, 1 and 2.
df.loc[0:2]

### 컬럼값에 따른 row 선택하기

* 마치 데이터 베이스에 쿼리를 전달하듯, 특정한 컬럼값을 충족하는 row만 선택.


In [None]:
df_filtered = df[df.age > 25]
df_filtered

In [None]:
df_query = df.query('age>25')
df_query

In [None]:
df_filtered = df[(df.age > 25)&(df.name == 'Nate')]
df_filtered

In [None]:
df

# 컬럼 필터하기

### 인덱스로 필터하기

In [46]:
friend_list = [['John', 20, 'student'],
               ['Jenny', 30, 'developer'],
               ['Nate', 25, 'teacher']]

df = pd.DataFrame.from_records(friend_list)
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [None]:
# Show every row, but only the columns at positions 0 and 1
# (row selector ':' means all rows; column selector ':2' covers 0 and 1).
df.iloc[:,:2]

In [None]:
# Show every row, but only the columns at positions 0 and 2.
# The list in the second slot picks individual columns (not rows).
df.iloc[:,[0,2]]

### 컬럼이름으로 필터하기

In [None]:
df = pd.read_csv('data/friend_list_no_head.csv', header = None, names=['name','age','job']) #pd.read_table()
df

In [None]:
df[['name', 'age']]

In [None]:
df.filter(items=['age', 'job'])

In [None]:
df.filter(like='a', axis=1) #열(axis=1)에 a가 들어가는 컬럼만 필터링

In [None]:
# Filter columns with a regular expression (regex):
# 'b$' matches column names that end with the letter "b".
df.filter(regex='b$', axis=1)

# row 드롭하기

* row 인덱스로 row를 drop할 수 있습니다.

In [41]:
import pandas as pd
from collections import OrderedDict

In [40]:
friend_dict_list = [{'age':20,'job':'student'},
                    {'age':30,'job':'developer'},
                    {'age':25,'job':'teacher'}]

df = pd.DataFrame(friend_dict_list, index=['John','Jenny','Nate'])
df.head()

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,25,teacher


In [42]:
df.drop(['John', 'Nate'])

Unnamed: 0,age,job
Jenny,30,developer


In [43]:
df

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,25,teacher


In [44]:
df = df.drop(['John', 'Nate'])
df

Unnamed: 0,age,job
Jenny,30,developer


### drop된 결과를 바로 데이터프레임에 저장하는 법
* inplace 키워드를 사용하면, 따로 저장할 필요가 없이 drop된 결과가 데이터프레임에 저장된다.

In [None]:
# The preceding cell already rebound df to the frame without 'John' and
# 'Nate', so dropping them again would raise KeyError. Rebuild the original
# three-row frame first so this example runs on its own.
df = pd.DataFrame(friend_dict_list, index=['John', 'Jenny', 'Nate'])

# With inplace=True, drop() modifies df directly (and returns None),
# so the result does not need to be assigned back.
df.drop(['John', 'Nate'], inplace=True)

In [48]:
friend_dict = {'name': ['John', 'Jenny', 'Nate'],
                       'age' : [20, 30 , 25],
                       'job': ['student', 'developer', 'teacher']
                      } 
                              

df = pd.DataFrame.from_dict(friend_dict)
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [49]:
# row index로 drop하는 방법
df = df.drop(df.index[[0, 2]])
df

Unnamed: 0,name,age,job
1,Jenny,30,developer


## column값으로 row drop하기

In [51]:
friend_dict = {'name': ['John', 'Jenny', 'Nate'],
               'age' : [20, 30 , 25],
               'job': ['student', 'developer', 'teacher']
                } 
                              

df = pd.DataFrame.from_dict(friend_dict)
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [52]:
df[df.age != 30]

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,25,teacher


In [55]:
# drop() defaults to axis=0 (rows of a 2-D frame), so removing a column
# requires axis=1. The result is not assigned, so df itself is unchanged.
df.drop('age', axis = 1)

Unnamed: 0,name,job
0,John,student
1,Jenny,developer
2,Nate,teacher


In [62]:
df['salary'] = 0
df

Unnamed: 0,name,age,job,salary
0,John,20,student,0
1,Jenny,30,developer,0
2,Nate,25,teacher,0


In [64]:
df = df.drop('salary', axis = 1)
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [66]:
# Create a new column in one line using numpy.
import numpy as np

# Derive the column from an existing one in a single step:
# 'yes' where job is not 'student', otherwise 'no'.
df['salary'] = np.where(df['job'] != 'student','yes','no')
df

Unnamed: 0,name,age,job,salary
0,John,20,student,no
1,Jenny,30,developer,yes
2,Nate,25,teacher,yes


In [89]:
friedn_dict_list = [{'name':'John', 'midterm':95, 'final':85},
                    {'name':'Jenny', 'midterm':85, 'final':80},
                    {'name':'Nate', 'midterm':75, 'final':95},
                    {'name':'Bryan', 'midterm':55, 'final':45}]

score_df = pd.DataFrame(friedn_dict_list, columns =['name', 'midterm', 'final'])
score_df

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,75,95
3,Bryan,55,45


In [90]:
# 파생 변수로 컬럼추가
score_df['total'] = score_df['midterm'] + score_df['final']
score_df

Unnamed: 0,name,midterm,final,total
0,John,95,85,180
1,Jenny,85,80,165
2,Nate,75,95,170
3,Bryan,55,45,100


In [91]:
#평균
score_df['average'] = score_df['total'] / 2
score_df

Unnamed: 0,name,midterm,final,total,average
0,John,95,85,180,90.0
1,Jenny,85,80,165,82.5
2,Nate,75,95,170,85.0
3,Bryan,55,45,100,50.0


In [92]:
# Collect one letter grade per row in a plain list, then attach that list
# to the frame as a new column.

grade_cutoffs = [(90, 'A'), (80, 'B'), (70, 'C'), (60, 'D')]

# Each average gets the letter of the first cutoff it reaches, else 'F'.
grades = [
    next((letter for cutoff, letter in grade_cutoffs if avg >= cutoff), 'F')
    for avg in score_df['average']
]

score_df['grade'] = grades
score_df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,A
1,Jenny,85,80,165,82.5,B
2,Nate,75,95,170,85.0,B
3,Bryan,55,45,100,50.0,F


### apply() 사용예제

In [93]:
# 값의 수정

def pass_or_fail(row):
    """Return 'Fail' for the grade 'F' and 'Pass' for any other value."""
    return 'Pass' if row != 'F' else 'Fail'
    
# Overwrite each letter grade with its Pass/Fail label.
score_df['grade'] = score_df['grade'].apply(pass_or_fail)
score_df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,Pass
1,Jenny,85,80,165,82.5,Pass
2,Nate,75,95,170,85.0,Pass
3,Bryan,55,45,100,50.0,Fail


In [105]:
#연월일의 정보에서 연도만 추출하는 예제

date_list = [{'yyyy-mm-dd':'2000-06-27'},
             {'yyyy-mm-dd':'2002-09-24'},
             {'yyyy-mm-dd':'2005-12-20'}]

date_df = pd.DataFrame(date_list, columns=['yyyy-mm-dd'])
date_df

Unnamed: 0,yyyy-mm-dd
0,2000-06-27
1,2002-09-24
2,2005-12-20


In [106]:
def year(row):
    """Return the text before the first '-' of a 'yyyy-mm-dd' string."""
    # Only the first field is needed, so one split is enough.
    return row.split('-', 1)[0]

def month(row):
    """Return the second '-'-separated field of a 'yyyy-mm-dd' string."""
    # Two splits isolate the second field; later fields are irrelevant.
    return row.split('-', 2)[1]

def day(row):
    """Return the third '-'-separated field of a 'yyyy-mm-dd' string."""
    # Three splits isolate the third field; later fields are irrelevant.
    return row.split('-', 3)[2]

# Add one column per date part by applying the matching extractor
# to every value of the 'yyyy-mm-dd' column.
for part_name, extractor in (('year', year), ('month', month), ('day', day)):
    date_df[part_name] = date_df['yyyy-mm-dd'].apply(extractor)
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day
0,2000-06-27,2000,6,27
1,2002-09-24,2002,9,24
2,2005-12-20,2005,12,20


### apply()에 파라미터 전달하기
* 키워드 파라미터를 사용하면, apply가 적용된 함수에 파라미터를 전달할 수 있다.

In [107]:
def age(year, current_year):
    """Return current_year minus the birth year (year is converted with int)."""
    birth_year = int(year)
    return current_year - birth_year

# Extra keyword arguments given to apply() are forwarded to the function,
# so every year value is passed to age(year, current_year=2022).
date_df['age'] = date_df['year'].apply(age, current_year = 2022)
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day,age
0,2000-06-27,2000,6,27,22
1,2002-09-24,2002,9,24,20
2,2005-12-20,2005,12,20,17


In [108]:
def get_introduce(age, prefix, suffix):
    """Return prefix + str(age) + suffix as a single string."""
    pieces = (prefix, str(age), suffix)
    return ''.join(pieces)

# Several keyword arguments can be forwarded at once: each age value is
# passed to get_introduce together with the fixed prefix and suffix.
date_df['introduce'] = date_df['age'].apply(get_introduce, prefix = 'I am ',suffix=' years old')
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day,age,introduce
0,2000-06-27,2000,6,27,22,I am 22 years old
1,2002-09-24,2002,9,24,20,I am 20 years old
2,2005-12-20,2005,12,20,17,I am 17 years old


In [109]:
# 여러개의 컬럼을 동시에 전달하기

def get_introduce2(row):
    """Build an introduction sentence from the row's year and age fields."""
    return 'I was born in %s my age is %s' % (str(row.year), str(row.age))

# Calling apply() on the whole frame with axis=1 hands each row to the
# function, which is how get_introduce2 can read several columns at once.
date_df.introduce = date_df.apply(get_introduce2, axis=1)
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day,age,introduce
0,2000-06-27,2000,6,27,22,I was born in 2000 my age is 22
1,2002-09-24,2002,9,24,20,I was born in 2002 my age is 20
2,2005-12-20,2005,12,20,17,I was born in 2005 my age is 17


# map( )으로 컬럼 추가 및 변경하기

In [111]:
date_df['year'] = date_df['yyyy-mm-dd'].map(year)
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day,age,introduce
0,2000-06-27,2000,6,27,22,I was born in 2000 my age is 22
1,2002-09-24,2002,9,24,20,I was born in 2002 my age is 20
2,2005-12-20,2005,12,20,17,I was born in 2005 my age is 17


In [112]:
# Passing a dictionary to map() makes it easy to replace column values:
# each existing value is looked up as a key in the dictionary and replaced
# with the corresponding value.

job_list =[{'age':20, 'job':'student'},
           {'age':30, 'job':'developer'},
           {'age':35, 'job':'teacher'}]

df = pd.DataFrame(job_list)
df

Unnamed: 0,age,job
0,20,student
1,30,developer
2,35,teacher


In [115]:
df.job =df.job.map({"student":1, "developer":2, "teacher":3 })
df

Unnamed: 0,age,job
0,20,1
1,30,2
2,35,3


### applymap( )메서드
* 데이터프레임 전체의 각각의 값을 한 번에 변경시킬 때 사용하면 유용.

In [116]:
x_y = [{'x':5.5, 'y':-5.6},
       {'x':-5.2, 'y':5.5},
       {'x':-1.6, 'y':-4.5}]

df = pd.DataFrame(x_y)
df

Unnamed: 0,x,y
0,5.5,-5.6
1,-5.2,5.5
2,-1.6,-4.5


In [117]:
# applymap() calls the function on every cell; np.around rounds each value.
# Halves go to the nearest even value: -4.5 becomes -4.0 in the output.
df = df.applymap(np.around)
df

Unnamed: 0,x,y
0,6.0,-6.0
1,-5.0,6.0
2,-2.0,-4.0


## 데이터프레임에 row추가하기

In [118]:
score_df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,Pass
1,Jenny,85,80,165,82.5,Pass
2,Nate,75,95,170,85.0,Pass
3,Bryan,55,45,100,50.0,Fail


In [119]:
df2 = pd.DataFrame([['Ben', 50, 50]],columns=['name','midterm','final'])
df2

Unnamed: 0,name,midterm,final
0,Ben,50,50


In [120]:
# Add df2's row below score_df. pd.concat is used because DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0.
# Columns that df2 lacks (total, average, grade) are filled with NaN, and
# ignore_index=True renumbers the rows. The result is only displayed here;
# score_df itself is not modified.
pd.concat([score_df, df2], ignore_index=True)

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180.0,90.0,Pass
1,Jenny,85,80,165.0,82.5,Pass
2,Nate,75,95,170.0,85.0,Pass
3,Bryan,55,45,100.0,50.0,Fail
4,Ben,50,50,,,


## groupby( )함수
* 데이터에서 정보를 취하기 위해서 그룹별로 묶는 방법.

In [123]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"}
               ]

stdf = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
stdf

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [126]:
# groupby() returns a DataFrameGroupBy object; displaying it only shows
# the object's type and memory address, not the grouped rows.
groupby_major = stdf.groupby('major')
groupby_major

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002617D882588>

In [127]:
groupby_major.groups  #딕셔너리 형태 관리

{'Computer Science': [0, 1, 6, 7], 'Economics': [4, 5, 9], 'Physics': [2], 'Psychology': [3, 8, 10]}

In [128]:
# Print each major with its member count, followed by the member rows.
for name, group in groupby_major:
    headline = " : ".join((name, str(group.shape[0])))
    print(headline)
    print(group)
    print()

Computer Science : 4
       name             major     sex
0      John  Computer Science    male
1      Nate  Computer Science    male
6  Jeniffer  Computer Science  female
7    Edward  Computer Science    male

Economics : 3
    name      major     sex
4  Janny  Economics  female
5   Yuna  Economics  female
9  Wendy  Economics  female

Physics : 1
      name    major   sex
2  Abraham  Physics  male

Psychology : 3
     name       major     sex
3   Brian  Psychology    male
8    Zara  Psychology  female
10   Sera  Psychology  female



In [130]:
# Turn the group object back into a regular DataFrame:
# size() gives the row count per major, and reset_index(name='count')
# moves 'major' into a column next to a 'count' column.
df_major_cnt = groupby_major.size().reset_index(name='count')
df_major_cnt

Unnamed: 0,major,count
0,Computer Science,4
1,Economics,3
2,Physics,1
3,Psychology,3


In [133]:
groupby_sex = stdf.groupby('sex')
groupby_sex

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002617D888948>

In [134]:
# Print each sex with its member count, followed by the member rows.
for name, group in groupby_sex:
    headline = " : ".join((name, str(group.shape[0])))
    print(headline)
    print(group)
    print()

female : 6
        name             major     sex
4      Janny         Economics  female
5       Yuna         Economics  female
6   Jeniffer  Computer Science  female
8       Zara        Psychology  female
9      Wendy         Economics  female
10      Sera        Psychology  female

male : 5
      name             major   sex
0     John  Computer Science  male
1     Nate  Computer Science  male
2  Abraham           Physics  male
3    Brian        Psychology  male
7   Edward  Computer Science  male



In [135]:
# Row count per sex as a regular DataFrame with 'sex' and 'count' columns.
df_sex_cnt = groupby_sex.size().reset_index(name='count')
df_sex_cnt

Unnamed: 0,sex,count
0,female,6
1,male,5


In [136]:
stdf

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


# 중복데이터 drop하기

In [139]:
# Insert a duplicate row so there is something to de-duplicate.
stdf2 = pd.DataFrame([['Zara', 'Psychology', 'female']], columns=['name', 'major', 'sex'])

# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in
# 2.0). ignore_index=True discards the existing index labels and renumbers
# the combined rows.
stdf = pd.concat([stdf, stdf2], ignore_index=True)
stdf

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [152]:
# 중복 데이터 확인
stdf.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
dtype: bool

In [147]:
stdf.drop_duplicates(keep='first')

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [148]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': None, 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': None},
                {'name': 'John', 'major': "Computer Science", 'sex': None},
                {'name': 'Nate', 'major': None, 'sex': "male"}
               ]

stdf = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
stdf

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,,female


In [151]:
# 컬럼이 똑같을 경우, 중복된데이터로 표시
stdf.duplicated(['name'])

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
12     True
dtype: bool

In [153]:
# keep='first' or keep='last' decides which of the duplicated rows survives.
# Here the last occurrence of each name is kept.
stdf.drop_duplicates(['name'], keep = 'last')  # keep defaults to 'first'

Unnamed: 0,name,major,sex
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,,female
10,Sera,Psychology,
11,John,Computer Science,


## None 처리하기

In [182]:
#Null 또는 NaN 확인하기 
friend_dict_list = [{'name':'Jone', 'age':20, 'job':'student'},
                    {'name':'Jenny', 'age':30, 'job':'developer'},
                    {'name':'Yuna', 'age':20, 'job':'teacher'},
                    {'name':'Nate', 'age':None, 'job':'teacher'},
                    {'name':'Bryan', 'age':12, 'job':'teacher'}]

df = pd.DataFrame(friend_dict_list)
df.head()

Unnamed: 0,name,age,job
0,Jone,20.0,student
1,Jenny,30.0,developer
2,Yuna,20.0,teacher
3,Nate,,teacher
4,Bryan,12.0,teacher


In [183]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    5 non-null      object 
 1   age     4 non-null      float64
 2   job     5 non-null      object 
dtypes: float64(1), object(2)
memory usage: 248.0+ bytes


In [184]:
df.describe() #  R의 summary 함수 개념

Unnamed: 0,age
count,4.0
mean,20.5
std,7.371115
min,12.0
25%,18.0
50%,20.0
75%,22.5
max,30.0


In [185]:
df.isna() #NaN 체크

Unnamed: 0,name,age,job
0,False,False,False
1,False,False,False
2,False,False,False
3,False,True,False
4,False,False,False


In [186]:
df.isnull() #null의 의미를 담고있는 데이터 ex)NaN, None, null 등..

Unnamed: 0,name,age,job
0,False,False,False
1,False,False,False
2,False,False,False
3,False,True,False
4,False,False,False


### Null 또는 NaN 값 변경.

In [180]:
# Example: replace Null (NaN) ages with 0.

# Work on a copy. `tmp = df` would only create a second name for the same
# object, so the fill would also overwrite the NaN in df and leave the
# following median example with nothing to fill.
tmp = df.copy()
tmp['age'] = tmp['age'].fillna(0)
tmp

Unnamed: 0,name,age,job
0,Jone,20.0,student
1,Jenny,30.0,developer
2,Yuna,20.0,teacher
3,Nate,0.0,teacher
4,Bryan,12.0,teacher


In [187]:
# Replace missing ages with the median age of rows that share the same job.

# transform('median') returns one value per row, aligned with df's index,
# so fillna can pick the matching group median for each missing age.
# The result is assigned back rather than using inplace=True on the selected
# column, which relies on chained assignment and does not update df under
# pandas copy-on-write.
df['age'] = df['age'].fillna(df.groupby('job')['age'].transform('median'))
df

Unnamed: 0,name,age,job
0,Jone,20.0,student
1,Jenny,30.0,developer
2,Yuna,20.0,teacher
3,Nate,16.0,teacher
4,Bryan,12.0,teacher


# Unique
* 컬럼에 여러값이 있을 때, 중복없이 어떤 값들이 있는지 확인하는 방법

In [189]:
job_list = [{'name': 'John', 'job': "teacher"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Fred', 'job': "teacher"},
                {'name': 'Abraham', 'job': "student"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Janny', 'job': "developer"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Obrian', 'job': "dentist"},
                {'name': 'Yuna', 'job': "teacher"},
                {'name': 'Rob', 'job': "lawyer"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Matt', 'job': "student"},
                {'name': 'Wendy', 'job': "banker"},
                {'name': 'Edward', 'job': "teacher"},
                {'name': 'Ian', 'job': "teacher"},
                {'name': 'Chris', 'job': "banker"},
                {'name': 'Philip', 'job': "lawyer"},
                {'name': 'Janny', 'job': "basketball player"},
                {'name': 'Gwen', 'job': "teacher"},
                {'name': 'Jessy', 'job': "student"}
         ]
df = pd.DataFrame(job_list, columns = ['name', 'job'])
df

Unnamed: 0,name,job
0,John,teacher
1,Nate,teacher
2,Fred,teacher
3,Abraham,student
4,Brian,student
5,Janny,developer
6,Nate,teacher
7,Obrian,dentist
8,Yuna,teacher
9,Rob,lawyer


In [190]:
# Use the column's (Series) unique() method to print every distinct value
# in the column, without repeats.

print(df.job.unique())

['teacher' 'student' 'developer' 'dentist' 'lawyer' 'banker'
 'basketball player']


In [191]:
# 각 unique한 값 별로 몇개의 데이터가 속하는 지 value_counts() 함수로 확인.

df.job.value_counts()

teacher              8
student              5
lawyer               2
banker               2
developer            1
dentist              1
basketball player    1
Name: job, dtype: int64

# 두개의 데이터 프레임 합치기

In [193]:
l1 = [{'name': 'John', 'job': "teacher"},
       {'name': 'Nate', 'job': "teacher"},
       {'name': 'Fred', 'job': "teacher"}]

l2 = [{'name': 'Abraham', 'job': "student"},
       {'name': 'Brian', 'job': "student"},
       {'name': 'Janny', 'job': "developer"}]

df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['name', 'job'])

In [194]:
# pd.concat(): add the rows of the second DataFrame below the first.
# ignore_index=True renumbers the combined rows, as the output shows (0-5).
frames = [df1, df2]
result = pd.concat(frames, ignore_index = True)
result

Unnamed: 0,name,job
0,John,teacher
1,Nate,teacher
2,Fred,teacher
3,Abraham,student
4,Brian,student
5,Janny,developer


In [196]:
# Add the rows of the second DataFrame below the first.
# This cell originally used df1.append(df2, ...); DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and pd.concat gives the same
# result.
frames2 = pd.concat([df1, df2], ignore_index=True)
frames2

Unnamed: 0,name,job
0,John,teacher
1,Nate,teacher
2,Fred,teacher
3,Abraham,student
4,Brian,student
5,Janny,developer


In [197]:
l1 = [{'name': 'John', 'job': "teacher"},
      {'name': 'Nate', 'job': "student"},
      {'name': 'Jack', 'job': "developer"}]

l2 = [{'age': 25, 'country': "U.S"},
      {'age': 30, 'country': "U.K"},
      {'age': 45, 'country': "Korea"}]

df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['age', 'country'])

In [198]:
# axis=1 places the frames side by side (column-wise).
# Because ignore_index=True is applied along that axis, the original column
# names are replaced by 0, 1, 2, 3 in the output below.
result = pd.concat([df1,df2],ignore_index = True,axis=1)
result

Unnamed: 0,0,1,2,3
0,John,teacher,25,U.S
1,Nate,student,30,U.K
2,Jack,developer,45,Korea


## 두개의 리스트를 묶어서 데이터프레임으로 생성

In [199]:
label = [1, 2, 3, 4, 5]
prediction = [1, 2, 2 , 5, 5]

comparision = pd.DataFrame({'label': label,
                             'prediction': prediction
                           })

comparision

Unnamed: 0,label,prediction
0,1,1
1,2,2
2,3,2
3,4,5
4,5,5
