In [None]:
!pip install -q condacolab

In [None]:
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:35
🔁 Restarting kernel...


In [None]:
!conda --version

conda 4.9.2


In [None]:
from google.colab import drive 
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd drive/MyDrive/Colab\ Notebooks/pandas-master

/content/drive/MyDrive/Colab Notebooks/pandas-master


In [None]:
ls

[0m[01;34mdata[0m/  friends.csv  Pandas_Cheatsheet.ipynb


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1.DataFrame, Series

In [None]:
# Series: a one-dimensional labelled array.
# Built through the public constructor `pd.Series` rather than the internal
# module path `pd.core.series.Series`; both names refer to the same class.
s1 = pd.Series([1, 2, 3])
s1

0    1
1    2
2    3
dtype: int64

In [None]:
type(s1)

pandas.core.series.Series

In [None]:
s2 = pd.Series(['one','two','three'])
s2

0      one
1      two
2    three
dtype: object

In [None]:
# DataFrame: place the two Series side by side; each dict key becomes a column name
pd.DataFrame(data={'num': s1, 'word': s2})

Unnamed: 0,num,word
0,1,one
1,2,two
2,3,three


## 2.Read File to DataFrame

In [None]:
# Load data stored on Drive (CSV)
df = pd.read_csv('data/friend_list.csv')

In [None]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [None]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [None]:
# Load data stored on Drive (txt); no separator is passed, so read_csv's default is used
df= pd.read_csv('data/friend_list.txt')

In [None]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [None]:
# Load data stored on Drive (txt, tab-separated): `sep` is the same option as `delimiter`
df = pd.read_csv('data/friend_list_tab.txt', sep="\t")

In [None]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [None]:
# When the file has no header row: header=None keeps the first line as data
# and the columns are numbered 0, 1, 2 instead of being named
df = pd.read_csv('data/friend_list_no_head.csv', header = None)
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [None]:
df.columns= ['name','age','job']
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [None]:
df = pd.read_csv('data/friend_list_no_head.csv', header = None, names =['name','age','job'])
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


## 3.Create DataFrame

### from dictionary

In [None]:
# Using dictionaries: one dict per row, the keys become the column names
friend_dict_list = [{'name' : 'John', 'age' : 25,'job':'student'},
                    {'name' : 'Nate', 'age' : 30,'job':'teacher'}
                    ]

In [None]:
df = pd.DataFrame(friend_dict_list)

In [None]:
df = df[['name','job','age']]
df.head()

Unnamed: 0,name,job,age
0,John,student,25
1,Nate,teacher,30


### from OrderedDict

In [None]:
from collections import OrderedDict

In [None]:
# Use an OrderedDict so the key (column) order is guaranteed:
# entries keep the order in which they are inserted below.
friend_ordered_dict = OrderedDict()
friend_ordered_dict['name'] = ['Jone', 'Nate']
friend_ordered_dict['age'] = [25, 30]
friend_ordered_dict['job'] = ['student', 'teacher']

In [None]:
df = pd.DataFrame.from_dict(friend_ordered_dict)
df

Unnamed: 0,name,age,job
0,Jone,25,student
1,Nate,30,teacher


### from list

In [None]:
# Using lists: one inner list per row (column names are supplied separately)
friend_list = [
      ['Jone',20,'student'],
      ['Nate',30,'teacher']
]

In [None]:
column_name = ['name','age','job']

In [None]:
df = pd.DataFrame.from_records(friend_list,columns=column_name)
df

Unnamed: 0,name,age,job
0,Jone,20,student
1,Nate,30,teacher


In [None]:
friend_list = [
      ['name',['Jone','Nate']],
      ['age',[25,30]],
      ['job',['student','teacher']]
]

In [None]:
df = pd.DataFrame.from_dict(dict(friend_list) )
df

Unnamed: 0,name,age,job
0,Jone,25,student
1,Nate,30,teacher


## 4.Write DataFrame to File

In [None]:
friends = [{'name':'Jone','age':20,'job':'student'},
           {'name':'Jenny','age':30,'job':None},
           {'name':'Nate','age':40,'job':'teacher'}
           ]
df = pd.DataFrame(friends)
df = df[['name','age','job']]
df

Unnamed: 0,name,age,job
0,Jone,20,student
1,Jenny,30,
2,Nate,40,teacher


In [None]:
df.to_csv('friends.csv',index = True, header = True, na_rep='-')

In [None]:
ls

[0m[01;34mdata[0m/  friends.csv  Pandas_Cheatsheet.ipynb


## 5.Select Row

### by index

In [None]:
# Each entry pairs a column name with that column's values; dict() turns the
# pairs into a column -> values mapping, which DataFrame accepts directly.
friends_list = [
    ['name', ['Jhon', 'Jenny', 'Nate']],
    ['age', [20, 30, 30]],
    ['job', ['student', 'developer', 'teacher']],
]
df = pd.DataFrame(dict(friends_list))
df

Unnamed: 0,name,age,job
0,Jhon,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [None]:
df[1:3]

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [None]:
# Keep the slice under its own name instead of overwriting `df`: the cells
# that follow (`df.loc[[0,2]]` and the age filters) need the full frame, and
# label 0 no longer exists once `df` is replaced by rows 1-2.
df_sliced = df[1:3]
df_sliced

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [None]:
df.loc[[0,2]]

Unnamed: 0,name,age,job
0,Jhon,20,student
2,Nate,30,teacher


### by column condition

In [None]:
df[df.age > 25]

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [None]:
df.query('age>25')

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [None]:
df[(df.age>25) & (df.name=='Nate')]

Unnamed: 0,name,age,job
2,Nate,30,teacher


## 6.Filter Column

### by index

In [None]:
friend_list = [['John',20, 'student'],
               ['Jenny',30, 'developer'],
               ['Nate',30, 'teacher']
]
df = pd.DataFrame.from_records(friend_list)
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [None]:
df.iloc[:,0:2]

Unnamed: 0,0,1
0,John,20
1,Jenny,30
2,Nate,30


In [None]:
df.iloc[0:2,0:2]

Unnamed: 0,0,1
0,John,20
1,Jenny,30


### by column name

In [None]:
df = pd.read_csv('data/friend_list_no_head.csv',header = None, names=['name','age','job'])
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [None]:
df_filtered = df[['name','age']]
df_filtered

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [None]:
df.filter(items=['age','job'])

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher
3,40,dentist
4,45,manager
5,25,intern


In [None]:
# axis=0 (index): operate along the rows
# axis=1 (columns): operate along the columns
# like='a' keeps the labels that contain the substring 'a' (here: name, age)
df.filter(like='a',axis=1)

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [None]:
# Regular expression: keep the columns whose name ends with 'b' (here: job)
df.filter(regex='b$',axis=1)

Unnamed: 0,job
0,student
1,developer
2,teacher
3,dentist
4,manager
5,intern


## 7.Drop rows

In [None]:
friends = [{'age' : 15, 'job' : 'student'},
           {'age' : 25, 'job' : 'developer'},
           {'age' : 30, 'job' : 'teacher'}]
df = pd.DataFrame(friends,index = ['John','Jenny', 'Nate'],
                  columns = ['age','job'])
df

Unnamed: 0,age,job
John,15,student
Jenny,25,developer
Nate,30,teacher


In [None]:
df = df.drop(['John','Nate'])
df

Unnamed: 0,age,job
Jenny,25,developer


In [None]:
# Rebuild the frame first: the previous cell already removed 'John' and
# 'Nate', so dropping them again would raise KeyError on a top-to-bottom run.
df = pd.DataFrame(friends, index=['John', 'Jenny', 'Nate'],
                  columns=['age', 'job'])
# inplace=True mutates `df` itself and returns None, so no reassignment is needed.
df.drop(['John', 'Nate'], inplace=True)

In [None]:
df

Unnamed: 0,age,job
Jenny,25,developer


In [None]:
friends = [{'name': 'John','age' : 15, 'job' : 'student'},
           {'name': 'Ben','age' : 25, 'job' : 'developer'},
           {'name': 'Jenny','age' : 30, 'job' : 'teacher'}]
df = pd.DataFrame(friends ,columns = ['name','age','job'])
df

Unnamed: 0,name,age,job
0,John,15,student
1,Ben,25,developer
2,Jenny,30,teacher


In [None]:
# Delete rows by position: df.index[[0,2]] looks up the labels found at
# positions 0 and 2, and drop() then removes those labels
df = df.drop(df.index[[0,2]])

In [None]:
df

Unnamed: 0,name,age,job
1,Ben,25,developer


In [None]:
# Start again from `friends`: after the positional drop above only 'Ben' is
# left in `df`, so filtering that frame could never show the two rows
# (Ben, Jenny) this example is meant to produce.
df = pd.DataFrame(friends, columns=['name', 'age', 'job'])
# Boolean mask: keep only the rows whose age is above 20.
df = df[df.age > 20]

In [None]:
df

Unnamed: 0,name,age,job
1,Ben,25,developer
2,Jenny,30,teacher


## 8.Drop column

In [None]:
friends = [{'name': 'John','age' : 15, 'job' : 'student'},
           {'name': 'Ben','age' : 25, 'job' : 'developer'},
           {'name': 'Jenny','age' : 30, 'job' : 'teacher'}]
df = pd.DataFrame(friends ,columns = ['name','age','job'])
df

Unnamed: 0,name,age,job
0,John,15,student
1,Ben,25,developer
2,Jenny,30,teacher


In [None]:
df = df.drop('age',axis=1)

In [None]:
df

Unnamed: 0,name,job
0,John,student
1,Ben,developer
2,Jenny,teacher


In [None]:
# Rebuild the frame first: the 'age' column was already dropped by the
# earlier `df = df.drop('age',axis=1)` cell, so dropping it again would raise
# KeyError on a top-to-bottom run.
df = pd.DataFrame(friends, columns=['name', 'age', 'job'])
# Same column removal, but mutating `df` in place (the call returns None).
df.drop('age', axis=1, inplace=True)

In [None]:
df

Unnamed: 0,name,job
0,John,student
1,Ben,developer
2,Jenny,teacher


## 7.Add Column,Row / Update Column,Row

### column

In [None]:
friends_dict_list = [
                     {'name':'Jone','age':15,'job':'student'},   
                     {'name':'Jenny','age':30,'job':'developer'},   
                     {'name':'Nate','age':30,'job':'teacher'}
                     ]
df = pd.DataFrame(friends_dict_list, columns=['name','age','job'])
df

Unnamed: 0,name,age,job
0,Jone,15,student
1,Jenny,30,developer
2,Nate,30,teacher


In [None]:
df['salary' ]=0
df

Unnamed: 0,name,age,job,salary
0,Jone,15,student,0
1,Jenny,30,developer,0
2,Nate,30,teacher,0


In [None]:
# Replace the placeholder salary: students get 'no', every other job gets 'yes'
df['salary'] = np.where(df['job'] == 'student', 'no', 'yes')

In [None]:
df

Unnamed: 0,name,age,job,salary
0,Jone,15,student,no
1,Jenny,30,developer,yes
2,Nate,30,teacher,yes


In [None]:
friends_dict_list = [
                     {'name':'Jone','midterm':95,'final':85},   
                     {'name':'Jenny','midterm':85,'final':80},   
                     {'name':'Nate','midterm':30,'final':10}
                     ]
df = pd.DataFrame(friends_dict_list, columns=['name','midterm','final'])
df

Unnamed: 0,name,midterm,final
0,Jone,95,85
1,Jenny,85,80
2,Nate,30,10


In [None]:
df['total'] = df['midterm'] + df['final']
df

Unnamed: 0,name,midterm,final,total
0,Jone,95,85,180
1,Jenny,85,80,165
2,Nate,30,10,40


In [None]:
df['average'] =df['total']/2
df

Unnamed: 0,name,midterm,final,total,average
0,Jone,95,85,180,90.0
1,Jenny,85,80,165,82.5
2,Nate,30,10,40,20.0


In [None]:
# Letter grade per student from the average score:
# 90 and above is 'A', 80 and above is 'B', anything lower is 'F'.
grades = ['A' if avg >= 90 else 'B' if avg >= 80 else 'F'
          for avg in df['average']]

df['grade'] = grades
df

Unnamed: 0,name,midterm,final,total,average,grade
0,Jone,95,85,180,90.0,A
1,Jenny,85,80,165,82.5,B
2,Nate,30,10,40,20.0,F


In [None]:
def pass_or_fail(row):
  """Map a grade to "Pass" or "Fail": only the value 'F' counts as failing."""
  return "Pass" if row != 'F' else "Fail"

In [None]:
# Replace each letter grade with "Pass"/"Fail".
# NOTE(review): this overwrites the `grade` column, so re-running the cell
# feeds "Fail" back into pass_or_fail, which only treats 'F' as failing.
df.grade = df.grade.apply(pass_or_fail)

In [None]:
df

Unnamed: 0,name,midterm,final,total,average,grade
0,Jone,95,85,180,90.0,Pass
1,Jenny,85,80,165,82.5,Pass
2,Nate,30,10,40,20.0,Fail


In [None]:
date_list=[
           {
            'yyyy-mm-dd':'2000-06-26'   
           },
           {
            'yyyy-mm-dd':'2000-07-26'   
           }
]

df = pd.DataFrame(date_list,columns=['yyyy-mm-dd'])
df

Unnamed: 0,yyyy-mm-dd
0,2000-06-26
1,2000-07-26


In [None]:
def extract_year(row):
  """Return the text before the first '-' of a 'yyyy-mm-dd' string (the year part)."""
  year, _, _ = row.partition('-')
  return year

In [None]:
df['year']=df['yyyy-mm-dd'].apply(extract_year)
df

Unnamed: 0,yyyy-mm-dd,year
0,2000-06-26,2000
1,2000-07-26,2000


### Row

In [None]:
friends_dict_list = [
                     {'name':'Jone','midterm':95,'final':85},   
                     {'name':'Jenny','midterm':85,'final':80},   
                     {'name':'Nate','midterm':30,'final':10}
                     ]
df = pd.DataFrame(friends_dict_list, columns=['name','midterm','final'])
df

Unnamed: 0,name,midterm,final
0,Jone,95,85
1,Jenny,85,80
2,Nate,30,10


In [None]:
df2 = pd.DataFrame([['Ben',50,50]],columns=['name','midterm','final'])
df2

Unnamed: 0,name,midterm,final
0,Ben,50,50


In [None]:
# Add the new row by concatenating the two frames. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat gives the same
# result. ignore_index=True renumbers the rows 0..n-1. The result is only
# displayed here; `df` itself is left unchanged.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,name,midterm,final
0,Jone,95,85
1,Jenny,85,80
2,Nate,30,10
3,Ben,50,50


## 8.Group by

In [None]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"}
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [None]:
groupby_major = df.groupby('major')
groupby_major.groups

{'Computer Science': [0, 1, 6, 7], 'Economics': [4, 5, 9], 'Physics': [2], 'Psychology': [3, 8, 10]}

In [None]:
# For every major: print its name with the member count, then the member rows
for name, group in groupby_major:
  print(f"{name} : {len(group)}")
  print(group)
  print('-----')

Computer Science : 4
       name             major     sex
0      John  Computer Science    male
1      Nate  Computer Science    male
6  Jeniffer  Computer Science  female
7    Edward  Computer Science    male
-----
Economics : 3
    name      major     sex
4  Janny  Economics  female
5   Yuna  Economics  female
9  Wendy  Economics  female
-----
Physics : 1
      name    major   sex
2  Abraham  Physics  male
-----
Psychology : 3
     name       major     sex
3   Brian  Psychology    male
8    Zara  Psychology  female
10   Sera  Psychology  female
-----


In [None]:
# Group sizes as a flat table: `major` as a regular column next to a `count` column
df_major_cnt = groupby_major.size().reset_index(name='count')
df_major_cnt

Unnamed: 0,major,count
0,Computer Science,4
1,Economics,3
2,Physics,1
3,Psychology,3


In [None]:
groupby_sex = df.groupby('sex')

In [None]:
# For every sex: print the group key with its size, then the rows in the group
for name, group in groupby_sex:
  print(name, ":", len(group))
  print(group)
  print('-----')

female : 6
        name             major     sex
4      Janny         Economics  female
5       Yuna         Economics  female
6   Jeniffer  Computer Science  female
8       Zara        Psychology  female
9      Wendy         Economics  female
10      Sera        Psychology  female
-----
male : 5
      name             major   sex
0     John  Computer Science  male
1     Nate  Computer Science  male
2  Abraham           Physics  male
3    Brian        Psychology  male
7   Edward  Computer Science  male
-----


In [None]:
# Group sizes as a flat table: name the size Series 'count', then move the
# `sex` index into a regular column
df_sex_cnt = groupby_sex.size().rename('count').reset_index()
df_sex_cnt

Unnamed: 0,sex,count
0,female,6
1,male,5


## 9.Drop Duplicate

In [None]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"},
                {'name': 'John', 'major': "Computer Science", 'sex': "male"},
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [None]:
# True marks a duplicated row (a repeat of an earlier, identical row)
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
dtype: bool

In [None]:
df.drop_duplicates()

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [None]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Nate', 'major': None, 'sex': "male"},
                {'name': 'John', 'major': "Computer Science", 'sex': None},
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [None]:
df.duplicated(['name'])

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10     True
11     True
dtype: bool

In [None]:
# Rows count as duplicates when their `name` matches.
# keep='first' (the commented alternative) would keep the first row per name;
# keep='last' keeps the last row per name instead.
# df.drop_duplicates(['name'],keep='first')
df.drop_duplicates(['name'],keep='last')

Unnamed: 0,name,major,sex
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female
10,Nate,,male
11,John,Computer Science,


## 10.How to handle None (missing) values

In [None]:
school_id_list = [{'name': 'John', 'job': "teacher", 'age': 40},
                {'name': 'Nate', 'job': "teacher", 'age': 35},
                {'name': 'Yuna', 'job': "teacher", 'age': 37},
                {'name': 'Abraham', 'job': "student", 'age': 10},
                {'name': 'Brian', 'job': "student", 'age': 12},
                {'name': 'Janny', 'job': "student", 'age': 11},
                {'name': 'Nate', 'job': "teacher", 'age': None},
                {'name': 'John', 'job': "student", 'age': None}
         ]
df = pd.DataFrame(school_id_list, columns = ['name', 'job', 'age'])
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Janny,student,11.0
6,Nate,teacher,
7,John,student,


In [None]:
# 8 rows (and 3 columns)
df.shape

(8, 3)

In [None]:
# `age` has 2 missing (None) values: only 6 of the 8 entries are non-null
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    8 non-null      object 
 1   job     8 non-null      object 
 2   age     6 non-null      float64
dtypes: float64(1), object(2)
memory usage: 320.0+ bytes


In [None]:
df.isna()

Unnamed: 0,name,job,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,True
7,False,False,True


In [None]:
df.isnull()

Unnamed: 0,name,job,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,True
7,False,False,True


In [None]:
df.age = df.age.fillna(0)

In [None]:
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Janny,student,11.0
6,Nate,teacher,0.0
7,John,student,0.0


In [None]:
school_id_list = [{'name': 'John', 'job': "teacher", 'age': 40},
                {'name': 'Nate', 'job': "teacher", 'age': 35},
                {'name': 'Yuna', 'job': "teacher", 'age': 37},
                {'name': 'Abraham', 'job': "student", 'age': 10},
                {'name': 'Brian', 'job': "student", 'age': 12},
                {'name': 'Janny', 'job': "student", 'age': 11},
                {'name': 'Nate', 'job': "teacher", 'age': None},
                {'name': 'John', 'job': "student", 'age': None}
         ]
df = pd.DataFrame(school_id_list, columns = ['name', 'job', 'age'])
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Janny,student,11.0
6,Nate,teacher,
7,John,student,


In [None]:
# Fill each missing age with the median age of that person's job group.
# transform('median') returns a Series aligned with df's index, so it can be
# passed straight to fillna. The result is assigned back rather than calling
# fillna(inplace=True) on the `df['age']` selection: under pandas
# Copy-on-Write that selection is a temporary object and `df` would stay
# unchanged.
df['age'] = df['age'].fillna(df.groupby('job')['age'].transform('median'))

In [None]:
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Janny,student,11.0
6,Nate,teacher,37.0
7,John,student,11.0


## 11.Apply

In [None]:
data_list = [{'yyyy-mm-dd' : '2000-06-27'},
             {'yyyy-mm-dd' : '2002-09-24'},
             {'yyyy-mm-dd' : '2005-12-20'}]
df = pd.DataFrame(data_list, columns=['yyyy-mm-dd'])
df

Unnamed: 0,yyyy-mm-dd
0,2000-06-27
1,2002-09-24
2,2005-12-20


In [None]:
def extract_year(column):
  """Return the leading piece of a '-'-separated date string (the 'yyyy' part)."""
  parts = column.split("-")
  return parts[0]

In [None]:
# Apply the function to every row of the column
df['year'] = df['yyyy-mm-dd'].apply(extract_year)

In [None]:
df

Unnamed: 0,yyyy-mm-dd,year
0,2000-06-27,2000
1,2002-09-24,2002
2,2005-12-20,2005


In [None]:
def get_age(year,current_year):
  """Return current_year minus the birth year; `year` is converted with int() first."""
  birth_year = int(year)
  return current_year - birth_year

In [None]:
df['age'] = df['year'].apply(get_age,current_year=2018)
df

Unnamed: 0,yyyy-mm-dd,year,age
0,2000-06-27,2000,18
1,2002-09-24,2002,16
2,2005-12-20,2005,13


In [None]:
def get_introduce(age,prefix,suffix):
  """Build a sentence by placing str(age) between prefix and suffix."""
  return "".join([prefix, str(age), suffix])

In [None]:
df['introduce']= df['age'].apply(get_introduce,prefix="I am ",suffix=" years old")
df

Unnamed: 0,yyyy-mm-dd,year,age,introduce
0,2000-06-27,2000,18,I am 18 years old
1,2002-09-24,2002,16,I am 16 years old
2,2005-12-20,2005,13,I am 13 years old


In [None]:
def get_introduce_2(row):
  """Describe one row using its `year` and `age` attributes (both rendered with str())."""
  return f"I was born in {row.year!s} my age is {row.age!s}"

In [None]:
df.introduce = df.apply(get_introduce_2, axis=1)
df

Unnamed: 0,yyyy-mm-dd,year,age,introduce
0,2000-06-27,2000,18,I was born in 2000 my age is 18
1,2002-09-24,2002,16,I was born in 2002 my age is 16
2,2005-12-20,2005,13,I was born in 2005 my age is 13


## 12.map,applymap

In [None]:
data_list = [{'date' : '2000-06-27'},
             {'date' : '2002-09-24'},
             {'date' : '2005-12-20'}]
df = pd.DataFrame(data_list, columns=['date'])
df

Unnamed: 0,date
0,2000-06-27
1,2002-09-24
2,2005-12-20


In [None]:
def extract_year(date):
  """Return everything before the first '-' in `date` (the year of a 'yyyy-mm-dd' string)."""
  return date.split('-', 1)[0]

In [None]:
df['year'] = df['date'].map(extract_year)

In [None]:
df

Unnamed: 0,date,year
0,2000-06-27,2000
1,2002-09-24,2002
2,2005-12-20,2005


In [None]:
data_list = [{'age' : 20, 'job' : 'student'},
             {'age' : 30, 'job' : 'developer'},
             {'age' : 30, 'job' : 'teacher'}]
df = pd.DataFrame(data_list)
df

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher


In [None]:
# Series.map with a dict: each job name is replaced by its numeric code
df.job = df.job.map({'student':1,'developer':2,'teacher':3})

In [None]:
df

Unnamed: 0,age,job
0,20,1
1,30,2
2,30,3


In [None]:
x_y = [{'x':5.5,'y':-5.6,'z':-1.1},
       {'x':-5.2,'y':5.5,'z':-2.2},
       {'x':-1.6,'y':-4.5,'z':-3.3}]
df = pd.DataFrame(x_y)
df

Unnamed: 0,x,y,z
0,5.5,-5.6,-1.1
1,-5.2,5.5,-2.2
2,-1.6,-4.5,-3.3


In [None]:
# Apply np.around to every element of the frame.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map — TODO confirm the target pandas version before switching.
df = df.applymap(np.around)
df

Unnamed: 0,x,y,z
0,6.0,-6.0,-1.0
1,-5.0,6.0,-2.0
2,-2.0,-4.0,-3.0


## 13.Unique

In [None]:
job_list = [{'name': 'John', 'job': "teacher"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Fred', 'job': "teacher"},
                {'name': 'Abraham', 'job': "student"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Janny', 'job': "developer"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Obrian', 'job': "dentist"},
                {'name': 'Yuna', 'job': "teacher"},
                {'name': 'Rob', 'job': "lawyer"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Matt', 'job': "student"},
                {'name': 'Wendy', 'job': "banker"},
                {'name': 'Edward', 'job': "teacher"},
                {'name': 'Ian', 'job': "teacher"},
                {'name': 'Chris', 'job': "banker"},
                {'name': 'Philip', 'job': "lawyer"},
                {'name': 'Janny', 'job': "basketball player"},
                {'name': 'Gwen', 'job': "teacher"},
                {'name': 'Jessy', 'job': "student"}
         ]
df = pd.DataFrame(job_list, columns = ['name', 'job'])
df

Unnamed: 0,name,job
0,John,teacher
1,Nate,teacher
2,Fred,teacher
3,Abraham,student
4,Brian,student
5,Janny,developer
6,Nate,teacher
7,Obrian,dentist
8,Yuna,teacher
9,Rob,lawyer


In [None]:
df.job.unique()

array(['teacher', 'student', 'developer', 'dentist', 'lawyer', 'banker',
       'basketball player'], dtype=object)

In [None]:
df.job.value_counts()

teacher              8
student              5
banker               2
lawyer               2
dentist              1
developer            1
basketball player    1
Name: job, dtype: int64

## 14.Concatenate two DataFrames

In [None]:
l1 = [{'name': 'John', 'job': "teacher"},
      {'name': 'Nate', 'job': "student"},
      {'name': 'Fred', 'job': "developer"}]

l2 = [{'name': 'Ed', 'job': "dentist"},
      {'name': 'Jack', 'job': "farmer"},
      {'name': 'Ted', 'job': "designer"}]

l3 = [{'name': 'John', 'job': "teacher"},
      {'name': 'Nate', 'job': "student"},
      {'name': 'Yuna', 'job': "developer"}]

l4 = [{'age': 25, 'country': "U.S"},
      {'age': 30, 'country': "U.K"},
      {'age': 45, 'country': "Korea"}]
         
df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['name', 'job'])

In [None]:
df1

Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Fred,developer


In [None]:
df2

Unnamed: 0,name,job
0,Ed,dentist
1,Jack,farmer
2,Ted,designer


In [None]:
result = pd.concat([df1,df2],ignore_index=True)
result

Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Fred,developer
3,Ed,dentist
4,Jack,farmer
5,Ted,designer


In [None]:
# Same row-wise combination as the pd.concat cell just before this one.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so
# pd.concat is used here as well; ignore_index=True renumbers the rows 0..n-1.
result = pd.concat([df1, df2], ignore_index=True)
result

Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Fred,developer
3,Ed,dentist
4,Jack,farmer
5,Ted,designer


In [None]:
df1 = pd.DataFrame(l3, columns = ['name', 'job'])
df2 = pd.DataFrame(l4, columns = ['age', 'country'])

In [None]:
df1

Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Yuna,developer


In [None]:
df2

Unnamed: 0,age,country
0,25,U.S
1,30,U.K
2,45,Korea


In [None]:
# axis=1 places the two frames side by side (column-wise).
# NOTE(review): with axis=1, ignore_index=True replaces the column names with
# 0..3 (see the output below); drop it to keep name/job/age/country.
result = pd.concat([df1,df2],axis=1,ignore_index=True)
result

Unnamed: 0,0,1,2,3
0,John,teacher,25,U.S
1,Nate,student,30,U.K
2,Yuna,developer,45,Korea


In [None]:
# Combine two lists into one DataFrame, one column per list
label = [1, 2, 3, 4, 5]
prediction = [1, 2, 2, 5, 5]

comparison = pd.DataFrame(dict(label=label, prediction=prediction))

comparison

Unnamed: 0,label,prediction
0,1,1
1,2,2
2,3,2
3,4,5
4,5,5
