In [151]:
# Explore how pandas can be used to clean up the various kinds of imperfections found in a text dataset.
import pandas as pd

In [152]:
# Load the food-inspection CSV file into a DataFrame and show it as the cell output.
inspections = pd.read_csv("chicago_food_inspections.csv")
inspections

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JETS PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)
...,...,...
153805,WOLCOTT'S,Risk 1 (High)
153806,DUNKIN DONUTS/BASKIN-ROBBINS,Risk 2 (Medium)
153807,Cafe 608,Risk 1 (High)
153808,mr.daniel's,Risk 1 (High)


In [153]:
# Preview the first rows of the "Name" column.
inspections["Name"].head()

0     MARRIOT MARQUIS CHICAGO   
1                    JETS PIZZA 
2                     ROOM 1520 
3      MARRIOT MARQUIS CHICAGO  
4                  CHARTWELLS   
Name: Name, dtype: object

In [154]:
# The `values` attribute exposes the underlying NumPy ndarray; its repr quotes
# every string, which makes the leading and trailing whitespace visible.
inspections["Name"].head().values

array([' MARRIOT MARQUIS CHICAGO   ', ' JETS PIZZA ', '   ROOM 1520 ',
       '  MARRIOT MARQUIS CHICAGO  ', ' CHARTWELLS   '], dtype=object)

In [155]:
# The `str` attribute gives access to the Series' StringMethods accessor.
inspections["Name"].str

<pandas.core.strings.accessor.StringMethods at 0x168ad89a0>

In [156]:
# Sample string padded with whitespace on both sides.
dessert = "  cheesecake "
# Trim the whitespace on the left-hand side only.
str.lstrip(dessert)



'cheesecake '

In [157]:
# Trim the whitespace on the right-hand side only.
dessert.rstrip()

'  cheesecake'

In [158]:
# Remove whitespace from both ends.
dessert.strip()

'cheesecake'

In [159]:
# Vectorized lstrip: remove the leading whitespace from every name.
inspections["Name"].str.lstrip().head()

0    MARRIOT MARQUIS CHICAGO   
1                   JETS PIZZA 
2                    ROOM 1520 
3     MARRIOT MARQUIS CHICAGO  
4                 CHARTWELLS   
Name: Name, dtype: object

In [160]:
# Vectorized rstrip: remove the trailing whitespace from every name.
inspections["Name"].str.rstrip().head()

0      MARRIOT MARQUIS CHICAGO
1                   JETS PIZZA
2                    ROOM 1520
3      MARRIOT MARQUIS CHICAGO
4                   CHARTWELLS
Name: Name, dtype: object

In [161]:
# Strip whitespace from both ends of every name, then write the cleaned
# values back over the original column.
trimmed_names = inspections["Name"].str.strip()
inspections["Name"] = trimmed_names

In [162]:
# List the column labels of the DataFrame.
inspections.columns

Index(['Name', 'Risk'], dtype='object')

In [163]:
# Apply the same whitespace stripping to every column instead of naming each one.
# NOTE(review): this assumes every column holds strings — confirm before
# reusing the loop on a frame that contains other dtypes.
for column in inspections.columns:
    inspections[column] = inspections[column].str.strip()

In [164]:
# Convert every name to lowercase (returns a new Series; the column is not modified).
inspections["Name"].str.lower().head()

0    marriot marquis chicago
1                 jets pizza
2                  room 1520
3    marriot marquis chicago
4                 chartwells
Name: Name, dtype: object

In [165]:
# Small example Series used to demonstrate case conversion.
steaks = pd.Series(data=["porterhouse", "filet mignon", "ribeye"])
steaks

0     porterhouse
1    filet mignon
2          ribeye
dtype: object

In [166]:
# Convert every value to uppercase.
steaks.str.upper()

0     PORTERHOUSE
1    FILET MIGNON
2          RIBEYE
dtype: object

In [167]:
# Capitalize only the first character of each string; the rest becomes lowercase.
inspections["Name"].str.capitalize()

0              Marriot marquis chicago
1                           Jets pizza
2                            Room 1520
3              Marriot marquis chicago
4                           Chartwells
                      ...             
153805                       Wolcott's
153806    Dunkin donuts/baskin-robbins
153807                        Cafe 608
153808                     Mr.daniel's
153809                      Tempo cafe
Name: Name, Length: 153810, dtype: object

In [168]:
# Capitalize the first letter of every word.
# Note that a letter following an apostrophe is capitalized too (e.g. "Wolcott'S").
inspections["Name"].str.title()

0              Marriot Marquis Chicago
1                           Jets Pizza
2                            Room 1520
3              Marriot Marquis Chicago
4                           Chartwells
                      ...             
153805                       Wolcott'S
153806    Dunkin Donuts/Baskin-Robbins
153807                        Cafe 608
153808                     Mr.Daniel'S
153809                      Tempo Cafe
Name: Name, Length: 153810, dtype: object

In [169]:
# Preview the first rows of the "Risk" column.
inspections["Risk"].head()

0      Risk 1 (High)
1    Risk 2 (Medium)
2       Risk 3 (Low)
3      Risk 1 (High)
4      Risk 1 (High)
Name: Risk, dtype: object

In [170]:
# Does every row really share the same format? Start by checking the row count.
len(inspections)

153810

In [171]:
# Two values do not follow the usual format: NaN and the string 'All'.
# We have to decide whether they are significant or can be ignored.
# As a compromise, this example drops the missing (NaN) rows and treats 'All' as 'Risk 4 (Extreme)'.
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

In [172]:
# Keep only the rows whose "Risk" value is present.
inspections = inspections.dropna(subset = ["Risk"])

In [173]:
# Confirm that NaN no longer appears among the distinct "Risk" values.
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All'],
      dtype=object)

In [174]:
# Replace the placeholder "All" with an explicit risk level.
# `to_replace` is given as a nested mapping {column: {old: new}} so that the
# replacement is limited to the "Risk" column; a business whose "Name" happens
# to be exactly "All" is left untouched.
inspections = inspections.replace(
    to_replace={"Risk": {"All": "Risk 4 (Extreme)"}}
)

In [175]:
# Confirm that 'All' has been replaced by 'Risk 4 (Extreme)'.
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)'], dtype=object)

In [176]:
# Extract the single character at index 5 — the digit in values such as 'Risk 1 (High)'.
inspections["Risk"].str.slice(5,6)

0         1
1         2
2         3
3         1
4         1
         ..
153805    1
153806    2
153807    1
153808    1
153809    1
Name: Risk, Length: 153744, dtype: object

In [177]:
# The same extraction written with slice syntax on the str accessor.
inspections["Risk"].str[5:6].head()

0    1
1    2
2    3
3    1
4    1
Name: Risk, dtype: object

In [178]:
# Take everything from index 8 up to (but not including) the last character,
# i.e. the label written inside the parentheses.
inspections["Risk"].str[8:-1]

0           High
1         Medium
2            Low
3           High
4           High
           ...  
153805      High
153806    Medium
153807      High
153808      High
153809      High
Name: Risk, Length: 153744, dtype: object

In [179]:
"Pizza" in "Jets Pizza"

True

In [180]:
# The biggest pitfall of substring search is that it is case-sensitive.
# Normalize the case of every column value before checking for a substring.
"pizza" in "Jets Pizza"

False

In [181]:
# Lower-casing is done on a temporary Series: the "Name" column keeps its
# original casing and the DataFrame itself is never modified.
lowercase_names = inspections["Name"].str.lower()
has_pizza = lowercase_names.str.contains("pizza")
inspections.loc[has_pizza]

Unnamed: 0,Name,Risk
1,JETS PIZZA,Risk 2 (Medium)
19,NANCY'S HOME OF STUFFED PIZZA,Risk 1 (High)
27,"NARY'S GRILL & PIZZA ,INC.",Risk 1 (High)
29,NARYS GRILL & PIZZA,Risk 1 (High)
68,COLUTAS PIZZA,Risk 1 (High)
...,...,...
153756,ANGELO'S STUFFED PIZZA CORP,Risk 1 (High)
153764,COCHIAROS PIZZA #2,Risk 1 (High)
153772,FERNANDO'S MEXICAN GRILL & PIZZA,Risk 1 (High)
153788,REGGIO'S PIZZA EXPRESS,Risk 1 (High)


In [182]:
# Select every restaurant whose name starts with "tacos", ignoring case.
lowercase_names = inspections["Name"].str.lower()
start_with_tacos = lowercase_names.str.startswith("tacos")
inspections.loc[start_with_tacos]

Unnamed: 0,Name,Risk
69,TACOS NIETOS,Risk 1 (High)
556,TACOS EL TIO 2 INC.,Risk 1 (High)
675,TACOS DON GABINO,Risk 1 (High)
958,TACOS EL TIO 2 INC.,Risk 1 (High)
1036,TACOS EL TIO 2 INC.,Risk 1 (High)
...,...,...
143587,TACOS DE LUNA,Risk 1 (High)
144026,TACOS GARCIA,Risk 1 (High)
146174,Tacos Place's 1,Risk 1 (High)
147810,TACOS MARIO'S LIMITED,Risk 1 (High)


In [183]:
# Select every restaurant whose name ends with "tacos", ignoring case.
lowercase_names = inspections["Name"].str.lower()
end_with_tacos = lowercase_names.str.endswith("tacos")
inspections.loc[end_with_tacos]

Unnamed: 0,Name,Risk
382,LAZO'S TACOS,Risk 1 (High)
569,LAZO'S TACOS,Risk 1 (High)
2652,FLYING TACOS,Risk 3 (Low)
3250,JONY'S TACOS,Risk 1 (High)
3812,PACO'S TACOS,Risk 1 (High)
...,...,...
151121,REYES TACOS,Risk 1 (High)
151318,EL MACHO TACOS,Risk 1 (High)
151801,EL MACHO TACOS,Risk 1 (High)
153087,RAYMOND'S TACOS,Risk 1 (High)


In [184]:
# Load the customers CSV file into a DataFrame and show it as the cell output.
customers = pd.read_csv("customers.csv")
customers

Unnamed: 0,Name,Address
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire..."
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,..."
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495"
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991"
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7..."
...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex..."
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ..."
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg..."
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916"


In [185]:
# Number of characters in each name.
customers["Name"].str.len()

0       13
1       17
2       15
3       19
4       14
        ..
9956    13
9957    15
9958    10
9959    16
9960    16
Name: Name, Length: 9961, dtype: int64

In [186]:
# Break a phone number into its parts at every dash.
phone_number = "555-123-4567"
phone_number.split(sep="-")

['555', '123', '4567']

In [187]:
# The following two lines produce the same result
# (only the last expression is shown as the cell output).
customers["Name"].str.split(pat = " ").head()
customers["Name"].str.split(" ").head()

0           [Frank, Manning]
1       [Elizabeth, Johnson]
2         [Donald, Stephens]
3    [Michael, Vincent, III]
4          [Jasmine, Zamora]
Name: Name, dtype: object

In [188]:
# Index 3 yields 3: suffixes such as 'MD' or 'Jr' mean that some names consist of three or more words,
# as with Michael, Vincent, III.
customers["Name"].str.split(" ").str.len().head()

0    2
1    2
2    2
3    3
4    2
Name: Name, dtype: int64

In [189]:
# Capping the number of splits at 1 splits the string at the first space and then stops,
# which yields a Series of two-element lists.
customers["Name"].str.split(pat = " ", n = 1).head()

0          [Frank, Manning]
1      [Elizabeth, Johnson]
2        [Donald, Stephens]
3    [Michael, Vincent III]
4         [Jasmine, Zamora]
Name: Name, dtype: object

In [190]:
# str.get pulls one element out of each list; index 0 is the first name.
customers["Name"].str.split(pat = " ", n = 1 ).str.get(0)

0           Frank
1       Elizabeth
2          Donald
3         Michael
4         Jasmine
          ...    
9956         Dana
9957       Amanda
9958         Eric
9959       Taylor
9960       Sherry
Name: Name, Length: 9961, dtype: object

In [191]:
# Index -1 selects the last element of each list: everything after the first space.
customers["Name"].str.split(pat = " ", n = 1 ).str.get(-1)

0           Manning
1           Johnson
2          Stephens
3       Vincent III
4            Zamora
           ...     
9956       Browning
9957       Anderson
9958          Davis
9959      Hernandez
9960      Nicholson
Name: Name, Length: 9961, dtype: object

In [192]:
# Passing expand=True makes the method return a new DataFrame instead of a Series of lists.
customers["Name"].str.split(pat = " ", n = 1, expand = True)

Unnamed: 0,0,1
0,Frank,Manning
1,Elizabeth,Johnson
2,Donald,Stephens
3,Michael,Vincent III
4,Jasmine,Zamora
...,...,...
9956,Dana,Browning
9957,Amanda,Anderson
9958,Eric,Davis
9959,Taylor,Hernandez


In [193]:
# Without a split limit a third column appears for names with three words;
# names with fewer words leave that column empty.
customers["Name"].str.split(pat = " ", expand = True).head()

Unnamed: 0,0,1,2
0,Frank,Manning,
1,Elizabeth,Johnson,
2,Donald,Stephens,
3,Michael,Vincent,III
4,Jasmine,Zamora,


In [194]:
# Split each name once, at the first space, and store the two halves as new columns.
name_parts = customers["Name"].str.split(pat=" ", n=1, expand=True)
customers[["First Name", "Last Name"]] = name_parts
customers


Unnamed: 0,Name,Address,First Name,Last Name
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [195]:
# "Name" has been split into "First Name" and "Last Name", so remove the original column.
customers = customers.drop(columns="Name")
customers

Unnamed: 0,Address,First Name,Last Name
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...
9956,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [196]:
# Display the current state of the customers DataFrame.
customers

Unnamed: 0,Address,First Name,Last Name
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...
9956,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [207]:
# Split each lower-cased address into its four parts (street, city, state, zip code).
# The separator is ", " (comma followed by a space): splitting on "," alone
# would leave a leading space in the city, state and zip code values.
customers[["도로","시","주","우편번호"]] = customers["Address"].str.lower().str.split(", ", expand = True)
# drop returns a new DataFrame, so this line only displays the result without
# "Address"; `customers` itself still holds the column.
customers.drop(labels = "Address", axis = 1)

Unnamed: 0,First Name,Last Name,도로,시,주,우편번호
0,Frank,Manning,6461 quinn groves,east matthew,new hampshire,16656
1,Elizabeth,Johnson,1360 tracey ports apt. 419,kyleport,vermont,31924
2,Donald,Stephens,19120 fleming manors,prestonstad,montana,23495
3,Michael,Vincent III,441 olivia creek,jimmymouth,georgia,82991
4,Jasmine,Zamora,4246 chelsey ford apt. 310,karamouth,utah,76252
...,...,...,...,...,...,...
9956,Dana,Browning,762 andrew views apt. 254,north paul,new mexico,28889
9957,Amanda,Anderson,44188 day crest apt. 901,lake marcia,maine,37378
9958,Eric,Davis,73015 michelle squares,watsonville,west virginia,03933
9959,Taylor,Hernandez,129 keith greens,haleyfurt,oklahoma,98916
