In [2]:
import pandas as pd
import numpy as np

In [3]:
s = pd.Series(["a", "b", "c"])
s

0    a
1    b
2    c
dtype: object

In [4]:
s.dtype

dtype('O')

In [5]:
s = pd.Series(['a','b','c'], dtype="string")
s.dtype

string[python]

In [6]:
# or astype after the Series or DataFrame is created
s.astype("string")

0    a
1    b
2    c
dtype: string

In [7]:
s = pd.Series(["a", 2, np.nan], dtype="string")
s
# <NA> for string

0       a
1       2
2    <NA>
dtype: string

In [8]:
s.str.count("a")

0       1
1       0
2    <NA>
dtype: Int64

In [9]:
s.str.count("A")

0       0
1       0
2    <NA>
dtype: Int64

In [10]:
s.dropna().str.count("a")

0    1
1    0
dtype: Int64

In [11]:
s.str.isdigit() 

0    False
1     True
2     <NA>
dtype: boolean

In [12]:
s.str.match("a")

0     True
1    False
2     <NA>
dtype: boolean

In [13]:
s.str.lower()

0       a
1       2
2    <NA>
dtype: string

In [14]:
s.str.upper()

0       A
1       2
2    <NA>
dtype: string

In [15]:
s.str.len()

0       1
1       1
2    <NA>
dtype: Int64

In [16]:
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string"
)

print(s.str.lower())
print(s.str.upper())
print(s.str.len())
print(s.str.capitalize())


0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string
0       A
1       B
2       C
3    AABA
4    BACA
5    <NA>
6    CABA
7     DOG
8     CAT
dtype: string
0       1
1       1
2       1
3       4
4       4
5    <NA>
6       4
7       3
8       3
dtype: Int64
0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    Caba
7     Dog
8     Cat
dtype: string


## The string methods on Index are especially useful for cleaning up or transforming DataFrame columns. For instance, you may have columns with leading or trailing whitespace:

In [17]:
idx = pd.Index([" jack", "jill ", " jesse ", "frank"])
idx

Index([' jack', 'jill ', ' jesse ', 'frank'], dtype='object')

In [18]:
idx.str.strip()

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')

In [19]:
idx.str.lstrip()

Index(['jack', 'jill ', 'jesse ', 'frank'], dtype='object')

In [20]:
idx.str.rstrip()

Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')

In [21]:
df = pd.DataFrame(
    np.random.randn(3, 2), columns=[" Column A ", " Column B "], index=range(3)
)
df

Unnamed: 0,Column A,Column B
0,-0.790505,0.517867
1,-0.350764,-0.339499
2,-1.310095,-0.006007


In [22]:
df.columns.str.strip()

Index(['Column A', 'Column B'], dtype='object')

## Splitting and replacing strings

In [23]:
s2 = pd.Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype="string")
s2

0    a_b_c
1    c_d_e
2     <NA>
3    f_g_h
dtype: string

In [24]:
s2.str.replace(r"[^A-Za-z0-9]", "", regex=True)

0     abc
1     cde
2    <NA>
3     fgh
dtype: string

In [25]:
s2

0    a_b_c
1    c_d_e
2     <NA>
3    f_g_h
dtype: string

In [26]:
s2.str.split("_")

0    [a, b, c]
1    [c, d, e]
2         <NA>
3    [f, g, h]
dtype: object

In [27]:
s2.str.split("_").get(1) # or s2.str.split("_").str[1]

['c', 'd', 'e']

In [28]:
s2.str.split("_").get(0)

['a', 'b', 'c']

### It is easy to expand this to return a DataFrame using expand.

In [29]:
s2.str.split("_", expand=True)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [30]:
s2.str.split("_", expand=True, n=1)

Unnamed: 0,0,1
0,a,b_c
1,c,d_e
2,,
3,f,g_h


## rsplit is similar to split except it works in the reverse direction, i.e., from the end of the string to the beginning of the string:

In [31]:
s2.str.rsplit("_", expand=True, n=1)

Unnamed: 0,0,1
0,a_b,c
1,c_d,e
2,,
3,f_g,h


In [32]:
s2.str.rsplit("_")

0    [a, b, c]
1    [c, d, e]
2         <NA>
3    [f, g, h]
dtype: object

### replace

In [33]:
s3 = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)
s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6    <NA>
7    CABA
8     dog
9     cat
dtype: string

In [34]:
s3.str.replace("^.a|dog", "XX-XX", case=False, regex=True)

0          A
1          B
2          C
3    XX-XXba
4    XX-XXca
5           
6       <NA>
7    XX-XXBA
8      XX-XX
9     XX-XXt
dtype: string

## Concatenating a single Series into a string

In [35]:
s = pd.Series(["a", "b", "c", "d"], dtype="string")
s.str.cat(sep="")

'abcd'

In [36]:
s.str.cat(sep=",")

'a,b,c,d'

### By default, missing values are ignored. Using na_rep, they can be given a representation:

In [37]:
s = pd.Series(["a", "b", "c", "d", np.nan, np.inf], dtype="string")
s

0       a
1       b
2       c
3       d
4    <NA>
5     inf
dtype: string

In [38]:
s.str.cat(sep="") # only np.nan ignored, np.inf still comming

'abcdinf'

In [39]:
s.str.cat(sep=",",na_rep="-")

'a,b,c,d,-,inf'

### Concatenating a Series and something list-like into a Series

In [40]:
s = pd.Series(["a", "b", "c", "d", np.inf, np.nan], dtype="string")
s
s.str.cat(["A", "B", "C", "D", "E", "F"], na_rep="-")

0      aA
1      bB
2      cC
3      dD
4    infE
5      -F
dtype: string

In [41]:
t = pd.Series(["a", "b", np.nan, "d"], dtype="string")
s = pd.Series(["a", "b", "c", "d"], dtype="string")
s.str.cat(t)

0      aa
1      bb
2    <NA>
3      dd
dtype: string

### Concatenating a Series and something array-like into a Series

In [42]:
d = pd.concat([t, s], axis=1)
d

Unnamed: 0,0,1
0,a,a
1,b,b
2,,c
3,d,d


In [43]:
d = pd.concat([t, s], axis=0)
d

0       a
1       b
2    <NA>
3       d
0       a
1       b
2       c
3       d
dtype: string

### In contrast to splitting, we also often need to combine multiple strings.

In [44]:
data = {
    'Name': ['ALICE', 'Bob', 'Charlie ', ' DAvid', 'eve'],
    'Address': ['JL. Anggrek 12', 'Jl. Mawar No. 5', 'Jl. Melati, No.10', 'Jl. Tulip 3A', 'Jl. Sakura 2B']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Address
0,ALICE,JL. Anggrek 12
1,Bob,Jl. Mawar No. 5
2,Charlie,"Jl. Melati, No.10"
3,DAvid,Jl. Tulip 3A
4,eve,Jl. Sakura 2B


In [45]:
df['Complete information'] = df['Name'] + ' live in ' + df['Address'] 
df

Unnamed: 0,Name,Address,Complete information
0,ALICE,JL. Anggrek 12,ALICE live in JL. Anggrek 12
1,Bob,Jl. Mawar No. 5,Bob live in Jl. Mawar No. 5
2,Charlie,"Jl. Melati, No.10","Charlie live in Jl. Melati, No.10"
3,DAvid,Jl. Tulip 3A,DAvid live in Jl. Tulip 3A
4,eve,Jl. Sakura 2B,eve live in Jl. Sakura 2B


### Checking Substring Availability

In [50]:
df[df['Address'].str.contains('Jl.')]

Unnamed: 0,Name,Address,Complete information
1,Bob,Jl. Mawar No. 5,Bob live in Jl. Mawar No. 5
2,Charlie,"Jl. Melati, No.10","Charlie live in Jl. Melati, No.10"
3,DAvid,Jl. Tulip 3A,DAvid live in Jl. Tulip 3A
4,eve,Jl. Sakura 2B,eve live in Jl. Sakura 2B


### Extraction Using Regular Expressions

In [52]:
df['Home Number'] = df['Address'].str.extract('(\d+)')
df

  df['Home Number'] = df['Address'].str.extract('(\d+)')


Unnamed: 0,Name,Address,Complete information,Home Number
0,ALICE,JL. Anggrek 12,ALICE live in JL. Anggrek 12,12
1,Bob,Jl. Mawar No. 5,Bob live in Jl. Mawar No. 5,5
2,Charlie,"Jl. Melati, No.10","Charlie live in Jl. Melati, No.10",10
3,DAvid,Jl. Tulip 3A,DAvid live in Jl. Tulip 3A,3
4,eve,Jl. Sakura 2B,eve live in Jl. Sakura 2B,2
