# Strings
- Reference: [Working with text data (pandas user guide)](https://pandas.pydata.org/docs/user_guide/text.html)

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Series of text values built without requesting a dtype
pd.Series(list("abc"))

0    a
1    b
2    c
dtype: object

In [4]:
pd.Series(list("abc"), dtype="string")  # dtype requested explicitly via its string alias

0    a
1    b
2    c
dtype: string

In [5]:
pd.Series(list("abc"), dtype=pd.StringDtype())  # dtype requested explicitly via the dtype object

0    a
1    b
2    c
dtype: string

In [7]:
# Starting point for the conversion demo: no dtype argument is passed here
s = pd.Series(list("abc"))
s

0    a
1    b
2    c
dtype: object

In [8]:
# Cast the existing Series to the dedicated "string" dtype (returns a new Series)
s.astype("string")

0    a
1    b
2    c
dtype: string

In [10]:
# Numbers to strings: cast a nullable-integer Series to the "string" dtype
s1 = pd.Series([1, 2, np.nan], dtype="Int64")
# One print call with a newline separator emits the same text as two prints
print(s1, s1.astype("string"), sep="\n")

0       1
1       2
2    <NA>
dtype: Int64
0       1
1       2
2    <NA>
dtype: string


In [11]:
# Note: the object and string dtypes behave differently, e.g. .str methods on a string Series return nullable results (Int64 / <NA>)

In [12]:
# String methods: sample data with mixed case and one missing value
s = pd.Series(
    data=["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [14]:
# Lowercase every element; the missing entry stays <NA>
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [15]:
# Uppercase every element; the missing entry stays <NA>
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5    <NA>
6    CABA
7     DOG
8     CAT
dtype: string

In [16]:
# Length of each string; on a string-dtype Series the result is nullable Int64,
# so the missing entry is reported as <NA>
s.str.len()

0       1
1       1
2       1
3       4
4       4
5    <NA>
6       4
7       3
8       3
dtype: Int64

In [17]:
# Strip spaces: index labels padded with leading and/or trailing blanks
idx = pd.Index(
    [
        " jack",
        "jill ",
        " jesse ",
        "frank",
    ]
)
idx

Index([' jack', 'jill ', ' jesse ', 'frank'], dtype='object')

In [18]:
# Remove whitespace from both ends of each label
idx.str.strip()

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')

In [19]:
# Remove leading (left-side) whitespace only; trailing blanks are kept
idx.str.lstrip()

Index(['jack', 'jill ', 'jesse ', 'frank'], dtype='object')

In [20]:
# Remove trailing (right-side) whitespace only; leading blanks are kept
idx.str.rstrip()

Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')

In [21]:
# Split & replace: underscore-delimited values plus one missing entry
s2 = pd.Series(
    data=["a_b_c", "c_d_e", np.nan, "f_g_h"],
    dtype="string",
)
s2

0    a_b_c
1    c_d_e
2     <NA>
3    f_g_h
dtype: string

In [22]:
s2.str.split(pat="_")  # each value becomes a list of its underscore-separated pieces

0    [a, b, c]
1    [c, d, e]
2         <NA>
3    [f, g, h]
dtype: object

In [23]:
s2.str.split("_").str.get(1)  # extract the element at position 1 of each split list

0       b
1       d
2    <NA>
3       g
dtype: object

In [25]:
s2.str.split("_").str[2]  # bracket indexing on .str also extracts by position

0       c
1       e
2    <NA>
3       h
dtype: object

In [26]:
s2.str.split("_", expand=True)  # expand=True returns a DataFrame with one column per piece

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h
