# Pandas 2: Optional arguments for functions

* indexing: treat one or more columns as the index of the returned DataFrame
* type inference and data conversion: user-defined value conversions and custom list of missing value markers
* datetime parsing: combining capability, e.g. combining date and time information spread over multiple columns
* iterating: support for iterating over chunks of large files
* unclean data issues: skipping rows or a footer, comments, or other minor things

In [2]:
import pandas as pd

In [3]:
print(pd.__version__) 

1.0.3


In [7]:
# if using old version 0.25 or something else
#conda update pandas
#conda install pandas

SyntaxError: invalid syntax (<ipython-input-7-ff43c1519e0a>, line 2)

In [8]:
from matplotlib import pyplot as plt
import numpy as np
from pylab import *

In [9]:
# LOTS OF PARAMETERS
#Simple

df = pd.read_csv('examples1.csv')

df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,woohoo


In [10]:
pd.read_table('examples1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,woohoo


In [11]:
# no header
# can have pandas assign default column names
pd.read_csv('examples2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,woohoo


In [12]:
# can specify the names 

pd.read_csv('examples2.csv', names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,woohoo


In [13]:
# to make the message column the index of the returned DataFrame,
# indicate the column at index 4 or the column named 'message' using the index_col argument

names = ['a', 'b', 'c', 'd', 'message']

pd.read_csv('examples2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
woohoo,9,10,11,12


In [14]:
# create a hierarchical index from multiple columns
# pass a list of column numbers or names

parsed = pd.read_csv('examples3.csv',index_col=['key1', 'key2'])

parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [15]:
# table without a delimiter, using whitespace or some other pattern to separate fields

# read the raw lines; the with-block closes the file handle as soon as they are read
with open('ex_whitespace.txt') as fh:
    lines = list(fh)

lines

['\tA \tB \tC\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb 0.927272 0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382 1.100491']

In [16]:
# sep is a regular expression: fields are separated by one or more whitespace characters
# it is a raw string so the backslash stays literal and Python does not warn about an invalid escape sequence
result = pd.read_table('ex_whitespace.txt', sep=r'\s+')
result
# the header has one fewer column name than there are fields in each data row, so read_table infers the first column should be the DataFrame's index

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [17]:
# to skip rows

# read the raw lines; the with-block closes the file handle as soon as they are read
with open('examples4.csv') as fh:
    lines = list(fh)

lines


['ï»¿# hey!,,,,\n',
 'a,b,c,d,message\n',
 '# just wanted to make things more difficult for you,,,,\n',
 '"# who reads CSV files with computers, anyway?",,,,\n',
 '1,2,3,4,hello\n',
 '5,6,7,8,world\n',
 '9,10,11,12,woohoo\n']

In [18]:
pd.read_csv('examples4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,woohoo


In [19]:
# if data is missing pandas uses sentinel value, NaN, NA, or Null

result = pd.read_csv('examples5.csv')
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [20]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [21]:
# the na_values option takes a list or set of strings to treat as missing values

result = pd.read_csv('examples5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [22]:
# can place NA values via a dict
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}

pd.read_csv('examples5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


## Reading text files in pieces

When processing large files or figuring out the right set of arguments to correctly process a large file, you may want to read only a small piece of the file or iterate through it in smaller chunks.

In [23]:
pd.options.display.max_rows=10
result= pd.read_csv('examples6.csv')

result

Unnamed: 0,one,two,three,four,key
0,-8.0,8.0,-5.0,-10.0,a
1,4.0,-1.0,1.0,-9.0,b
2,-1.0,-7.0,-1.0,-3.0,d
3,-3.0,-4.0,7.0,-5.0,c
4,0.0,-5.0,4.0,9.0,e
...,...,...,...,...,...
14,-5.0,-7.0,1.0,-9.0,n
15,-2.0,-8.0,-3.0,-7.0,h
16,8.0,5.0,-6.0,-3.0,r
17,-10.0,2.0,0.0,-6.0,w


In [24]:
# to read only a small number of rows instead of the entire file, specify nrows

pd.read_csv('examples6.csv', nrows =3)

Unnamed: 0,one,two,three,four,key
0,-8.0,8.0,-5.0,-10.0,a
1,4.0,-1.0,1.0,-9.0,b
2,-1.0,-7.0,-1.0,-3.0,d


In [25]:
# to read a file in chunks

chunker = pd.read_csv('examples6.csv', chunksize=1000)

chunker

<pandas.io.parsers.TextFileReader at 0x22f4b0dbf88>

In [26]:
# the TextFileReader object returned by read_csv allows you to iterate over parts of the file according to chunksize
### skip this

chunker = pd.read_csv('examples6.csv', chunksize=1000)

# give the empty accumulator an explicit dtype: a bare pd.Series([]) emits a
# DeprecationWarning because the default dtype of an empty Series is changing
tot = pd.Series([], dtype='float64')
for piece in chunker:
    # count the keys in this chunk and add them to the running total;
    # fill_value=0 treats a key missing from either side as a count of zero
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

# most frequent keys first
tot = tot.sort_values(ascending=False)

tot[:10]

  """


v    3.0
h    3.0
r    3.0
s    2.0
e    1.0
c    1.0
d    1.0
a    1.0
n    1.0
b    1.0
dtype: float64

## Writing data

In [27]:
data = pd.read_csv('examples5.csv')

data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [28]:
data.to_csv('examples_out.csv')

In [29]:
# to print text result in the console not html
import sys

In [30]:
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [31]:
# to change missing values from empty strings to a sentinel value

data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [32]:
# with no options specified, both row and column labels are written
# can disable these

data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [33]:
# or write only a subset of columns

data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


### Reading excel files

Recommend using csv instead of excel. 

In [34]:
import xlrd
import openpyxl
import xlwt
import xlsxwriter

pd.read_excel('examples1.xlsx')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,woohoo


In [35]:
# writing excel files
frame = pd.read_excel('examples1.xlsx')

# ExcelWriter is used as a context manager so the workbook is saved and closed
# when the block exits; this replaces the explicit writer.save() call, which
# newer pandas releases no longer provide
with pd.ExcelWriter('examples2.xlsx') as writer:
    # sheet_name is passed by keyword rather than by position
    frame.to_excel(writer, sheet_name='Sheet1')

In [36]:
# or pass a file path to _excel

frame.to_excel('examples2.xlsx')

## Data Cleaning and Preparation

80 / 20 rule. 

### Missing data

In [37]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [38]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [39]:
# BE CAREFUL
# built-in Python None value is also treated as NA in object arrays

string_data[0] = None

string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering out missing data

In [40]:
from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])

data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [41]:
# equivalent to 

data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [42]:
# With DataFrame objects, may want to drop rows or columns that are all NA or only those containing NAs
# by default .dropna drops any row containing a missing value

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])

data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [43]:
# .dropna default settings will remove any row containing a missing value
cleaned = data.dropna()

cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [44]:
# passing how='all' only drop rows that are all NA

data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [45]:
# to drop columns that are all NA

data[4] = NA

data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [46]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [47]:
# only keep rows containing a certain number of observations

#create a random DataFrame
df = pd.DataFrame(np.random.randn(7, 3))

df


Unnamed: 0,0,1,2
0,-0.75495,0.103121,1.120116
1,0.296437,0.264022,-1.174243
2,-0.738885,-0.373641,-1.288857
3,-0.703354,1.547443,-0.88658
4,-0.756235,1.148851,-1.464035
5,1.826833,-0.956923,-0.678762
6,1.628087,-1.380036,0.88428


In [48]:
# add some NA values
df.iloc[:4,1] = NA

In [49]:
# add more NA values
df.iloc[:2,2] = NA

In [50]:
df

Unnamed: 0,0,1,2
0,-0.75495,,
1,0.296437,,
2,-0.738885,,-1.288857
3,-0.703354,,-0.88658
4,-0.756235,1.148851,-1.464035
5,1.826833,-0.956923,-0.678762
6,1.628087,-1.380036,0.88428


In [51]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.756235,1.148851,-1.464035
5,1.826833,-0.956923,-0.678762
6,1.628087,-1.380036,0.88428


In [52]:
# use thresh to only keep rows with a certain number of observations

df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.738885,,-1.288857
3,-0.703354,,-0.88658
4,-0.756235,1.148851,-1.464035
5,1.826833,-0.956923,-0.678762
6,1.628087,-1.380036,0.88428


### Filling in missing data

In [53]:
# use fillna() with a constant
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.75495,0.0,0.0
1,0.296437,0.0,0.0
2,-0.738885,0.0,-1.288857
3,-0.703354,0.0,-0.88658
4,-0.756235,1.148851,-1.464035
5,1.826833,-0.956923,-0.678762
6,1.628087,-1.380036,0.88428


In [None]:
# use fillna() with a dict, can use a different fill value for each column
# fill NA's in column 1 with 0.5 and column 2 with 0

df.fillna({1: 0.5, 2: 0})

In [None]:
# fillna returns a new object, but you can modify the existing object into place
_ = df.fillna(0, inplace=True)

df

In [None]:
df = pd.DataFrame(np.random.randn(6, 3))

df

In [None]:
# the fill methods used for reindexing (e.g. forward fill) can also be used to fill missing data

df.iloc[2:, 1] = NA

df.iloc[4:, 2] = NA

df

In [None]:
# forward fill: propagate the last valid value down each column
# (ffill() is equivalent to fillna(method='ffill'), whose method argument is deprecated in newer pandas)
df.ffill()

In [None]:
# forward fill, but fill at most 2 consecutive missing values per gap
# (ffill() is equivalent to fillna(method='ffill'), whose method argument is deprecated in newer pandas)
df.ffill(limit=2)

In [None]:
# pass the mean or median value of a Series for NA

data = pd.Series([1., NA, 3.5, NA, 7])

data

In [None]:
data.fillna(data.mean())

### Data Transformation

In [None]:
# Removing duplicates

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],'k2': [1, 1, 2, 3, 3, 4, 4]})

data

In [None]:
# duplicated() returns a boolean Series indicating whether each row duplicates a row seen earlier (not only the row directly before it)
data.duplicated()

In [None]:
# drop_duplicates returns a DataFrame containing only the rows where duplicated() is False

data.drop_duplicates()

In [None]:
# instead of all the columns, any subset of columns can be used to detect duplicates

data['v1']= range(7)

data

In [None]:
# duplicates dropped based on column
data.drop_duplicates(['k1'])

In [None]:
#duplicated and drop_duplicates by default keep the first observed value combination
# passing keep='last' will return the last one
# row 5 removed

data.drop_duplicates(['k1', 'k2'], keep='last')

### Transforming data using a function or mapping

In [None]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon','Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

data

In [None]:
# where does the meat come from?
# dict of values
# map performs element-wise transformations and other data cleaning-related operations

# Keys are all lower case because the food column is lower-cased before the
# lookup; a capitalised key (the original 'Corned beef') never matches and
# leaves NaN in the result. Animal names use one consistent lower-case
# spelling so that 'pig' and 'Pig' are not treated as two different animals.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [None]:
# some of the values are upper case bc I am lazy typer and I cut and paste stuff

lowercased= data['food'].str.lower()

data['animal'] = lowercased.map(meat_to_animal)

data

### Replacing Values

In [None]:
# fillna, map, and replace
# replace simpler and more flexible

data = pd.Series([1., -999., 2., -999., -1000., 3.])

data

In [None]:
data.replace(-999, np.nan)

In [None]:
# replace multiple values, pass a list, then the substitute value

data.replace([-999, -1000], np.nan)

In [None]:
# different replacement for each value, pass a list of substitutes

data.replace([-999, -1000], [np.nan, 0])

In [None]:
# can also pass a dict

data.replace({-999: np.nan, -1000: 0})

### Renaming Axis Indexes

Axis labels can be transformed by a function or mapping to create new differently labeled object. Can modify axes in place without creating a new data structure.

In [None]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

data

In [None]:
def transform(x):
    """Return the first four characters of x, upper-cased."""
    return x[:4].upper()


# apply the function to every index label; returns a new Index and leaves data unchanged
data.index.map(transform)

In [None]:
data.index = data.index.map(transform)

data

In [None]:
# to create a transformed version of a dataset without modifying the original, use rename

data.rename(index=str.title, columns=str.upper)

In [None]:
# can rename with a dict-like object to provide new values for a subset of the axis labels
# rename ohio to indiana, three to peekaboo

data.rename(index={'OHIO': 'INDIANA'},columns={'three': 'peekaboo'})

In [None]:
# using rename to modify a dataset in place

data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

data

### Discretization and Binning : continuous data

In [None]:
# ages to be grouped into discrete buckets
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# bin edges for the groups 18-25, 26-35, 36-60, 61-100
bins = [18, 25, 35, 60, 100]

# pd.cut assigns every age to the bin it falls into
# and returns a special Categorical object
cats = pd.cut(x=ages, bins=bins)

# it can be treated like an array of strings naming the bin of each age
cats

In [None]:
# cats.codes holds the zero-based bin number of each age: (18, 25] = 0, (25, 35] = 1, ...
cats.codes

In [None]:
cats.categories

In [None]:
# number of ages in each bin, most populated bin first
# (the top-level pd.value_counts function is deprecated in newer pandas; the Series method is equivalent)
pd.Series(cats).value_counts()

In [None]:
# parenthesis means the side is open, while square bracket means it is closed (inclusive)
# change which side is closed by passing right=False

pd.cut(ages, [18, 26, 36, 61, 100], right=False)

In [None]:
# number of ages in each bin of cats, most populated bin first
# (the top-level pd.value_counts function is deprecated in newer pandas; the Series method is equivalent)
pd.Series(cats).value_counts()

In [None]:
# pass own bin names by passing a list or array to the labels option

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

pd.cut(ages, bins, labels=group_names)

In [None]:
# pass an integer of bins to cut instead of explicit edges, compute equal length bins 
# bins are based on the min and max values in the data

data = np.random.rand(20)

data

In [None]:
# cut the data into fourths
# precision=2 limits the decimal precision of the bin labels to two digits
pd.cut(data, 4, precision=2)

In [None]:
# to cut based on quantiles

# create a normally distributed data
data = np.random.randn(1000)

In [None]:
# create the categories
cats = pd.qcut(data, 4)

cats

In [None]:
# number of observations in each quantile bin
# (the top-level pd.value_counts function is deprecated in newer pandas; the Series method is equivalent)
pd.Series(cats).value_counts()

In [None]:
# to cut your own quantiles
# quantiles between 0 and 1

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

### Detecting and filtering outliers

In [None]:
data = pd.DataFrame(np.random.randn(1000, 4))

data.describe()

In [None]:
# find outliers exceeding abs|3| in columns
col= data[2]

col[np.abs(col) > 3]

In [None]:
# select all rows that have at least one value greater than 3 or less than -3
# axis is passed by keyword: newer pandas releases no longer accept it positionally

data[(np.abs(data) > 3).any(axis=1)]

In [None]:
# values can be based on criteria, to cap values outside the interval -3 and 3

data[np.abs(data) > 3] = np.sign(data) * 3

data.describe()

In [None]:
# to determine if the value is positive or negative

np.sign(data).head()

### Permutation and Random Sampling

To randomly reorder a Series or the rows of a DataFrame, use numpy.random.permutation. Calling it with the length of the axis you want to permute produces an array of integers indicating the new ordering:

In [None]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

df

In [None]:
sampler = np.random.permutation(5)

sampler

In [None]:
# to insert permutation
df

In [None]:
df.take(sampler)

In [None]:
# to select subset without replacement, use the sample method

df.sample(n=3)

In [None]:
# to sample with replacement, to allow repeat choices, pass replace= True

choices = pd.Series([5, 7, -1, 6, 4])

draws = choices.sample(n=10, replace=True)

draws

### String manipulation

In [None]:
# String object methods built in
# comma-separated string broken into pieces with split

val = 'a,b, kenneth'

val.split(',')

In [None]:
# split combined with strip to trim whitespace including line breaks

pieces =[x.strip() for x in val.split(',')]

pieces

In [None]:
# to add a delimiter
first, second, third = pieces

first +'::' + second + '::' + third

In [None]:
# faster way is to pass a list or tuple to the join method

'::'.join(pieces)

In [None]:
# the in keyword detects a substring; index and find can also be used, but index raises ValueError if the substring is not found while find returns -1

'kenneth' in val

In [None]:
val.index('a')

In [None]:
# , is included in the index as 1, making b 2
val.index('b')

In [None]:
val.index('kenneth')

In [None]:
val.index(',')

In [None]:
val.find(':')

In [None]:
# count returns the number of occurrences of a particular substring
val.count('n')

In [None]:
val.count(',')

In [None]:
# replace used to substitute occurrences of one pattern for another

val.replace(',',';')

In [None]:
val.replace(',','')

In [None]:
val.endswith("q")

In [None]:
val.startswith("s")

### Regular Expressions (short intro)

A string pattern, or regex, is a string formed according to the regular expression language. Python has a built-in re module to apply regular expressions to strings. It is used for pattern matching, substitution, and splitting.

In [None]:
import re

In [None]:
# text with variable whitespace characters
text = "whoohoo squoink\t bonk \tvroom"

# re.split() compiles the regular expression and then calls its split method on the passed text
# the pattern is a raw string so \s reaches the regex engine unchanged and Python does not warn about an invalid escape sequence
re.split(r'\s+', text)

In [None]:
# to compile the regex yourself
# the pattern is a raw string so \s reaches the regex engine unchanged and Python does not warn about an invalid escape sequence

regex = re.compile(r'\s+')

regex.split(text)

In [None]:
# to find all patterns matching the regex, use the findall method
regex.findall(text)

In [None]:
# findall returns every match of the pattern in the text

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

# user name, "@", domain name, ".", then a 2-4 letter suffix
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# the character classes list upper-case letters only, so compile case-insensitively
regex = re.compile(pattern, re.IGNORECASE)

regex.findall(text)

### Vectorized string functions in pandas

Make sure you have version 1.0.0

#### two ways to store text data in pandas:

1. object-dtype NumPy array
2. StringDtype extension type

[Working with Text Data in pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html)

In [None]:
# object type
s = pd.Series(['a', 'b', 'c'])
s

In [None]:
# to use string dtype, specify the dtype

pd.Series(['a', 'b', 'c'], dtype="string")



In [None]:
# or

pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype())

In [None]:
s

In [None]:
# or convert object-dtype to StringDtype
s.astype("string")

#### Behavior differences between StringDtype and object dtype

In [None]:
# with StringDtype, string accessor methods that return numeric output always return a nullable integer dtype
# rather than either int or float dtype
s = pd.Series(["a", None, "b"], dtype="string")

s

In [None]:
s.str.count('a')

In [None]:
s.dropna().str.count('a')

In [None]:
# to see if there are matches
s.str.match('a')

In [None]:
s2 = pd.Series(["a", "aa", "ab", None, "b"], dtype="string")
s2

In [None]:
s2.str.count('a')

In [None]:
# match returns True only where the pattern matches at the start of the string
s2.str.match('a')

In [None]:
#### Splitting or replacing strings

In [None]:
s3 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string")

s3

In [None]:
# to get the middle value
s3.str.split('_').str.get(1)

In [None]:
# or expand Series to return a dataframe
s3.str.split('_', expand=True)

In [None]:
#rsplit works in the reverse direction
s3.str.rsplit('_', expand=True, n=1)

In [None]:
s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', '', np.nan, 'CABA', 'dog', 'cat'], dtype="string")
s4

In [None]:
# replace "any character followed by a" at the start of the string, or "dog", ignoring case
# regex=True is explicit: newer pandas releases default to regex=False, which would treat the pattern as literal text
s4.str.replace(r'^.a|dog', 'XX-XX ', case=False, regex=True)

In [None]:
# bad format
dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string")

dollars

In [None]:
# to remove the money sign special characters
# '$' is a regular-expression anchor, so ask for a literal (non-regex) replacement explicitly

dollars.str.replace('$', '', regex=False)

In [None]:
#Concatenation: linking together or joining
# concatenate combine

s = pd.Series(['a', 'b', 'c', 'd'], dtype="string")
s

In [None]:
s.str.cat(sep=',')

In [None]:
# if not specified, the keyword sep defaults to the empty string
s.str.cat()

In [None]:
# by default missing values are ignored
t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string")

t

In [None]:
t.str.cat(sep=',')

In [None]:
t.str.cat(sep=',', na_rep='-')

In [None]:
# concatenating a Series or a list into a Series
#first argument to cat() can be a list like object, provided that it matches the length of the calling Series or Index

s

In [None]:
# link new list to previous Series s
s.str.cat(['A', 'B', 'C', 'D'])

In [None]:
#link Series s to Series t
s.str.cat(t)

In [None]:
# Concatenating a series and something array-like into a series

In [None]:
s

In [None]:
t

In [None]:
# combine t and s column-wise (axis=1) into a DataFrame
d = pd.concat([t, s], axis=1)

In [None]:
d

In [None]:
 s.str.cat(d, na_rep='-')

### Concatenating a series and an indexed object into a series, with alignment

In [None]:
u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2], dtype="string")

u

In [None]:
s

In [None]:
s.str.cat(u, join='left')

In [None]:
v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4],  dtype="string")
v

In [None]:
s.str.cat(v, join='left', na_rep='-')

In [None]:
s.str.cat(v, join='outer', na_rep='-')

In [None]:
# same alignment used when you have a DataFrame
f = d.loc[[3, 2, 1, 0], :]
f

In [None]:
s

In [None]:
s.str.cat(f, join='left', na_rep='-')

### Concatenating a Series and many objects into a series

In [None]:
s

In [None]:
u

In [None]:
s.str.cat([u, u.to_numpy()], join='left')

In [None]:
v

In [None]:
s.str.cat([v, u, u.to_numpy()], join='outer', na_rep='-')

### Indexing with .str

In [None]:
# can use [] to directly index by position location

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,'CABA', 'dog', 'cat'], dtype="string")
  
s

In [None]:
# indexes the data by the first letter
s.str[0]

In [None]:
# indexes the data by the second letter
s.str[1]

In [None]:
# slice each string from the fourth character onward (empty for strings of three characters or fewer)
s.str[3:]

### Testing for strings that match or contain a pattern

In [None]:
# just a letter
pattern = r'[a-z]'

pd.Series(['1', '2', '3a', '3b', '03c'], dtype="string").str.contains(pattern)

In [None]:
# just a number
pattern = r'[0-9]'

pd.Series(['1', '2', '3a', '3b', '03c'], dtype="string").str.contains(pattern)

In [None]:
 # letter and number together

pattern = r'[0-9][a-z]'

pd.Series(['1', '2', '3a', '3b', '03c'],dtype="string").str.contains(pattern)

In [None]:
#match a pattern of number then letter
pattern = r'[0-9][a-z]'

pd.Series(['1', '2', '3a', '3b', '03c'], dtype="string").str.match(pattern)

In [None]:
s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],dtype="string")

s4

In [None]:
s4.str.contains('A', na=False)

In [None]:
# contains is case sensitive
s4.str.contains('ab', na=False)

In [None]:
### Combining String and regex

In [None]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@hotmail.com','Rob': 'rob@gmail.com', 'Wes': np.nan}

data= pd.Series(data)

data

In [None]:
data.isnull()

In [None]:
# string and regular expression methods can be applied to each value with data.map (passing a function),
# but that fails on NA values; the .str methods used below skip NA values
data.str.contains('gmail')

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [None]:
# split each email address into its three parts (user name, domain name, domain suffix), dropping the '@' and '.'
data.str.findall(pattern, flags=re.IGNORECASE)