In [2]:
import pandas as pd
import numpy as np

In [3]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [4]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
string_data[0] = None

In [8]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [9]:
from numpy import nan as NA

In [10]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [11]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                    [NA, NA, NA],[NA, 6.5, 3.]])

In [14]:
cleaned = data.dropna()

In [15]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [17]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [18]:
data[4] = NA

In [19]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [20]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
df = pd.DataFrame(np.random.randn(7, 3))

In [22]:
df.iloc[:4, 1] = NA

In [24]:
df.iloc[:2, 2] = NA

In [25]:
df

Unnamed: 0,0,1,2
0,-1.722023,,
1,-0.720557,,
2,-0.18292,,1.62218
3,-0.162496,,0.747574
4,-1.41284,0.487857,-0.221255
5,0.494071,0.202628,0.003949
6,-0.272102,-1.920852,0.962349


In [26]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.41284,0.487857,-0.221255
5,0.494071,0.202628,0.003949
6,-0.272102,-1.920852,0.962349


In [27]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.18292,,1.62218
3,-0.162496,,0.747574
4,-1.41284,0.487857,-0.221255
5,0.494071,0.202628,0.003949
6,-0.272102,-1.920852,0.962349


In [28]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.722023,0.0,0.0
1,-0.720557,0.0,0.0
2,-0.18292,0.0,1.62218
3,-0.162496,0.0,0.747574
4,-1.41284,0.487857,-0.221255
5,0.494071,0.202628,0.003949
6,-0.272102,-1.920852,0.962349


In [29]:
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,-1.722023,0.5,0.0
1,-0.720557,0.5,0.0
2,-0.18292,0.5,1.62218
3,-0.162496,0.5,0.747574
4,-1.41284,0.487857,-0.221255
5,0.494071,0.202628,0.003949
6,-0.272102,-1.920852,0.962349


In [30]:
_ = df.fillna(0, inplace=True)

In [31]:
df

Unnamed: 0,0,1,2
0,-1.722023,0.0,0.0
1,-0.720557,0.0,0.0
2,-0.18292,0.0,1.62218
3,-0.162496,0.0,0.747574
4,-1.41284,0.487857,-0.221255
5,0.494071,0.202628,0.003949
6,-0.272102,-1.920852,0.962349


In [32]:
df = pd.DataFrame(np.random.randn(6,3))

In [33]:
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA

In [34]:
df

Unnamed: 0,0,1,2
0,0.917365,-0.948212,-1.697435
1,-1.042681,0.818424,2.414474
2,0.973294,,-0.555401
3,0.854952,,0.592931
4,0.187932,,
5,0.550359,,


In [35]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.917365,-0.948212,-1.697435
1,-1.042681,0.818424,2.414474
2,0.973294,0.818424,-0.555401
3,0.854952,0.818424,0.592931
4,0.187932,0.818424,0.592931
5,0.550359,0.818424,0.592931


In [36]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.917365,-0.948212,-1.697435
1,-1.042681,0.818424,2.414474
2,0.973294,0.818424,-0.555401
3,0.854952,0.818424,0.592931
4,0.187932,,0.592931
5,0.550359,,0.592931


In [37]:
data = pd.Series([1., NA, 3.5, NA, 7])

In [38]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [39]:
 data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
 'k2': [1, 1, 2, 3, 3, 4, 4]})

In [40]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [48]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [50]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [52]:
data['v1'] = range(7)

In [54]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [56]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [57]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
 'Pastrami', 'corned beef', 'Bacon',
 'pastrami', 'honey ham', 'nova lox'],
 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [58]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [59]:
meat_to_animal =  {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

In [60]:
lowercased = data['food'].str.lower()

In [61]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [63]:
data['animal'] = lowercased.map(meat_to_animal, )

In [64]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [65]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [66]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [67]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [68]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [69]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [70]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [71]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [72]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
 index=['Ohio', 'Colorado', 'New York'],
 columns=['one', 'two', 'three', 'four'])

In [73]:
transform = lambda x:x[:4].upper()

In [74]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [75]:
data.index = data.index.map(transform)

In [76]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [82]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [83]:
data.rename(index={'OHIO': 'INDIANA'},
           columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [85]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

In [86]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [87]:
ages = [20,22, 25, 27, 21, 23, 37, 31, 61, 45, 41,32 ]

In [88]:
bins = [18, 25, 35, 60, 100]

In [89]:
cats = pd.cut(ages, bins)

In [90]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [91]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [92]:
pd.cut(ages, [18,26, 36, 61, 100],right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [93]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [95]:
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [97]:
data = np.random.rand(20)

In [99]:
pd.cut(data, 5, precision=2)

[(0.049, 0.24], (0.049, 0.24], (0.049, 0.24], (0.61, 0.79], (0.42, 0.61], ..., (0.24, 0.42], (0.61, 0.79], (0.24, 0.42], (0.42, 0.61], (0.049, 0.24]]
Length: 20
Categories (5, interval[float64]): [(0.049, 0.24] < (0.24, 0.42] < (0.42, 0.61] < (0.61, 0.79] < (0.79, 0.98]]

In [101]:
pd.cut(data, 4).value_counts()

(0.049, 0.281]    7
(0.281, 0.513]    3
(0.513, 0.744]    8
(0.744, 0.975]    2
dtype: int64

In [104]:
data = np.random.randn(1000)

In [105]:
cats = pd.qcut(data, 4)

In [106]:
cats

[(-0.00721, 0.693], (-0.00721, 0.693], (-3.678, -0.654], (-0.00721, 0.693], (-0.654, -0.00721], ..., (-3.678, -0.654], (0.693, 3.36], (-3.678, -0.654], (-0.00721, 0.693], (-0.654, -0.00721]]
Length: 1000
Categories (4, interval[float64]): [(-3.678, -0.654] < (-0.654, -0.00721] < (-0.00721, 0.693] < (0.693, 3.36]]

In [107]:
cats.value_counts()

(-3.678, -0.654]      250
(-0.654, -0.00721]    250
(-0.00721, 0.693]     250
(0.693, 3.36]         250
dtype: int64

In [109]:
pd.qcut(data, [0, 0.1, 0.5, 0.9 ,1]).value_counts()

(-3.678, -1.273]      100
(-1.273, -0.00721]    400
(-0.00721, 1.214]     400
(1.214, 3.36]         100
dtype: int64

In [110]:
data = pd.DataFrame(np.random.randn(1000,4))

In [111]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.01631,0.023864,0.018521,-0.030082
std,0.97964,1.009873,0.997371,1.006776
min,-3.306316,-2.701821,-3.19372,-3.544331
25%,-0.681004,-0.657877,-0.616821,-0.719342
50%,-0.010222,0.009167,0.022988,-0.043528
75%,0.638516,0.720606,0.676953,0.629005
max,3.11017,2.851493,2.883656,3.228579


In [112]:
col = data[2]

In [113]:
col[np.abs(col) > 3]

157   -3.193720
308   -3.065724
449   -3.043021
527   -3.154587
557   -3.039633
Name: 2, dtype: float64

In [115]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
21,3.11017,-1.199508,0.423516,0.751182
157,-0.620211,0.143196,-3.19372,-1.160834
249,-0.643939,-1.071379,-1.181646,-3.544331
308,-2.168466,-1.480798,-3.065724,1.229334
449,1.374063,0.097566,-3.043021,2.586806
527,-0.995941,0.331739,-3.154587,-0.843285
553,-1.001509,1.273656,-0.912137,3.228579
557,2.069164,1.775324,-3.039633,-0.095915
783,3.028395,-0.565595,-0.098954,-0.723307
981,-3.306316,0.222207,0.803644,0.822064


In [116]:
(np.abs(data) > 3).any(1)

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21      True
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
970    False
971    False
972    False
973    False
974    False
975    False
976    False
977    False
978    False
979    False
980    False
981     True
982    False
983    False
984    False
985    False
986    False
987    False
988    False
989    False
990    False
991    False
992    False
993    False
994    False
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [118]:
data[np.abs(data) > 3] = np.sign(data) * 3

In [119]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.016142,0.023864,0.019018,-0.029766
std,0.978224,1.009873,0.99583,1.004304
min,-3.0,-2.701821,-3.0,-3.0
25%,-0.681004,-0.657877,-0.616821,-0.719342
50%,-0.010222,0.009167,0.022988,-0.043528
75%,0.638516,0.720606,0.676953,0.629005
max,3.0,2.851493,2.883656,3.0


In [120]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,-1.0
3,1.0,-1.0,1.0,-1.0
4,-1.0,-1.0,-1.0,1.0


In [121]:
df = pd.DataFrame(np.arange(5*4).reshape((5,4)))

In [122]:
sampler = np.random.permutation(5)

In [123]:
sampler

array([1, 0, 4, 2, 3])

In [124]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [126]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
3,12,13,14,15


In [131]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
3,12,13,14,15


In [132]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19


In [133]:
choices = pd.Series([5, 7, -1, 6, 4])

In [134]:
draws = choices.sample(n=10, replace=True)

In [141]:
draws.sort_values()

2   -1
4    4
4    4
0    5
0    5
0    5
3    6
3    6
1    7
1    7
dtype: int64

In [142]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
 'data1': range(6)})

In [143]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [146]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [147]:
df_with_dummy = df[['data1']].join(dummies)

In [148]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [151]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('movielens/movies.dat', sep='::',
                      header=None, names=mnames)

  This is separate from the ipykernel package so we can avoid doing imports until


In [152]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [153]:
all_genres = []

In [154]:
for x in movies.genres:
    all_genres.extend(x.split('|'))

In [155]:
all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [156]:
genres = pd.unique(all_genres)

In [157]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [159]:
zero_matrix = np.zeros((len(movies), len(genres)))

In [160]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [161]:
gen = movies.genres[0]

In [162]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [163]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [165]:
for i,gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices]

In [168]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [169]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                0
Genre_Children's                               0
Genre_Comedy                                   0
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [170]:
np.random.seed(12345)

In [171]:
values = np.random.rand(10)

In [172]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [173]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [177]:
np.arange(0,1.1,0.2)

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

In [178]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [179]:
val = 'a,b, guido'

In [180]:
val.split(',')

['a', 'b', ' guido']

In [181]:
pieces = [x.strip() for x in val.split(',')]

In [182]:
pieces

['a', 'b', 'guido']

In [184]:
first, second, third = pieces

In [186]:
first + '::' + second + '::' + third

'a::b::guido'

In [187]:
'guido' in val

True

In [188]:
val.index(',')

1

In [189]:
val.find(':')

-1

In [190]:
val.index(':')

ValueError: substring not found

In [191]:
val.count(',')

2

In [192]:
val.replace(',', '::')

'a::b:: guido'

In [193]:
val.replace(',','')

'ab guido'

In [197]:
import re
text = "foo bar\t baz \tqux"
re.split('\s+',text)

['foo', 'bar', 'baz', 'qux']

In [202]:
regex = re.compile('\s+')
#如果打算对许多字符串应⽤同⼀条正则表达式，强烈建议通过
#re.compile创建regex对象。这样将可以节省⼤量的CPU时间

In [203]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [204]:
regex.findall(text)

[' ', '\t ', ' \t']

In [205]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

In [206]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [207]:
m = regex.search(text)

In [208]:
m

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [209]:
text[m.start(): m.end()]

'dave@google.com'

In [210]:
print(regex.match(text))

None


In [211]:
print(regex.sub('REDACTED',text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [212]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [214]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [216]:
m = regex.match('wesm@bright.net')

In [217]:
m.groups()

('wesm', 'bright', 'net')

In [218]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [219]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [220]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
 'Rob': 'rob@gmail.com', 'Wes': np.nan}

In [221]:
data = pd.Series(data)

In [222]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [223]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [224]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [225]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [226]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [227]:
matches = data.str.match(pattern, flags=re.IGNORECASE)

In [229]:
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [233]:
matches.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [236]:
matches.str[0]

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [237]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [238]:
#chapter 8 data_pr
data = pd.Series(np.random.randn(9),
 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
 [1, 2, 3, 1, 3, 1, 2, 2, 3]])

In [239]:
data

a  1    1.007189
   2   -1.296221
   3    0.274992
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
d  2   -0.371843
   3    1.669025
dtype: float64

In [240]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In [242]:
data['b']

1    0.228913
3    1.352917
dtype: float64

In [243]:
data.loc[['b','d']]

b  1    0.228913
   3    1.352917
d  2   -0.371843
   3    1.669025
dtype: float64

In [244]:
data.loc[:, 2]

a   -1.296221
c   -2.001637
d   -0.371843
dtype: float64

In [245]:
data.unstack()

Unnamed: 0,1,2,3
a,1.007189,-1.296221,0.274992
b,0.228913,,1.352917
c,0.886429,-2.001637,
d,,-0.371843,1.669025


In [246]:
data.unstack().stack()

a  1    1.007189
   2   -1.296221
   3    0.274992
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
d  2   -0.371843
   3    1.669025
dtype: float64

In [247]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
 index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
 columns=[['Ohio', 'Ohio', 'Colorado'],
 ['Green', 'Red', 'Green']])

In [248]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11
