In [7]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import sys,csv,os,IPython,string,re
from numpy import NaN as NA
# Cap how much of a DataFrame/Series pandas renders: at most 10 rows and 10 columns
pd.options.display.max_rows=10
pd.options.display.max_columns=10

# 7.1 Handling Missing Data

In [8]:
# None and NaN both mark a value that does not exist, or exists but was not observed.
# Build an object Series whose first two entries are missing (None and NaN).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
series = pd.Series(np.array([None, np.nan, 'thach', 'thach', 'ngoc', 'ngoc']), index=list(range(6)))
# Boolean mask that is True wherever the Series holds a missing value
series.isnull()
# Replace each missing entry with its own string form ('None' / 'nan').
# The dict is keyed by the entry's index label (via .items()), not by its position
# among the missing values, so it stays correct wherever the nulls are located.
null_as_text = {label: str(value) for label, value in series[series.isnull()].items()}
series = series.fillna(value=null_as_text)
# The former missing value is now a plain Python string
type(series.loc[0])
# dropna is a no-op here: the missing values were converted to strings above
series = series.dropna()
# Keep only the first occurrence of every value
# (keep='first' | 'last' picks which occurrence survives, keep=False drops them all)
series = series.drop_duplicates(keep='first')

### Filtering Out Missing Data

In [9]:
# Float Series with missing values; None is stored as NaN because the dtype is float
series = pd.Series([NA, 1.4, 9, 100, 100, NA, None])
# Drop every missing entry
series.dropna(axis=0)
# Same result through boolean selection of the non-missing entries
series.loc[series.notnull()]
# DataFrame with missing values spread over rows and columns
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [1, NA, NA], [2, 5.6, 3.]],
                    columns=['a', 'b', 'c'], index=[0, 1, 2, 3])
# Drop every row that contains at least one missing value (default how='any')
data.dropna(axis=0)
# With axis=1, `subset` lists the ROW labels inspected when deciding which columns to drop
data.dropna(axis=1, how='any', subset=[1, 2, 3])
# how='any' drops a row as soon as a single value is missing
data.dropna(how='any')
# how='all' drops a row only when all of the inspected columns ('a', 'b') are missing
data.dropna(axis=0, how='all', subset=['a', 'b'])
# Setting with enlargement: adds row 4 and column 'd'; every new cell is NaN except (4, 'd')
data.loc[4, 'd'] = 10
print(data)
# Looking only at rows 0, 2 and 4, keep the columns holding at least 2 non-missing values.
# `thresh` must be passed WITHOUT `how`: pandas >= 1.5 raises TypeError when both are set
# (older versions silently ignored `how` whenever `thresh` was given).
thresh_kept = data.dropna(axis=1, subset=[0, 2, 4], thresh=2)
thresh_kept

     a    b    c     d
0  1.0  6.5  3.0   NaN
1  1.0  NaN  NaN   NaN
2  1.0  NaN  NaN   NaN
3  2.0  5.6  3.0   NaN
4  NaN  NaN  NaN  10.0


Unnamed: 0,a
0,1.0
1,1.0
2,1.0
3,2.0
4,


### Filling in Missing Data

In [10]:
# Fill missing values per column: 'b' gets the mean of 'b', 'c' gets the constant 120
noNa_data = data.fillna({'b': data['b'].mean(), 'c': 120})
# Blank out a sub-block again (rows 1..3 inclusive, columns 'b' and 'c')
noNa_data.loc[1:3, ['b', 'c']] = NA
# Forward-fill along each row (axis=1): a gap takes the value to its left,
# filling at most 2 consecutive gaps.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
noNa_data.ffill(axis=1, limit=2)
# Give column 'd' a starting value in the first row so there is something to interpolate from
noNa_data.loc[0, 'd'] = 1.5
# Linear interpolation down each column; at most 2 consecutive gaps are filled
noNa_data.interpolate(axis=0, limit=2)

Unnamed: 0,a,b,c,d
0,1.0,6.5,3.0,1.5
1,1.0,6.3875,32.25,3.625
2,1.0,6.275,61.5,5.75
3,2.0,,,
4,2.0,6.05,120.0,10.0


# 7.2 Data Transformation
### Removing Duplicates

In [11]:
# Frame with repeated (key1, value) pairs
data = pd.DataFrame({
    'key1': ['one', 'two'] * 3 + ['three'] * 2,
    'value': [1, 2] * 3 + [3] * 2,
})
# True for each row that repeats an earlier row across ALL columns
data.duplicated()
# Rows flagged as repeats of an earlier row (the first occurrence is not flagged)
data[data.duplicated(keep='first')]
# Same frame with the repeats removed, first occurrence kept
noDuplicated = data.drop_duplicates(keep='first')
# 'v1' is different on every row, so no row is a full duplicate any more
data['v1'] = list(range(8))
# Restrict the duplicate check to a subset of the columns
data.drop_duplicates(subset=['key1', 'value'], keep='first')

Unnamed: 0,key1,value,v1
0,one,1,0
1,two,2,1
6,three,3,6


### Transforming Data Using a Function or Mapping

In [12]:
# Foods (mixed capitalisation) with their weight in ounces
data = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
# Lower-case food name -> animal it comes from ('corned beef' has no entry)
meat_to_animal = {
    'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow',
    'honey ham': 'pig', 'nova lox': 'salmon',
}
# Normalise the capitalisation so the names line up with the mapping keys
lower_food = data['food'].str.lower()
# Series.map with a dict looks each value up; names without an entry become NaN
data['animal'] = lower_food.map(meat_to_animal)
#------------------------------------------------------------------
# Alternative: build the 'animal' column as a plain list (None when there is no entry)
food = data['food']
data['animal'] = [meat_to_animal.get(name.lower()) for name in food]
# Normalise every missing cell (None included) to NaN
data[data.isna()] = NA

### Replacing Values

In [13]:
# -999 and 1000 act as sentinel values that should be replaced
data = pd.DataFrame({'a': [10, -999, 1000, 1000, -999, 2], 'b': [10, -999, 1000, 1000, -999, 2]})
# A dict spells out the old -> new pairs
data.replace({-999: NA, 1000: 0})
# Two parallel lists do the same: old values first, replacements second
data.replace([-999, 1000], [NA, 0])
# replace() returns a new object, so assign it back to change a single column
data['a'] = data['a'].replace([-999, 1000], [NA, 0])
# The same substitution written with map().
# map() with a bare dict turns every value that is NOT a key into NaN, which would wipe
# the column; falling back to the value itself keeps unmapped entries unchanged.
replace_dict = {-999: NA, 1000: 0}
data['a'] = data['a'].map(lambda value: replace_dict.get(value, value))

### Renaming Axis Indexes

In [14]:
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
# Index.map applies a function to every label: keep the first 4 characters, upper-cased
# ('Ohio' -> 'OHIO', 'Colorado' -> 'COLO', 'New York' -> 'NEW ')
data.index = data.index.map(lambda x: x[:4].upper())
# rename() accepts a function per axis: upper-case the columns, title-case the index
data = data.rename(columns=str.upper, index=str.title)
# rename() also accepts dicts to change selected labels only.
# The index labels were truncated above, so the New York row is now labelled 'New '
# (with a trailing space); a 'New York' key would match nothing and be silently ignored.
data = data.rename(columns={'ONE': 'OnE', 'THREE': 'Three'},
                   index={'Ohio': 'OhiO', 'New ': 'NewYork'})

### Discretization and Binning
###### Category type

In [15]:
# Discretization: assign each numeric value to the interval (bin) it falls into
num = [10, 20, 30, 40, 50, 19, 18, 2, -3, -10]
# Inspect the extremes to choose bin edges that cover the data (min=-10, max=50)
min(num)
max(num)
# Five edges define four intervals
bins = [-20, 0, 20, 40, 60]
# pd.cut on a list returns a Categorical with one interval per input value
cats = pd.cut(num, bins)
# Number of values per interval
cats.value_counts()
# Integer position of each value's interval
cats.codes
# The intervals themselves
cats.categories
# Use readable labels instead of interval notation
cats = pd.cut(num, bins, labels=['one', 'two', 'three', 'four'])
# An integer `bins` asks for that many equal-width intervals over the data range;
# `precision` is the number of decimals used for the computed edges
data = np.random.randn(20)
cats = pd.cut(data, 5, precision=3, labels=list('abcde'))
cats
#-------------------------------------------------------
# qcut bins by sample quantiles, so every bin receives the same number of values
data = np.random.randn(1000)
# Four quartile bins with labels; the counts confirm the equal split
quartile_cats = pd.qcut(data, 4, labels=list('abcd'), precision=2)
quartile_cats.value_counts()

a    250
b    250
c    250
d    250
dtype: int64

### Detecting and Filtering Outliers 

In [16]:
# 1000 x 4 frame of random integers drawn from [-1000, 1000]
data = pd.DataFrame(
    data=np.array([random.randint(-1000, 1000) for _ in range(4000)]).reshape(1000, 4),
    columns=list('ABCD'), dtype=np.int64)
# Summary statistics for every column
data.describe()
# Columns A, B, C of the rows where |B| > 900
# (a single .loc call instead of chained indexing)
abs1 = data.loc[np.abs(data['B']) > 900, ['A', 'B', 'C']]
# Rows holding at least one value with |value| > 990
abs2 = data[(np.abs(data) > 990).any(axis=1)]
# Same selection through a per-cell threshold frame:
# np.sign gives -1 / 0 / 1, so `sign` holds -990 / 0 / 990 matching each cell's sign
sign = np.sign(data) * 990
# Compare magnitudes on BOTH sides so that values below -990 are caught as well
# (comparing raw `data` against |sign| would only find the positive outliers).
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
abs3 = data[(np.abs(data) > np.abs(sign)).any(axis=1)]
abs3

Unnamed: 0,A,B,C,D
58,499,397,993,225
76,-898,1000,509,405
101,-994,524,735,999
112,740,-131,995,-351
139,302,-252,997,812
...,...,...,...,...
882,236,417,996,245
925,1000,-90,-920,-254
957,-224,996,544,860
966,994,533,723,-448


## Permutation and Random Sampling 

In [17]:
# 20 x 20 frame of random integers in [-10, 10], filled row by row
data = pd.DataFrame(
    data=np.array([[random.randint(-10, 10) for _ in range(20)] for _ in range(20)]))
# Shuffle both axes at once by reindexing with random orderings of the labels
shuffled_columns = np.random.permutation(20)
shuffled_rows = np.random.permutation(20)
data.reindex(index=shuffled_rows, columns=shuffled_columns)
# A permutation can also be stored and reused
permu = np.random.permutation(20)
# take() picks columns by position (axis=1) in the order given by the permutation
data.take(permu, axis=1)


Unnamed: 0,5,7,4,16,19,...,14,8,9,10,0
0,-9,3,3,-5,-3,...,7,-3,-6,-8,3
1,2,10,6,3,-10,...,-3,-10,-7,-10,3
2,0,-1,-7,1,-3,...,1,5,2,8,6
3,2,4,0,4,8,...,6,-2,-3,10,4
4,3,-7,-9,-6,-8,...,1,-4,9,5,-10
...,...,...,...,...,...,...,...,...,...,...,...
15,-2,8,10,6,-8,...,1,0,-5,1,8
16,-7,4,-4,6,-6,...,8,-5,7,-5,-2
17,-4,-4,-5,1,-9,...,6,10,7,1,4
18,-9,8,-2,-7,9,...,0,-7,0,5,2


### Computing Indicator/Dummy Variables 

In [18]:
# 12-row frame: 'key' and 'value' cycle through three labels plus a missing entry
df = pd.DataFrame({
    'key': ['a', 'b', 'c', NA] * 3,
    'value': ['apple', 'banana', 'coconut', NA] * 3,
    'numth': list(range(12)),
})
# One indicator column per distinct value; dummy_na=True adds one for missing values too
key_dummies = pd.get_dummies(df['key'], prefix='key', dummy_na=True)
value_dummies = pd.get_dummies(df['value'], prefix='value', dummy_na=True)
numth_dummies = pd.get_dummies(df['numth'], prefix='numth', dummy_na=True)
# Attach the key/value indicators to the 'numth' column (aligned on the index)
df_dummies = df[['numth']].join([key_dummies, value_dummies])
# Rows where the key is 'a' and numth equals 4
df_dummies[(df_dummies['numth'] == 4) & (df_dummies['key_a'] == 1)]
#-------------------------------------------
# Read movies.dat; the context manager closes the file handle once parsing is done.
# sep='::' is two characters long, which requires the python parsing engine.
with open('Pydata-book/pydata-book-2nd-edition/datasets/movielens/movies.dat') as f:
    movie = pd.read_table(f, sep='::', header=None,
                          names=['movie_id', 'title', 'genres'], engine='python')
# Each 'genres' cell is a '|'-separated string; split it into a list per movie
genres = list(movie['genres'].apply(lambda x: x.split('|')))
# Flatten the per-movie lists into one list of genre names (repeats included)
all_genres = [genre for movie_genres in genres for genre in movie_genres]
# Distinct genre names
unique = set(all_genres)
# One row per movie, one column per genre, all zeros to start with
zero_matrix = np.zeros((len(movie), len(unique)))
# Sets have no defined order, so sort the names to get a reproducible column layout
dummies = pd.DataFrame(zero_matrix, columns=sorted(unique))
# Switch on the indicator of every genre a movie belongs to
for i, gen in enumerate(movie.genres):
    # positions of this movie's genres among the indicator columns
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1
# Attach the indicators to the movie table (aligned on the index)
movie_windic = movie.join(dummies.add_prefix('Genre_'))
movie_windic.iloc[0]
#-------------------------------------------------
# pd.cut followed by pd.get_dummies: bin the numbers, then one-hot encode the bins
data = pd.DataFrame({
    'a': [random.randint(1, 100) for _ in range(10)],
    'b': [random.randint(1, 100) for _ in range(10)],
})
lst_range = [20, 30, 50, 70, 90, 100]
# Replace each column by its binned, labelled version
# (values that fall outside the bin edges become NaN)
for column_name in list(data.columns):
    data[column_name] = pd.cut(data[column_name], lst_range,
                               labels=['one', 'two', 'three', 'four', 'five'])
# Indicator columns for both binned columns, with an extra indicator for NaN.
# NOTE(review): one shared prefix for 'a' and 'b' produces repeated column names.
dummies_cut = pd.get_dummies(data, columns=['a', 'b'], prefix='col', dummy_na=True)

# 7.3 String Manipulation
### String Object Methods

In [19]:
# Sample text to work on
val=' a ,b ,  geníous,  '
# Split on commas, at most 2 times (so at most 3 pieces)
lst_val = val.split(',', 2)
# Remove the surrounding whitespace from every piece
lst_strip = list(map(str.strip, lst_val))
# Substring membership test
',' in val
# Position of the first occurrence; index() raises ValueError when there is none
val.index(',')
# Same lookup with find(), which returns -1 instead of raising
val.find(', ')
# Substitute every occurrence of a substring
val.replace(',', '?')
# Case helpers: lower, upper, title, capitalize
val.capitalize()
# Number of commas in the text
count = val.count(',')
# Unpack the three stripped pieces into separate names
first, second, third = lst_strip
# Concatenate them with ':' by hand ...
first + ':' + second + ':' + third
# ... or let join() do it
':'.join(lst_strip)

'a:b:geníous,'

### Regular Expressions 

In [20]:
text = '::ngoc::thach::son,the,thach'
# Compile each separator once so the pattern object can be reused
regex = re.compile('::')
regex1 = re.compile(',')
# Split on '::' only
regex.split(text)
# Glue the pieces back together with ',' and split again, now on ','
rejoined = ','.join(regex.split(text))
regex1.split(rejoined)
# One pattern with alternation matches either separator: '::' or ','
regex2 = re.compile('|'.join(['::', ',']))
regex2.split(text)    # every piece between separators
regex2.findall(text)  # the separators that were found, in order
# match() only succeeds at the very start of the text ...
regex2.match(text)
# ... while search() returns the first occurrence wherever it is
regex2.search(text)
#----------------------------------------------------------------
# Text with address-like tokens that use unusual separators ($ # @ , . +)
email='''
Ngoc dinhthengoc2021998$gmail,comde
Ngoc1 dinhthengoc1998@yahoo.com
thach dinhthethach4994#google+com
'''
# [A-Z0-9]+  one or more letters/digits
# [@$#]      exactly one of @, $ or #
# [.,+]      exactly one of '.', ',' or '+'
# [A-Z]{2,4} between 2 and 4 letters
pattern = '[A-Z0-9]+[@$#][A-Z0-9]+[.,+][A-Z]{2,4}'
# IGNORECASE lets the A-Z classes match lower-case letters as well
regex3 = re.compile(pattern, re.IGNORECASE)
# Every substring that fits the pattern
regex3.findall(email)
# First occurrence only, returned as a match object
search = regex3.search(email)
search.start()   # offset where the match begins
search.end()     # offset just past the match
search.group(0)  # the matched text, i.e. email[start:end]
# Substitute every match with a fixed string
new_email = regex3.sub('Nothing', email)
#---------------------------------------------------------------
# Parentheses create capture groups, so findall returns one tuple per match:
# (user name, domain name, domain suffix).
# ([A-Z]{0,3}) captures up to 3 letters and may be empty.
new_pattern = '([A-Z0-9._]+)[@#$]([A-Z0-9]+)[,.+]([A-Z]{0,3})'
new_regex = re.compile(new_pattern, re.IGNORECASE)
unpack_email = new_regex.findall(email)
#---------------------------------------------------------
# Split the (user, domain name, domain suffix) tuples into three parallel lists
user = [parts[0] for parts in unpack_email]
domain_name = [parts[1] for parts in unpack_email]
domain_suffix = [parts[2] for parts in unpack_email]
# Way 1: start from an empty frame of the right shape and fill it column by column
df = pd.DataFrame(index=range(len(user)), columns=['user', 'domain_name', 'domain_suffix'])
lst_columns = [user, domain_name, domain_suffix]
for position in range(len(lst_columns)):
    df.iloc[:, position] = lst_columns[position]  # assign a whole column by position
#--------------------------------------------------------
# Way 2: build the frame directly from a dict of column name -> list
df1 = pd.DataFrame(
    data={'user': user, 'domain_name': domain_name, 'domain_suffix': domain_suffix},
    columns=['user', 'domain_name', 'domain_suffix'],
    index=range(len(user)))
#--------------------------------------------------------
# Way 3: build the rows first (one list per address), then hand them to DataFrame
list(map(list, zip(user, domain_name, domain_suffix)))
df = pd.DataFrame(
    data=list(map(list, zip(user, domain_name, domain_suffix))),
    columns=['user', 'domain_name', 'domain_suffix'],
    index=range(len(user)))

### Vectorized String Functions in pandas 

In [26]:
# One e-mail address per person; 'Wes' has none
df = pd.DataFrame(data={'name': {'Dave': 'dave@google.comasd', 'Steve': 'steve@gmail.com',
                                 'Rob': 'rob@gmail.com', 'Wes': NA}})
# Three capture groups: user, domain name, domain suffix (at most 3 letters)
pattern = '([A-Z0-9._]+)[@]([A-Z0-9._]+)[.]([A-Z]{0,3})'
regex = re.compile(pattern=pattern, flags=re.IGNORECASE)
# Rows whose address contains 'gmail' (str() makes the missing entry searchable as 'nan')
df[df['name'].apply(lambda x: 'gmail' in str(x))]
# Rows whose address is missing; isna() tests the value, unlike an `is NA` identity check
df[df['name'].isna()]
# All (user, domain, suffix) tuples found in each address
df['name'].apply(lambda x: regex.findall(str(x)))
# First match object per address; None where nothing matches (including the missing entry)
matches = df['name'].apply(lambda x: regex.search(str(x)))
# Text covered by each match, NaN where there was no match, stored as 'name matches'
matched_text = matches.map(lambda m: NA if m is None else m.group(0))
df = df.join(matched_text.rename('name matches'))
#----------------------------------------------------------
# Break every matched address into user / domain_name / domain_suffix columns
part_columns = ['user', 'domain_name', 'domain_suffix']
# Frame with empty placeholder columns for the three parts
new_df = df.join(pd.DataFrame(columns=part_columns, index=df.index))
# Only rows that actually produced a match can be unpacked
notnull = new_df[new_df['name matches'].notnull()]
# findall returns a list holding one tuple of groups; turn that tuple into a list
unpack = notnull['name matches'].apply(lambda x: list(regex.findall(str(x))[0]))
# Row data for the three part columns, in the same order as `notnull`
data = unpack.tolist()
# Join the parts back on the index; rows without a match keep NaN in the part columns
new_df = df.join(pd.DataFrame(data=data, columns=part_columns, index=notnull.index))
new_df

Unnamed: 0,name,name matches,user,domain_name,domain_suffix
Dave,dave@google.comasd,dave@google.com,dave,google,com
Rob,rob@gmail.com,rob@gmail.com,rob,gmail,com
Steve,steve@gmail.com,steve@gmail.com,steve,gmail,com
Wes,,,,,
