# Imputation for missing values

## KNN Imputer

In [2]:
import numpy as np
from sklearn.impute import KNNImputer

In [3]:
# Toy matrix with two gaps: row 0 is missing its last feature,
# row 2 is missing its first feature.
X = [
    [1, 2, np.nan],
    [3, 4, 3],
    [np.nan, 6, 5],
    [8, 8, 7],
]
# Each gap is filled from the 2 nearest rows.
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(X)

array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

## Simple Imputer

In [10]:
import numpy as np
from sklearn.impute import SimpleImputer

In [12]:
# Learn the per-column means from a small training matrix (NaNs are ignored
# when the mean is computed), then use them to fill the gaps in a new matrix.
train_rows = [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(train_rows)

X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
print(imp_mean.transform(X))

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   3.5  9. ]]


In [13]:
# fit_transform re-fits imp_mean on X itself, replacing the means learned in the
# previous cell: column 1 of X has a single observed value (2), so both gaps in
# that column become 2 here instead of 3.5.
imp_mean.fit_transform(X)

array([[ 7.,  2.,  3.],
       [ 4.,  2.,  6.],
       [10.,  2.,  9.]])

## Nominal Variables

In [16]:
from sklearn import preprocessing
# One-hot encode a nominal (unordered) variable: one indicator column per label.
answers = ['yes', 'no', 'no', 'yes', 'maybe']
lb = preprocessing.LabelBinarizer()
lb.fit_transform(answers)

array([[0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0]])

In [17]:
# The labels seen during fitting; their order here is the column order of the
# indicator matrix above ('maybe' -> column 0, 'no' -> column 1, 'yes' -> column 2).
lb.classes_

array(['maybe', 'no', 'yes'], dtype='<U5')

## One-hot encoding with pandas

In [19]:
import pandas as pd
# Small example frame: two string-valued columns (X, Y) and one numeric column (Z).
df = pd.DataFrame(
    dict(
        X=['a', 'b', 'a'],
        Y=['B', 'A', 'C'],
        Z=[1, 2, 3],
    )
)
df

Unnamed: 0,X,Y,Z
0,a,B,1
1,b,A,2
2,a,C,3


In [20]:
# One-hot encode the string columns X and Y; the numeric column Z passes through
# unchanged. `prefix` supplies one name prefix per encoded column, in column order.
# dtype=int keeps the indicators as 0/1: pandas >= 2.0 would otherwise emit
# True/False columns, which no longer matches the table recorded below.
pd.get_dummies(df, prefix=['colX', 'colYO'], dtype=int)

Unnamed: 0,Z,colX_a,colX_b,colYO_A,colYO_B,colYO_C
0,1,1,0,0,1,0
1,2,0,1,1,0,0
2,3,1,0,0,0,1


## Ordinal Variables

In [22]:
dataframe = pd.DataFrame({
    'Score': ['Low', 'Low', 'Medium', 'Medium', 'High', 'Barely more than medium']
})
# Ordinal encoding: each category is mapped to its rank so that the order
# Low < Medium < Barely more than medium < High is preserved numerically.
scale_mapper = {
    'Low': 1,
    'Medium': 2,
    'Barely more than medium': 3,
    'High': 4
}
# Use `map` rather than `replace`: replacing strings with ints relies on pandas
# silently downcasting the object column, which is deprecated (FutureWarning in
# pandas 2.2). `map` builds an integer Series directly.
# NOTE: a category missing from `scale_mapper` becomes NaN under `map`
# (`replace` would have left the original string in place).
score_encoded = dataframe['Score'].map(scale_mapper)
score_encoded

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

## Text data

In [23]:
# Deliberately messy sample strings: stray leading/trailing whitespace,
# periods, and inconsistent capitalisation ("By" vs "by") to clean up below.
text_data = [
    '   Interrobang. By Aishwarya Henriette     ',
    'Parking and going. by Karl Gautiere',
    '   Today is the night. By Jarek Prakash'
]

In [26]:
# Trim leading and trailing whitespace from every string.
strip_whitespace = list(map(str.strip, text_data))
strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking and going. by Karl Gautiere',
 'Today is the night. By Jarek Prakash']

In [27]:
# Delete every '.' character from the whitespace-trimmed strings.
remove_periods = [line.replace('.', '') for line in strip_whitespace]
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking and going by Karl Gautiere',
 'Today is the night By Jarek Prakash']

In [28]:
def capitalizer(string: str) -> str:
    """Return ``string`` with every character converted to upper case."""
    uppercased = string.upper()
    return uppercased

In [30]:
# Apply the custom transformation to every cleaned string.
capitalized = list(map(capitalizer, remove_periods))
capitalized

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIERE',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

## Regular expressions

In [35]:
import re

In [36]:
def replace_letters_with_x(string: str) -> str:
    """Mask every ASCII letter in ``string`` with an upper-case ``X``.

    Digits, whitespace, punctuation and non-ASCII characters are left untouched.
    """
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub("X", string)

In [37]:
# Mask the letters of every upper-cased string; spaces are kept.
list(map(replace_letters_with_x, capitalized))

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

In [38]:
text = 'Hi! this is just and example'
# Strip punctuation: remove every character that is neither a word character
# (\w) nor whitespace (\s).
non_word_or_space = re.compile(r'[^\w\s]')
non_word_or_space.sub('', text)

'Hi this is just and example'

## Beautiful soup

# Stop words

In [39]:
# Import the corpus reader under an alias so that binding the word list to
# `stopwords` does not shadow the reader itself (after the original rebinding,
# any later `stopwords.words(...)` call would fail on a plain list).
from nltk.corpus import stopwords as stopwords_corpus
# `stopwords` stays a list of English stop words, as the following cells expect.
stopwords = stopwords_corpus.words('english')

In [40]:
# Peek at the first ten entries of the stop-word list.
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [57]:
# Example sentence for stop-word removal; this rebinds `text`, which the
# regular-expression example earlier in the notebook also used.
text = 'A sentence is a textual unit consisting of words that are grammatically linked.'

In [58]:
from nltk.tokenize import word_tokenize
# Lower-case first so the tokens can be compared against the lower-case stop-word list.
lowered_text = text.lower()
tokenized_word = word_tokenize(lowered_text)
tokenized_word

['a',
 'sentence',
 'is',
 'a',
 'textual',
 'unit',
 'consisting',
 'of',
 'words',
 'that',
 'are',
 'grammatically',
 'linked',
 '.']

In [59]:
# Build a set once so each membership test is O(1) instead of scanning the
# whole stop-word list for every token; the filtered result is identical.
stopword_set = set(stopwords)
# Keep only tokens that are not stop words (punctuation tokens such as '.' are
# not stop words, so they are kept).
filtered_tokens = [token for token in tokenized_word if token not in stopword_set]
filtered_tokens

['sentence',
 'textual',
 'unit',
 'consisting',
 'words',
 'grammatically',
 'linked',
 '.']

### Sentence splitting

In [60]:
from nltk.tokenize import sent_tokenize

# The sample contains the abbreviation "Mr." to show that the sentence
# tokenizer does not treat that period as a sentence boundary.
samplestring = "Let's make this our sample paragraph. It even knows the the period in Mr. Jones is not the end. Try it out!"

tokenized_sent = sent_tokenize(samplestring)
tokenized_sent

["Let's make this our sample paragraph.",
 'It even knows the the period in Mr. Jones is not the end.',
 'Try it out!']

### Word tokenizing

In [61]:
# Baseline: a plain whitespace split. Punctuation stays glued to the
# neighbouring word ('everyone!', '10mins.', 'ASAP!').
msg = 'Hey everyone! The party starts in 10mins. Be there ASAP!'
msg.split()

['Hey',
 'everyone!',
 'The',
 'party',
 'starts',
 'in',
 '10mins.',
 'Be',
 'there',
 'ASAP!']

In [63]:
from nltk.tokenize import word_tokenize
# Unlike str.split, word_tokenize emits punctuation as separate tokens
# ('everyone', '!'). Note: this rebinds `tokenized_word`, which the stop-word
# example earlier in the notebook also assigned.
tokenized_word = word_tokenize(msg)
tokenized_word

['Hey',
 'everyone',
 '!',
 'The',
 'party',
 'starts',
 'in',
 '10mins',
 '.',
 'Be',
 'there',
 'ASAP',
 '!']