### Data processing
- Data errors exploration
    - categorical outliers
    - numerical outliers
- Data transformation: 
    - Numerical data normalization
    - Text: tokenization, stemming and lemmatization
- Feature engineering: 
    - Numerical
    - Categorical
    - Text


### Data transformation

#### Numerical data normalization

- Min-max scaler (`MinMaxScaler`): rescales each feature to a fixed range, `[0, 1]` by default

In [1]:
import sklearn
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Toy dataset: four samples (rows), two features (columns).
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]
# Spell out the default target range so the goal of the scaling is explicit.
scaler = MinMaxScaler(feature_range=(0, 1))

In [4]:
# Learn the per-feature minimum and maximum from the data; nothing is transformed yet.
scaler.fit(data)

In [5]:
# Per-column minimum and maximum observed during fit.
scaler.data_min_, scaler.data_max_

(array([-1.,  2.]), array([ 1., 18.]))

In [7]:
# Transform the data from its original range to [0, 1]:
# each column is rescaled as (x - column_min) / (column_max - column_min).
scaler.transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [6]:
# Values outside the fitted range are not clipped: 2 in the first column
# (fitted range [-1, 1]) maps to 1.5, i.e. outside [0, 1].
scaler.transform([[2, 2]])

array([[1.5, 0. ]])

- Self-practice with StandardScaler 
    - https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
   

#### Data transformation: text data tokenization, stemming/lemmatization

In [9]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [10]:
# Split the text into sentences (uses the name imported directly from nltk).
sent_tokenize("At eight o'clock on Thursday morning. Arthur didn't feel very good.")

["At eight o'clock on Thursday morning.", "Arthur didn't feel very good."]

In [11]:
# Split the text into word-level tokens; punctuation and the clitic "n't" become separate tokens.
word_tokenize("At eight o'clock on Thursday morning. Arthur didn't feel very good.")

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 '.',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [12]:
from nltk.stem import PorterStemmer

# Porter stemming strips suffixes by rule, so the stem need not be a dictionary word.
ps = PorterStemmer()
word = "leaves"
ps.stem(word)

'leav'

In [13]:
from nltk.stem import WordNetLemmatizer 
# Lemmatization maps a word to its dictionary form ('leaves' -> 'leaf'),
# whereas the Porter stemmer produced the non-word 'leav'.
# NOTE(review): WordNetLemmatizer relies on the NLTK 'wordnet' corpus being installed
# (e.g. via nltk.download('wordnet')) — confirm for fresh environments.
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("leaves")

'leaf'

### Feature engineering

#### Feature engineering: categorical data 

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# handle_unknown='ignore': a category not seen during fit does not raise at transform time.
oh_enc = OneHotEncoder(handle_unknown='ignore')
# One categorical feature (a single column) with three samples.
X = [['Male'], 
     ['Female'], 
     ['Female']]

In [10]:
# Learn the set of categories present in each column.
oh_enc.fit(X)

In [11]:
# One-hot encode the training rows; toarray() converts the result to a dense array for display.
# Output columns follow oh_enc.categories_ (Female, Male).
oh_enc.transform(X).toarray()

array([[0., 1.],
       [1., 0.],
       [1., 0.]])

In [12]:
# Categories learned during fit: one array per input column.
oh_enc.categories_

[array(['Female', 'Male'], dtype=object)]

In [13]:
# Encode new rows with the already-fitted encoder; the column order (Female, Male) is unchanged.
oh_enc.transform([['Female'], ['Male']]).toarray()

array([[1., 0.],
       [0., 1.]])

In [29]:
# Names of the output columns, prefixed with the supplied input feature name 'gender'.
oh_enc.get_feature_names_out(['gender'])

array(['gender_Female', 'gender_Male'], dtype=object)

In [30]:
# Map one-hot rows back to the original category labels.
oh_enc.inverse_transform([[0, 1], [1, 0]])

array([['Male'],
       ['Female']], dtype=object)

- Ordinal encoder

In [31]:
from sklearn.preprocessing import OrdinalEncoder
# OrdinalEncoder replaces each category with a numeric code, producing one output column per input column.
od_enc = OrdinalEncoder()

In [32]:
# Two categorical columns: a string label and an integer level.
# Note: this rebinds X, which previously held the one-hot encoder input.
X = [['Male', 1], 
     ['Female', 3], 
     ['Female', 2]]

In [33]:
# Learn the categories of each column.
od_enc.fit(X)

OrdinalEncoder()

In [34]:
# Categories learned per column; a value's position in its array is the code it is encoded to.
od_enc.categories_

[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

In [35]:
import numpy as np
# A plain NumPy array cannot hold mixed-type rows: the integers are coerced to strings (dtype '<U21').
np.array(X)

array([['Male', '1'],
       ['Female', '3'],
       ['Female', '2']], dtype='<U21')

In [36]:
# Fit and encode in one step; each value becomes its index in od_enc.categories_
# (e.g. 'Male' -> 1, level 3 -> 2).
od_enc.fit_transform(X)

array([[1., 0.],
       [0., 2.],
       [0., 1.]])

In [37]:
# Encode new rows with the fitted mapping ('Female' -> 0, 'Male' -> 1; 1 -> 0, 3 -> 2).
od_enc.transform([['Female', 3], ['Male', 1]])

array([[0., 2.],
       [1., 0.]])

#### Feature engineering: text data

- CountVectorizer
    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Toy corpus: four short documents, one string per document.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [16]:
# Bag-of-words vectorizer with default settings.
vectorizer = CountVectorizer()

In [19]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
# Note: this rebinds X, which earlier held the encoder input lists.
X = vectorizer.fit_transform(corpus)
# Explore other parameter settings: lowercase, stop_words, ngram_range, max_df, min_df, binary
# Shape is (number of documents, vocabulary size).
X.shape

(4, 9)

In [18]:
# Vocabulary terms (lower-cased, punctuation removed), in the column order of the count matrix.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [44]:
# Dense view of the counts: one row per document, one column per vocabulary term
# (e.g. 'document' occurs twice in the second document).
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [45]:
# Shape check: 4 documents x 9 vocabulary terms.
X.shape

(4, 9)

- TfidfVectorizer
    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# Same four documents as in the CountVectorizer example above.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [24]:
# Rebinds `vectorizer` (previously a CountVectorizer) to a TfidfVectorizer with default settings.
vectorizer = TfidfVectorizer()

- Self-practice: explore other parameter settings in TfidfVectorizer:
    - E.g., `lowercase`, `stop_words`, `ngram_range`, `max_df`, `min_df`, `binary`, `use_idf`, `smooth_idf`

In [25]:
# Learn the vocabulary from the corpus and return the TF-IDF matrix.
# Note: this rebinds X, which previously held the raw count matrix.
X = vectorizer.fit_transform(corpus)

In [26]:
# Vocabulary terms; the same nine terms as found by CountVectorizer on this corpus.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [27]:
# Shape is (number of documents, vocabulary size).
print(X.shape)

(4, 9)


In [28]:
# Dense TF-IDF weights. Terms that occur in every document ('is', 'the', 'this')
# get lower weights than rarer terms such as 'first' or 'second'.
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

- Word embedding

In [68]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' word vectors through gensim's downloader.
# NOTE(review): the first call downloads the model (a large file) and caches it locally,
# so expect this cell to be slow on a fresh machine.
wv = api.load('word2vec-google-news-300')



In [75]:
# Difference vector king - man, rounded to 2 decimals to keep the display readable.
np.round(wv['king'] - wv['man'],2)

array([-0.2 , -0.1 , -0.03,  0.22, -0.12,  0.01,  0.31, -0.21, -0.09,
        0.36, -0.27, -0.05, -0.09,  0.13, -0.07, -0.21,  0.13, -0.05,
        0.02,  0.23,  0.27,  0.11,  0.03,  0.41,  0.04,  0.01, -0.17,
       -0.2 , -0.03,  0.01, -0.06, -0.07,  0.19,  0.42, -0.23, -0.17,
       -0.21,  0.13, -0.  , -0.15,  0.11, -0.19,  0.07,  0.15,  0.12,
        0.11, -0.03,  0.02, -0.  , -0.02, -0.03,  0.03, -0.32,  0.2 ,
       -0.24,  0.09, -0.07, -0.01, -0.04, -0.01,  0.01,  0.14,  0.05,
        0.15,  0.01, -0.18, -0.07, -0.02, -0.25,  0.31,  0.04, -0.09,
        0.1 ,  0.05,  0.09, -0.11, -0.12,  0.11,  0.15, -0.08, -0.16,
        0.18, -0.11,  0.17,  0.28, -0.06, -0.01,  0.03, -0.02,  0.09,
        0.38, -0.11, -0.19, -0.14, -0.09,  0.06,  0.11, -0.07,  0.04,
       -0.08,  0.17,  0.21, -0.13, -0.24, -0.51,  0.31, -0.64, -0.01,
        0.14,  0.13,  0.24,  0.22,  0.01, -0.12,  0.33, -0.23,  0.08,
       -0.18,  0.12,  0.07,  0.12,  0.13,  0.06, -0.06,  0.19, -0.02,
        0.05, -0.02,

In [70]:
# Difference vector queen - woman, rounded to 2 decimals for comparison with king - man.
# NOTE(review): the saved output below shows unrounded values, so it appears to be stale
# (from an earlier version of this cell) — re-run to refresh.
np.round(wv['queen'] - wv['woman'], 2)

array([-0.23791504, -0.06640625,  0.03369141,  0.23095703,  0.01367188,
        0.01855469,  0.04296875, -0.24243164, -0.30908203,  0.00708008,
        0.02148438, -0.03710938,  0.16699219,  0.12418365,  0.15991211,
        0.01855469,  0.2602539 ,  0.0065918 ,  0.00341797,  0.12475586,
        0.21948242,  0.18481445, -0.18115234,  0.28955078,  0.05529785,
        0.17919922, -0.13769531, -0.171875  , -0.16113281,  0.04492188,
       -0.2998047 , -0.12463379,  0.1430664 ,  0.31835938, -0.23925781,
       -0.2548828 , -0.21044922,  0.3125    ,  0.00097656, -0.015625  ,
        0.26416016, -0.38793945,  0.11132812, -0.00341797,  0.04589844,
       -0.02294922,  0.072052  ,  0.11010742,  0.18688965,  0.11523438,
       -0.07043457,  0.09702301, -0.00268555, -0.07080078, -0.0859375 ,
        0.01318359, -0.25146484, -0.10253906, -0.08886719, -0.10565186,
       -0.1640625 ,  0.10351562, -0.00439453,  0.03466797,  0.13452148,
       -0.2993164 , -0.10662842, -0.02734375, -0.3828125 ,  0.19

In [74]:
# Import the function explicitly: `import sklearn` alone does not load the
# `sklearn.metrics.pairwise` submodule, so attribute access through the top-level
# package only works if some other import happened to load it first.
from sklearn.metrics.pairwise import cosine_similarity

# Compare the two offset vectors (king - man vs. queen - woman).
# cosine_similarity expects 2-D input, hence the single-row lists; the result is a 1x1 matrix.
cosine_similarity([wv['king'] - wv['man']], [wv['queen'] - wv['woman']])

array([[0.7580352]], dtype=float32)