# cat2vec

## Aim of the Notebook
The main idea of this notebook is to develop a snippet of code to transform categorical features into embeddings using the Gensim Word2Vec library. Below is an initial draft of the methods that will be used:

### Method 1: `apply_w2v`
This method takes a list of sentences, a trained Word2Vec model, and the number of embedding features as input. It returns one vector per sentence: the average of the word vectors of that sentence's in-vocabulary tokens.

```python
def apply_w2v(sentences, model, num_features):
    # ...
```

### Method 2: `gen_cat2vec_sentences`
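This method takes a DataFrame and returns a list of sentences. Each sentence corresponds to one row and is a list of tokens of the form `"<column> <value>"`, one per column, so that equal values in different columns stay distinct.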

```python
def gen_cat2vec_sentences(df):
    # ...
```

### Method 3: `fit_cat2vec_model`
This method takes a DataFrame, the number of features, and the window size for the Word2Vec model. It returns a trained Word2Vec model.

```python
def fit_cat2vec_model(df, n_cat2vec_features, n_cat2vec_window):
    # ...
```

We will load a toy classification dataset and create an example using a neural network model.

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the Titanic dataset (OpenML, version 1) as pandas objects
data = fetch_openml('titanic', version=1, as_frame=True)
df = data['data']
# Attach the label as a 'target' column so features and label live in one frame
df['target'] = data['target']

# Split the dataset into training (80%) and test (20%) sets; the fixed
# random_state keeps the split the same from run to run
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Show the first few rows of the training data
df_train.head()

In [None]:
from gensim.models import Word2Vec
from random import shuffle
import numpy as np

def apply_w2v(sentences, model, num_features):
    """Embed each sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentences: Iterable of sentences, each an iterable of tokens.
        model: Trained model exposing ``model.wv`` with an ``index_to_key``
            vocabulary list and ``model.wv[token]`` vector lookup.
        num_features: Length of each word vector (the model's vector size).

    Returns:
        A float64 array of shape ``(n_sentences, num_features)``. A sentence
        with no known token maps to an all-zero row. Empty input yields an
        array of shape ``(0, num_features)`` so the result can always be
        stacked next to other feature matrices.
    """
    # Build the vocabulary set once; membership tests are then O(1) per token.
    vocabulary = set(model.wv.index_to_key)

    def _average_word_vectors(words):
        # Accumulate in float64 regardless of the dtype the model stores.
        total = np.zeros((num_features,), dtype="float64")
        n_known = 0
        for word in words:
            if word in vocabulary:
                total = total + model.wv[word]
                n_known += 1
        # Leave the zero vector untouched when nothing was found,
        # which also avoids a division by zero.
        return total / n_known if n_known else total

    feats = [_average_word_vectors(sentence) for sentence in sentences]
    if not feats:
        # np.array([]) would have shape (0,), losing the feature dimension.
        return np.zeros((0, num_features), dtype="float64")
    return np.array(feats)

def gen_cat2vec_sentences(df):
    """Turn every row of ``df`` into a "sentence" of column-tagged tokens.

    Each cell becomes the string ``"<column> <value>"``; missing cells become
    ``"<column> unknow"``. Prefixing with the column name keeps equal values
    from different columns apart in the Word2Vec vocabulary.

    Args:
        df: DataFrame whose columns are all treated as categorical features.
            It is not modified.

    Returns:
        A list with one entry per row, each a list of token strings in
        column order.
    """
    X_w2v = df.copy(deep=True)
    for c in list(X_w2v.columns.values):
        # Cast to object first: calling fillna with a new label on a column
        # that is already categorical raises, and assigning to
        # ``.cat.categories`` is no longer supported by recent pandas.
        col = X_w2v[c].astype(object)
        col = col.where(col.notna(), 'unknow')
        X_w2v[c] = [f"{c} {g}" for g in col]
    return X_w2v.values.tolist()

def fit_cat2vec_model(df, n_cat2vec_features, n_cat2vec_window,
                      sample_frac=0.6, random_state=None):
    """Train a Word2Vec model on category "sentences" built from ``df``.

    Args:
        df: DataFrame of categorical features; each row becomes one sentence
            via ``gen_cat2vec_sentences``.
        n_cat2vec_features: Embedding size, passed to Word2Vec as
            ``vector_size``.
        n_cat2vec_window: Context window, passed to Word2Vec as ``window``.
        sample_frac: Fraction of rows sampled for training. Defaults to 0.6,
            the value that used to be hard-coded.
        random_state: Optional int seed for the row sample and the token
            shuffle. ``None`` (default) keeps both unseeded. It is not
            forwarded to Word2Vec, so training itself is not made
            deterministic by this argument.

    Returns:
        The trained ``Word2Vec`` model.
    """
    sampled = df.sample(frac=sample_frac, random_state=random_state)
    sentences = gen_cat2vec_sentences(sampled)

    if random_state is None:
        shuffle_in_place = shuffle
    else:
        # Dedicated generator so seeding does not disturb the global one.
        from random import Random
        shuffle_in_place = Random(random_state).shuffle

    # Column order carries no meaning; shuffling tokens within each row
    # presumably keeps the fixed-size context window from always pairing
    # the same neighbouring columns.
    for sentence in sentences:
        shuffle_in_place(sentence)

    return Word2Vec(sentences, vector_size=n_cat2vec_features,
                    window=n_cat2vec_window)