## Word embeddings of the data

In [1]:
import os

import gensim
from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

from topicViz.textinjupyter import markdown_scape_chars
from topicViz.embedding import NLPProcessor

In [2]:
# Location of the project data folder and of the word-vector file inside it.
data_directory = '/home/edgar/topic-visualization/data'
model_path = os.path.join(data_directory, 'SO_vectors_200.bin')

In [3]:
# download and load NLP model
if not os.path.exists(model_path):
    print('Downloading models...')
    
    !wget -O /home/edgar/topic-visualization/data/SO_vectors_200.bin https://zenodo.org/record/1199620/files/SO_vectors_200.bin?download=1

    print('Models downloaded.')

else:

    print('Models already downloaded.')


Models already downloaded.


In [4]:
# Read the binary word2vec-format vectors at `model_path` into gensim.
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [5]:
# Fetch the NLTK packages this notebook relies on, then grab the English
# stop-word list used for filtering tokens later on.
for package in ('punkt', 'stopwords'):
    download(package)
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /home/edgar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/edgar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Read both cleaned post tables, using the 'Id' column as the row index.
html_clean_df, text_html_clean_df = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ("data/html_clean.csv", "data/text_html_clean.csv")
)

In [7]:
# html_clean_df.head()

### NLTK preprocessing

In [8]:
# Project helper (topicViz.embedding) whose `tokenize` and `remove_stopwords`
# methods are applied to the posts in the cells below.
nltk_processor = NLPProcessor()

In [9]:
# For each table: build the full post text as "<title> <body>", then discard
# rows where that concatenation is NaN.
for frame in (html_clean_df, text_html_clean_df):
    frame['post'] = frame['title'] + ' ' + frame['body']
    frame.dropna(subset=['post'], inplace=True)

In [10]:
# n_row = 26052
# text_row = html_clean_df.iloc[n_row]['post']
# tokenized_row = nltk_processor.tokenize(text_row)
# no_stopwords_row = nltk_processor.remove_stopwords(
#     tokenized_text=tokenized_row,
#     stopwords=stop_words
# )

# markdown_scape_chars(text_row)
# print(len(tokenized_row))
# markdown_scape_chars(tokenized_row)
# print(len(no_stopwords_row))
# markdown_scape_chars(no_stopwords_row)

# print('#'*70)
# text_row = text_html_clean_df.iloc[n_row]['post']
# tokenized_row = nltk_processor.tokenize(text_row)
# no_stopwords_row = nltk_processor.remove_stopwords(
#     tokenized_text=tokenized_row,
#     stopwords=stop_words
# )

# markdown_scape_chars(text_row)
# print(len(tokenized_row))
# markdown_scape_chars(tokenized_row)
# print(len(no_stopwords_row))
# markdown_scape_chars(no_stopwords_row)

In [11]:
# Tokenize every post, then strip the stop words from the resulting tokens.
html_clean_df['tokenizedPost'] = html_clean_df['post'].apply(nltk_processor.tokenize)

html_clean_df['noStopWordsTokenizedPost'] = html_clean_df['tokenizedPost'].apply(
    lambda tokens: nltk_processor.remove_stopwords(tokens, stopwords=stop_words)
)

In [12]:
# (rows, columns) after dropping NaN posts and adding the token columns
html_clean_df.shape

(44999, 8)

In [13]:
# Tokenize every post, then strip the stop words from the resulting tokens.
text_html_clean_df['tokenizedPost'] = text_html_clean_df['post'].apply(
    nltk_processor.tokenize
)

# NOTE(review): 'noSopWordsTokenizedPost' is misspelled (html_clean_df uses
# 'noStopWordsTokenizedPost'). It is kept as-is because the embedding cell
# reads the column under this exact name; rename both places together.
text_html_clean_df['noSopWordsTokenizedPost'] = text_html_clean_df['tokenizedPost'].apply(
    lambda tokens: nltk_processor.remove_stopwords(tokens, stopwords=stop_words)
)

In [14]:
# (rows, columns) after dropping NaN posts and adding the token columns
text_html_clean_df.shape

(44999, 8)

### Vector representation

In [15]:
# # vector representation of post column
# n_row = html_clean_df.shape[0]
# n_col = 200 # gensim model dimensionality

# embedding_html_clean = np.empty((n_row, n_col))
# embedding_text_html_clean = np.empty((n_row, n_col))

# One row per post: the model's mean vector over the post's filtered tokens.
embedding_html_clean = np.stack([
    model.get_mean_vector(tokens)
    for tokens in html_clean_df['noStopWordsTokenizedPost']
])

# Save the matrix and, separately, the post Ids in the same row order.
np.save('data/embedding_html_clean.npy', embedding_html_clean)
np.save('data/index_embedding_html_clean.npy', html_clean_df.index.values)

embedding_html_clean.shape


(44999, 200)

In [16]:
# One row per post: the model's mean vector over the post's filtered tokens.
# The column name below is spelled exactly as it was created upstream.
embedding_text_html_clean = np.stack([
    model.get_mean_vector(tokens)
    for tokens in text_html_clean_df['noSopWordsTokenizedPost']
])

# Save the matrix and, separately, the post Ids in the same row order.
np.save('data/embedding_text_html_clean.npy', embedding_text_html_clean)
np.save('data/index_embedding_text_html_clean.npy', text_html_clean_df.index.values)

embedding_text_html_clean.shape

(44999, 200)