## Word embeddings of the data

In [1]:
import os

import gensim
from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

from topicViz.textinjupyter import markdown_scape_chars
from topicViz.embedding import NLPProcessor

In [2]:
# directory that holds the datasets and the pre-trained embedding model
data_directory = '/home/edgar/topic-visualization/data'
# full path of the word2vec binary inside data_directory
model_path = data_directory + '/SO_vectors_200.bin'

In [3]:
# download and load NLP model
if not os.path.exists(model_path):
    print('Downloading models...')
    
    !wget -O /home/edgar/topic-visualization/data/SO_vectors_200.bin https://zenodo.org/record/1199620/files/SO_vectors_200.bin?download=1

    print('Models downloaded.')

else:

    print('Models already downloaded.')


Models already downloaded.


In [4]:
# read the pre-trained vectors (word2vec format, binary file) from model_path
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [5]:
# fetch the NLTK packages this notebook relies on
for nltk_resource in ('punkt', 'stopwords'):
    download(nltk_resource)

# English stopword list, later handed to remove_stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /home/edgar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/edgar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# read both cleaned datasets, each indexed by its 'Id' column
html_clean_df, text_html_clean_df = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        "data/html_cleaned.csv",
        "data/text_html_cleaned.csv",
    )
)

In [7]:
# preview the first rows of html_clean_df as a quick sanity check of the load
html_clean_df.head()

Unnamed: 0_level_0,title,body,tags,creationdate,y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
34552656,Java: Repeat Task Every Random Seconds,I'm already familiar with repeating tasks ever...,,2016-01-01 00:21:59,LQ_CLOSE
34553034,Why are Java Optionals immutable?,I'd like to understand why Java 8 Optionals we...,,2016-01-01 02:03:20,HQ
34553174,Text Overlay Image with Darkened Opacity React...,I am attempting to overlay a title over an ima...,,2016-01-01 02:48:24,HQ
34553318,Why ternary operator in swift is so picky?,"The question is very simple, but I just could ...",,2016-01-01 03:30:17,HQ
34553755,hide/show fab with scale animation,I'm using custom floatingactionmenu. I need to...,,2016-01-01 05:21:48,HQ


### NLTK preprocessing

In [8]:
# project helper from topicViz.embedding; its tokenize() and
# remove_stopwords() methods are used in the cells that follow
nltk_processor = NLPProcessor()

In [9]:
# For each dataset: build the full post text (title, a space, then body)
# and drop, in place, the rows whose combined text is NaN.
for posts_frame in (html_clean_df, text_html_clean_df):
    posts_frame['post'] = posts_frame['title'] + ' ' + posts_frame['body']
    posts_frame.dropna(subset=['post'], inplace=True)

In [10]:
# Show every preprocessing stage for one sample post, first taken from
# html_clean_df and then from text_html_clean_df.
n_row = 26052  # positional (iloc) index of the sample post
for frame_number, sample_source in enumerate(
    (html_clean_df, text_html_clean_df)
):
    if frame_number > 0:
        # separator line between the two datasets
        print('#'*70)

    text_row = sample_source.iloc[n_row]['post']
    tokenized_row = nltk_processor.tokenize(text_row)
    no_stopwords_row = nltk_processor.remove_stopwords(
        tokenized_text=tokenized_row,
        stopwords=stop_words
    )

    # raw text, then token count and tokens, then the same after
    # stopword removal
    markdown_scape_chars(text_row)
    print(len(tokenized_row))
    markdown_scape_chars(tokenized_row)
    print(len(no_stopwords_row))
    markdown_scape_chars(no_stopwords_row)

```
"I need to know, is it possible to change some product's IP address using python scripts? If possible then how ? Includes Printer and other devices i am currently on some project. I just want to know if we have more than one products  and need to assign IP address to them **automatically** using python , is this possible. If yes please help me ."
```

74


```
['I', 'need', 'to', 'know', ',', 'is', 'it', 'possible', 'to', 'change', 'some', 'product', "'s", 'IP', 'address', 'using', 'python', 'scripts', '?', 'If', 'possible', 'then', 'how', '?', 'Includes', 'Printer', 'and', 'other', 'devices', 'i', 'am', 'currently', 'on', 'some', 'project', '.', 'I', 'just', 'want', 'to', 'know', 'if', 'we', 'have', 'more', 'than', 'one', 'products', 'and', 'need', 'to', 'assign', 'IP', 'address', 'to', 'them', '*', '*', 'automatically', '*', '*', 'using', 'python', ',', 'is', 'this', 'possible', '.', 'If', 'yes', 'please', 'help', 'me', '.']
```

47


```
['I', 'need', 'know', ',', 'possible', 'change', 'product', "'s", 'IP', 'address', 'using', 'python', 'scripts', '?', 'If', 'possible', '?', 'Includes', 'Printer', 'devices', 'currently', 'project', '.', 'I', 'want', 'know', 'one', 'products', 'need', 'assign', 'IP', 'address', '*', '*', 'automatically', '*', '*', 'using', 'python', ',', 'possible', '.', 'If', 'yes', 'please', 'help', '.']
```

######################################################################


```
"I need to know, is it possible to change some product's IP address using python scripts? If possible then how ? Includes Printer and other devices i am currently on some project  i just want to know if we have more than one products and need to assign ip address to them **automatically** using python   is this possible  if yes please help me  "
```

70


```
['I', 'need', 'to', 'know', ',', 'is', 'it', 'possible', 'to', 'change', 'some', 'product', "'s", 'IP', 'address', 'using', 'python', 'scripts', '?', 'If', 'possible', 'then', 'how', '?', 'Includes', 'Printer', 'and', 'other', 'devices', 'i', 'am', 'currently', 'on', 'some', 'project', 'i', 'just', 'want', 'to', 'know', 'if', 'we', 'have', 'more', 'than', 'one', 'products', 'and', 'need', 'to', 'assign', 'ip', 'address', 'to', 'them', '*', '*', 'automatically', '*', '*', 'using', 'python', 'is', 'this', 'possible', 'if', 'yes', 'please', 'help', 'me']
```

41


```
['I', 'need', 'know', ',', 'possible', 'change', 'product', "'s", 'IP', 'address', 'using', 'python', 'scripts', '?', 'If', 'possible', '?', 'Includes', 'Printer', 'devices', 'currently', 'project', 'want', 'know', 'one', 'products', 'need', 'assign', 'ip', 'address', '*', '*', 'automatically', '*', '*', 'using', 'python', 'possible', 'yes', 'please', 'help']
```

In [11]:
# Tokenize every post of html_clean_df, then store a second column holding
# the same tokens with stopwords removed.
# NOTE(review): in the sample output shown earlier, 'I' and 'If' survive
# stopword removal, so the match looks case-sensitive -- confirm intended.
html_clean_df['tokenizedPost'] = html_clean_df['post'].apply(
    lambda post: nltk_processor.tokenize(post)
)
html_clean_df['noStopWordsTokenizedPost'] = html_clean_df['tokenizedPost'].apply(
    lambda tokens: nltk_processor.remove_stopwords(tokens, stopwords=stop_words)
)

In [13]:
# Same two steps for text_html_clean_df: tokenize, then remove stopwords.
text_html_clean_df['tokenizedPost'] = text_html_clean_df['post'].apply(
    lambda post: nltk_processor.tokenize(post)
)

# NOTE(review): the column name below is spelled 'noSop...' (missing "t"),
# unlike html_clean_df's 'noStopWordsTokenizedPost'. It is kept as-is because
# the embedding cell reads it under this exact name; rename both together.
text_html_clean_df['noSopWordsTokenizedPost'] = text_html_clean_df['tokenizedPost'].apply(
    lambda tokens: nltk_processor.remove_stopwords(tokens, stopwords=stop_words)
)

### Vector representation

In [35]:
# Vector representation of the post column: one mean word vector per post,
# stacked into a (posts x dimensions) matrix and saved next to the matching
# DataFrame index so rows can be mapped back to post Ids.
n_row = html_clean_df.shape[0]
n_col = 200 # gensim model dimensionality

# No pre-allocation: np.stack builds the result array itself, so the former
# np.empty buffers were discarded immediately (and the second one was sized
# from html_clean_df rather than text_html_clean_df).
embedding_html_clean = np.stack(
    html_clean_df['noStopWordsTokenizedPost'].apply(
        model.get_mean_vector,
    )
)
# fail early if the matrix would not line up with the index saved below
assert embedding_html_clean.shape == (n_row, n_col)

np.save('data/embedding_html_clean.npy', embedding_html_clean)
np.save(
    'data/index_embedding_html_clean.npy',
    html_clean_df.index.values
)

embedding_text_html_clean = np.stack(
    text_html_clean_df['noSopWordsTokenizedPost'].apply(
        model.get_mean_vector,
    )
)
# this frame may have a different number of rows than html_clean_df,
# so it is checked against its own length
assert embedding_text_html_clean.shape == (
    text_html_clean_df.shape[0],
    n_col,
)

np.save(
    'data/embedding_text_html_clean.npy',
    embedding_text_html_clean
)
np.save(
    'data/index_embedding_text_html_clean.npy',
    text_html_clean_df.index.values
)