In [18]:
#Libraries

from sklearn.cluster import KMeans
from pandas import DataFrame
from sklearn.metrics import pairwise_distances_argmin_min
import numpy as np
import io
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords

In [4]:
#Text preparation functions

def delete_numbers(text):
    #input: text as string
    #output: string as original text without numbers
    new_text = ''.join([i for i in text if not i.isdigit()])
    return new_text

def tokenization_sent(text):
    #input: text as string
    #output: list of sentences
    return nltk.sent_tokenize(text)

def tokenization_words(sentence):
    #input: sentence as string
    #output: list of words
    return nltk.word_tokenize(sentence)

def delete_punct(text):
    #input: text as string
    #output: string as original text without punctuation
    return text.translate(str.maketrans('', '', string.punctuation))

def delete_stopwords(text, stopwords):
    #input: text as string, stopwords as list of stopwords
    #output: string as original text without stopwords
    new_text = [word for word in tokenization_words(text) if word not in stopwords]
    return new_text

In [7]:
#Load data

en_data = pd.read_csv('en_data.csv')
et_data = pd.read_csv('et_data.csv')
for i in en_data.columns:
    if 'Unnamed' in i:
        del en_data[i]
for i in et_data.columns:
    if 'Unnamed' in i:
        del et_data[i]
en_data = en_data[~pd.isnull(en_data['Nouns'])]
et_data = et_data[~pd.isnull(et_data['Nouns'])]

In [8]:
#Load Word2Vec model from FastText and use it on Nouns

#Detect tokens to import further in Word2Vec
def get_tokens(data_original):
    #input: data_original as dataframe 
    #output: list of unique words from 'Nouns' column
    #delete nans
    data_original = data_original[~pd.isnull(data_original['Nouns'])]
    #make copy
    data = data_original
    #get tokens as list of unique words
    data['tokens'] = data['Nouns'].apply(lambda x: [i for i in x.split(' ')])
    corpora = data['tokens'].tolist()
    tokens = set([item for sublist in corpora for item in sublist])
    return tokens

#read .vec file as Word2Vec model
def load_vectors(fname, corpora_tokens):
    #input: fmane as path to mpdel in vec format, corpora_tokens aslist of words in corpora
    #output: data as model with all nessasary words
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        if tokens[0] in corpora_tokens:
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

#transform sentence to vector
def sentence_vectorization(sentence, model):
    #input: sentence as string, model as pretrained Word2Vec model
    #output: vector of input sentence
    words = tokenization_words(sentence)
    vectors_list = []
    for word in words:
        v = word_vectorization(word, model)
        vectors_list.append(v)
    mean_v = np.mean(vectors_list, axis=0)
    return mean_v

#transform word to vector
def word_vectorization(word, model):
    #input: word as string, model as pretrained Word2Vec model
    #output: vector of input word
    try:
        v = model[word]
    except:
        v = np.zeros(300)
    return v

tokens_en = get_tokens(en_data)
model_en = load_vectors('C:/Users/Olha/Documents/cc.en.300.vec', tokens_en)
en_data['Description_vector'] = en_data['Nouns'].apply(lambda x: sentence_vectorization(x, model_en))
en_data['Tokens_vectors'] = en_data['Nouns'].apply(lambda x: [word_vectorization(i, model_en) for i in tokenization_words(x)])
tokens_et = get_tokens(et_data)
model_et = load_vectors('C:/Users/Olha/Documents/cc.et.300.vec', tokens_et)
et_data['Description_vector'] = et_data['Nouns'].apply(lambda x: sentence_vectorization(x, model_et))
et_data['Tokens_vectors'] = et_data['Nouns'].apply(lambda x: [word_vectorization(i, model_et) for i in tokenization_words(x)])

In [9]:
#use vectors and tokens to make a summary
def clustering_summarization(vectors, sentence, number_of_sentences_summary):
    #input: vectors as list of vectors for input words, sentence as string, number_of_sentences_summary a number of sentences expected to obtain in summary
    #output: summary as string 
    tokens = tokenization_words(sentence)
    kmeans = KMeans(n_clusters = number_of_sentences_summary)
    vectors_to_ndarray = np.array([np.array(x) for x in vectors])
    kmeans = kmeans.fit(vectors_to_ndarray)
    avg = []
    for j in range(number_of_sentences_summary):
        idx = np.where(kmeans.labels_ == j)[0]
        avg.append(np.mean(idx))
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, vectors_to_ndarray)
    ordering = sorted(range(number_of_sentences_summary), key=lambda k: avg[k])
    summary = [tokens[closest[idx]] for idx in ordering]
    return summary

#make summary for each product (to define max 4-5 keywords)
en_data['Summary'] = en_data.apply(lambda x: clustering_summarization(x.Tokens_vectors, x.Nouns, min(4, len(tokenization_words(x.Nouns)))), axis = 1)
et_data['Summary'] = et_data.apply(lambda x: clustering_summarization(x.Tokens_vectors, x.Nouns, min(5, len(tokenization_words(x.Nouns)))), axis = 1)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [10]:
en_data.head()

Unnamed: 0,link,node,text_join,Nouns,Description_vector,Tokens_vectors,Summary
0,<https::nailin.ee::/et/27-akruulvarvid::null::...,node517e8478b3641044f1e2e2bbc31afa,AkruFCuFClvuErv One Stroke Phthalo Green ml,AkruFCuFClvuErv Stroke Phthalo Green ml,"[0.0528, -0.02636, -0.03886, -0.05032, -0.0188...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[Stroke, AkruFCuFClvuErv, Phthalo, ml]"
1,<http::www.kalastussport.ee::/136-taliridvad::...,nodef5d0383018d768c2279c3541adc8f1a,Winter rod AKARA Legend,Winter rod AKARA Legend,"[-0.037375, -0.014125000000000002, 0.0134, 0.0...","[[0.0187, 0.0226, -0.0533, 0.0044, 0.0299, -0....","[Winter, rod, AKARA, Legend]"
2,<http::www.ittgroup.ee::/en/new-products::null...,nodefeb891dd8590e0d85f9c685e4642449,Digital pressure sensor BMP,Digital pressure sensor BMP,"[0.013125000000000005, 0.0393, 0.06575, -0.000...","[[-0.0496, 0.0661, 0.0464, -0.0271, -0.0263, 0...","[Digital, pressure, sensor, BMP]"
3,<http::www.ittgroup.ee::/en/31-converters::nul...,node67109f1f1de73f41d2979d4561221,Analogdigital converter bit ADS,converter bit ADS,"[-0.0014333333333333327, -0.026866666666666667...","[[-0.0345, -0.0349, 0.0286, 0.0241, 0.0228, 0....","[converter, bit, ADS]"
4,<http::www.ittgroup.ee::/en/38-prototyping-and...,node3f349ca61d31b5f8c6bfe56bdb8081,Power Charger VA,Power Charger VA,"[-0.060733333333333334, 0.0726, 0.207433333333...","[[-0.0726, 0.062, 0.1646, -0.0548, -0.0151, 0....","[Power, Charger, VA]"


In [26]:
# Have keywords, need clusters
# Link words with at least one common keywords

def get_unique_keywords(data):
    keywords_series = data.Summary
    all_keywords = []
    for keywords_list in keywords_series:
        for keyword in keywords_list:
            all_keywords.append(keyword)
    unique_keywords = list(set(all_keywords))
    Data = {'Keyword_unique':  unique_keywords, 'Keyword_number': list(range(0, len(unique_keywords)))}
    df = DataFrame (Data, columns = ['Keyword_unique', 'Keyword_number'])
    return df

keywords_en = get_unique_keywords(en_data)
keywords_et = get_unique_keywords(et_data)

In [31]:
def detect_keywords_connection(summary, keywords):
    numbers = []
    for key in summary:
        numbers.append(keywords.Keyword_number[keywords['Keyword_unique'] == key])
    return numbers

en_data['Keywords_numbers'] = en_data.Summary.apply(lambda x: detect_keywords_connection(x, keywords_en))
et_data['Keywords_numbers'] = et_data.Summary.apply(lambda x: detect_keywords_connection(x, keywords_et))

In [32]:
en_data.head()

Unnamed: 0,link,node,text_join,Nouns,Description_vector,Tokens_vectors,Summary,Keywords_numbers
0,<https::nailin.ee::/et/27-akruulvarvid::null::...,node517e8478b3641044f1e2e2bbc31afa,AkruFCuFClvuErv One Stroke Phthalo Green ml,AkruFCuFClvuErv Stroke Phthalo Green ml,"[0.0528, -0.02636, -0.03886, -0.05032, -0.0188...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[Stroke, AkruFCuFClvuErv, Phthalo, ml]","[[2159], [9833], [12269], [14957]]"
1,<http::www.kalastussport.ee::/136-taliridvad::...,nodef5d0383018d768c2279c3541adc8f1a,Winter rod AKARA Legend,Winter rod AKARA Legend,"[-0.037375, -0.014125000000000002, 0.0134, 0.0...","[[0.0187, 0.0226, -0.0533, 0.0044, 0.0299, -0....","[Winter, rod, AKARA, Legend]","[[7892], [9829], [23856], [30916]]"
2,<http::www.ittgroup.ee::/en/new-products::null...,nodefeb891dd8590e0d85f9c685e4642449,Digital pressure sensor BMP,Digital pressure sensor BMP,"[0.013125000000000005, 0.0393, 0.06575, -0.000...","[[-0.0496, 0.0661, 0.0464, -0.0271, -0.0263, 0...","[Digital, pressure, sensor, BMP]","[[23158], [23248], [21563], [28042]]"
3,<http::www.ittgroup.ee::/en/31-converters::nul...,node67109f1f1de73f41d2979d4561221,Analogdigital converter bit ADS,converter bit ADS,"[-0.0014333333333333327, -0.026866666666666667...","[[-0.0345, -0.0349, 0.0286, 0.0241, 0.0228, 0....","[converter, bit, ADS]","[[6799], [636], [5169]]"
4,<http::www.ittgroup.ee::/en/38-prototyping-and...,node3f349ca61d31b5f8c6bfe56bdb8081,Power Charger VA,Power Charger VA,"[-0.060733333333333334, 0.0726, 0.207433333333...","[[-0.0726, 0.062, 0.1646, -0.0548, -0.0151, 0....","[Power, Charger, VA]","[[17862], [22422], [8494]]"


In [None]:
et_data.head()

In [34]:
en_data_copy = en_data
en_data_copy = en_data_copy.drop(columns = ['Description_vector', 'Tokens_vectors'])
en_data_copy.to_csv('Results/en_data_wod2vec_kmeans.csv')

et_data_copy = et_data
et_data_copy = et_data_copy.drop(columns = ['Description_vector', 'Tokens_vectors'])
et_data_copy.to_csv('Results/et_data_wod2vec_kmeans.csv')

In [49]:
#make summary for all products dataset
topics_en = clustering_summarization(en_data.Description_vector, en_data.Nouns, 20)
topics_et = clustering_summarization(et_data.Description_vector, et_data.Nouns, 20)

In [51]:
topics_en

'Vogue VOS N. DS OSRAM XENARC COOL BLUE INTENSE xenon HID pirn. Komplekt KING laud ja tooli. mm Aquamarine F. ASUS VGA PCIE GT GB GDDRGTGD ASUSASUS VGA PCIE GT GB GDDRGTGD ASUS. PRODUCTuABookend homeoffice decorationDETAILS Did tigers patterns meanings sign head symbol foruAa King Chinese Wow tiger Zuny house pieces world name Mateo materials injure animals production mood things lifeuAUSEuAhis books doors decoration table shelf ItuAlooks varietyuAof interiorsuAand WEIGHTuAFEATURESHANDMADEuAand weightLIMITED EDITIONuA pcs family belly name belonginguAto edition AlsouAsome models number auAPASSPORTuAwithuAinformation Zuny FauxuALeatheruAMicro SuedeInside uAuA Iron PelletsuASuperfineuAPolyester FiberCOLOUR uATan Edition pcs MateouAuFuuuBuuuuuuuF uFuuuFuEuu ZunyuCuuuuuuuBuBuuDuuuuu uDuAuEuAuEuu NOTES PLEASE NOTE NOT toy children years SHIPPING INFORMATION costuAuA FREE shipping worldwide Ships touA EU Russia Ukraine Belarus USA PolicyuA EASYuAThis item explanations months accordance Retur

In [60]:
#get indexes of products that formed summary for whole en/et dataset
def get_indexes(data, topics):
    key_indexes = []
    for i in range(len(data)):
        if data['Nouns'][i] in topics:
            key_indexes.append(i)
    return key_indexes

indexes_en = get_indexes(en_data, topics_en)
indexes_et = get_indexes(et_data, topics_et)

KeyError: 149

In [59]:
srting = 'CT Rain Wind Deflectors visors air weather conditions fogging passenger comfort slimline profile CT deflectors model level fit finishCitroen CCrosser'
if srting in topics_en:
    print('yes')

yes
