In [1]:
import pandas as pd
from gensim.models import word2vec
import numpy as np
import re                                  # library for regular expression operations
import string     
import nltk                         # for string operations
nltk.download('stopwords')                 # download the stopwords from NLTK
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
import nltk
import gensim
from gensim.models import word2vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\doguc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df_rew = pd.read_csv('datas/reviews.csv')
df_rew

Unnamed: 0,product_content_id,rate,comment,review_like_count,supplier_id
0,35244093,5,75c göğüse S beden aldım spor için daha da sar...,,968
1,97842293,5,fiyatı yüksek ama çok memnunum,,367
2,86701595,5,yorumlara ve beden tablosuna bakarak 1 beden b...,1.0,191874
3,107237246,3,çok kısa iade ettim. sıradan geldi,,968
4,45019480,5,Renkler daha canlı. Hoş,,968
...,...,...,...,...,...
6176836,53600107,4,Çok güzel beğendim 👍,,637
6176837,42373905,1,pamuk değil sentetik.. ve beklediğimiz ürün de...,,142033
6176838,54086523,5,Kışa hazırlık bu fiyata çok iyi aldık. Fakat ş...,,1188
6176839,95633171,3,Ürün fotografta gözüktüğü gibi fakat kalıbı aş...,1.0,968


In [3]:
# checking number of nan values for per column
df_rew.isna().sum()

product_content_id          0
rate                        0
comment                     0
review_like_count     5525753
supplier_id                 0
dtype: int64

In [4]:
# nan values of review_like_count filled with most repetitive value(0).
df_rew.fillna(df_rew.review_like_count.mode()[0],inplace=True)

In [5]:
# Comment Processing

In [6]:
from TurkishStemmer import TurkishStemmer

In [7]:
#Option-1 : Preprocessing with Turkish words Stemmizer
def preprocess_func_with_turkish_stemmer(name):
    # Instantiate stemming class
    #stemmer = PorterStemmer()
    stemmer = TurkishStemmer()
    #Import the english stop words list from NLTK
    stopwords_english = stopwords.words('turkish')
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # tokenize names
    name_tokens = tokenizer.tokenize(name)
    names_clean = []
    for word in name_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # names_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            if re.findall(r'\w*\d\w*',stem_word) == []:
                names_clean.append(stem_word)
    return names_clean

In [8]:
#Option-2 : Preprocessing without any Stemmizer
def preprocess_func(name):

    stopwords_english = stopwords.words('turkish')
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # tokenize names
    name_tokens = tokenizer.tokenize(name)
    names_clean = []
    for word in name_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # names_clean.append(word)
            #stem_word = stemmer.stem(word)  # stemming word
            if re.findall(r'\w*\d\w*',word) == []:
                names_clean.append(word)
    return names_clean

In [9]:
df_rew.comment.iloc[0]

'75c göğüse S beden aldım spor için daha da sarmasını isterdim fakat xsde küçük olacak gibi'

In [10]:
preprocess_func_with_turkish_stemmer(df_rew.comment.iloc[0])

['göğü',
 's',
 'be',
 'al',
 'spor',
 'sarma',
 'ister',
 'fakat',
 'xsde',
 'küçük',
 'olacak']

In [11]:
preprocess_func(df_rew.comment.iloc[0])

['göğüse',
 's',
 'beden',
 'aldım',
 'spor',
 'sarmasını',
 'isterdim',
 'fakat',
 'xsde',
 'küçük',
 'olacak']

In [12]:
#Preprocessed tokens are merged like sentences and saved in comment_prep column.
def preprocessed_comments(x):
    return ' '.join(preprocess_func(x))

In [13]:
df_rew['comment_prep'] = df_rew.apply(lambda x: preprocessed_comments(x.comment),axis=1)
df_rew.head()

Unnamed: 0,product_content_id,rate,comment,review_like_count,supplier_id,comment_prep
0,35244093,5,75c göğüse S beden aldım spor için daha da sar...,1.0,968,göğüse s beden aldım spor sarmasını isterdim f...
1,97842293,5,fiyatı yüksek ama çok memnunum,1.0,367,fiyatı yüksek memnunum
2,86701595,5,yorumlara ve beden tablosuna bakarak 1 beden b...,1.0,191874,yorumlara beden tablosuna bakarak beden büyük ...
3,107237246,3,çok kısa iade ettim. sıradan geldi,1.0,968,kısa iade ettim sıradan geldi
4,45019480,5,Renkler daha canlı. Hoş,1.0,968,renkler canlı hoş


In [14]:
#df_rew.to_csv('reviews_prep_v1.csv',index=False)

In [15]:
#df_rew = pd.read_csv('reviews_prep_v1.csv')
#df_rew.head(1)

In [16]:
def get_corpus(df,column_name):
    #Some of comments are gone when preprocessing is applied, so nan values are set as '-1'.
    df[column_name].fillna('-1',inplace=True)
    #preprocessed comments are converted to token list for creating a corpus.
    corpus = [tokens.split(' ') for tokens in df[column_name].values]
    return corpus

In [17]:
#get comment orpus for training of word2vec model
comment_corpus = get_corpus(df_rew,'comment_prep')

In [18]:
#In this NLP problem which is actually representation of comments of all products we have, word2vec model is selected. It means that co occurance of tokens
#takes first importance.
#Other ways might be more efficient.(FastText,GloVe,Bert or embedding layer etc.)
model_comment = word2vec.Word2Vec(comment_corpus, vector_size=30, window=3, min_count=2, workers=4)

In [19]:
#It convert comment tokens to vector and then get mean of all token vector of comment to
#get representation of all comment.
def add_word2vec_column(x,model,emded_size):
    temp_list = []
    #df_temp = x.product_name_prep
    for token in x:
        if token in model.wv.key_to_index:
                wordvec = model.wv.get_vector(token)
                temp_list.append(wordvec)
        else:
            temp_list.append([0 for x in range(emded_size)])
    try:
        product_name_vector = np.mean(temp_list,axis=0)
        return product_name_vector
    except:
        print(1)
    

In [20]:
df_rew['embed_comments'] = df_rew.apply(lambda x: add_word2vec_column(x.comment_prep,model_comment,30),axis=1)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [21]:
df_rew.head()

Unnamed: 0,product_content_id,rate,comment,review_like_count,supplier_id,comment_prep,embed_comments
0,35244093,5,75c göğüse S beden aldım spor için daha da sar...,1.0,968,göğüse s beden aldım spor sarmasını isterdim f...,"[-1.3028405054253251, -1.9299605230198187, -1...."
1,97842293,5,fiyatı yüksek ama çok memnunum,1.0,367,fiyatı yüksek memnunum,"[-1.050075696950609, -2.095573910258033, -1.28..."
2,86701595,5,yorumlara ve beden tablosuna bakarak 1 beden b...,1.0,191874,yorumlara beden tablosuna bakarak beden büyük ...,"[-1.0251962876819232, -2.263921338668529, -1.2..."
3,107237246,3,çok kısa iade ettim. sıradan geldi,1.0,968,kısa iade ettim sıradan geldi,"[-0.917388436609301, -2.333944900282498, -1.28..."
4,45019480,5,Renkler daha canlı. Hoş,1.0,968,renkler canlı hoş,"[-0.9707606955267051, -2.207495289690354, -1.1..."


In [22]:
df_rew.columns

Index(['product_content_id', 'rate', 'comment', 'review_like_count',
       'supplier_id', 'comment_prep', 'embed_comments'],
      dtype='object')

In [23]:
df_rew_last = df_rew.loc[:,['product_content_id', 'rate','review_like_count',
       'supplier_id','embed_comments']]
df_rew_last

Unnamed: 0,product_content_id,rate,review_like_count,supplier_id,embed_comments
0,35244093,5,1.0,968,"[-1.3028405054253251, -1.9299605230198187, -1...."
1,97842293,5,1.0,367,"[-1.050075696950609, -2.095573910258033, -1.28..."
2,86701595,5,1.0,191874,"[-1.0251962876819232, -2.263921338668529, -1.2..."
3,107237246,3,1.0,968,"[-0.917388436609301, -2.333944900282498, -1.28..."
4,45019480,5,1.0,968,"[-0.9707606955267051, -2.207495289690354, -1.1..."
...,...,...,...,...,...
6176836,53600107,4,1.0,637,"[-0.8063289279816672, -1.8782661631703377, -1...."
6176837,42373905,1,1.0,142033,"[-1.0078904136654832, -2.0575201953611066, -1...."
6176838,54086523,5,1.0,1188,"[-0.7174545758827167, -2.217012132297863, -0.9..."
6176839,95633171,3,1.0,968,"[-0.5624091052057514, -1.7643309436648724, -0...."


In [25]:
#All features of review datframe is grouped by product content id and get mean of their values including comment vectors.
# In here, products which product content id is common for them has common embedding vector which means common representation.
df_rew_groupby_prod_content = df_rew.groupby(['product_content_id'])['rate','review_like_count','embed_comments'].apply(np.mean)

  df_rew_groupby_prod_content = df_rew.groupby(['product_content_id'])['rate','review_like_count','embed_comments'].apply(np.mean)


In [26]:
df_rew_groupby_prod_content.head(1)

Unnamed: 0_level_0,rate,review_like_count,embed_comments
product_content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
48921,3.333333,1.0,"[-0.8131187948386981, -2.1666808046712247, -1...."


In [27]:
len(df_rew_groupby_prod_content)

462323

In [None]:
df_rew.dropna(inplace=True)

In [None]:
# Embedded comments are converved to dataframe
df_new3 = pd.DataFrame(columns=np.arange(0,30,1),data=list(df_rew.embed_comments))
df_new3

In [None]:
# Embedded dataframe and main comments dataframe are merged.
df_rew = pd.concat([df_rew,df_new3],axis=1)

In [None]:
#Reduntant columns are dropped.
df_rew.drop(columns=['comment','comment_prep','embed_comments'],inplace=True)

In [28]:
#df_rew_groupby_prod_content.reset_index().to_pickle('df_rew_groupby_prod_content_v1.pkl')