In [123]:
import numpy as np
import re
import pandas as pd
import nltk.data
import gensim
from distutils.version import LooseVersion, StrictVersion
import os
import codecs
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
global word2vec_model


In [105]:
class DataSource(object):
    def _load_raw_data(self,filename, is_train=True):
        a = []
        b = []
        regex = 'train_'
        if not is_train:
            regex = 'test_'
        with open(filename, 'r', encoding="utf8") as file:
            for line in file :
                if regex in line:
                    b.append(a)
                    a = [line]
                elif line!='\n':
                    a.append(line)       
        b.append(a)      
        return b[1:]
    
    def _create_row(self, sample, is_train=True):
        d = {}
        d['id'] = sample[0].replace('\n','')
        review = ""
        if is_train:
            for clause in sample[1:-1]:
                review+= clause.replace('\n','').strip()
            d['label'] = int(sample[-1].replace('\n',''))          
        else:         
            for clause in sample[1:]:
                review+= clause.replace('\n','').strip()
        d['review'] = review
        return d
    
    
    def load_data(self, filename, is_train=True):
        raw_data = self._load_raw_data(filename, is_train)
        lst = []
        for row in raw_data:
            lst.append(self._create_row(row, is_train))
        return lst

In [106]:
#Load stopwords
stopwords_file = 'vietnamese-stopwords.txt'
stopwords = []
with open(stopwords_file, 'r', encoding="utf8") as file:
    for line in file :
        stopwords.append(line.replace('\n','').strip())

In [107]:
vietnamese_chars = "[^a-z0-9A-Z_√Ä√Å√Ç√É√à√â√ä√å√ç√í√ì√î√ï√ô√öƒÇƒêƒ®≈®∆†√†√°√¢√£√®√©√™√¨√≠√≤√≥√¥√µ√π√∫ƒÉƒëƒ©≈©∆°∆ØƒÇ·∫†·∫¢·∫§·∫¶·∫®·∫™·∫¨·∫Æ·∫∞·∫≤·∫¥·∫∂·∫∏·∫∫·∫º·ªÄ·ªÄ·ªÇ∆∞ƒÉ·∫°·∫£·∫•·∫ß·∫©·∫´·∫≠·∫Ø·∫±·∫≥·∫µ·∫∑·∫π·∫ª·∫Ω·ªÅ·ªÅ·ªÉ·ªÑ·ªÜ·ªà·ªä·ªå·ªé·ªê·ªí·ªî·ªñ·ªò·ªö·ªú·ªû·ª†·ª¢·ª§·ª¶·ª®·ª™·ªÖ·ªá·ªâ·ªã·ªç·ªè·ªë·ªì·ªï·ªó·ªô·ªõ·ªù·ªü·ª°·ª£·ª•·ªß·ª©·ª´·ª¨·ªÆ·ª∞·ª≤·ª¥√ù·ª∂·ª∏·ª≠·ªØ·ª±·ª≥·ªµ·ª∑·ªπ]"
def review_wordlist(review, remove_stopwords= False):
    review_text = str(review)
    # 2. Removing non-letter.
    review_text = re.sub(vietnamese_chars," ",review_text)
    # 3. Converting to lower case and splitting
    words = review_text.lower().split()
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords)     
        words = [w for w in words if not w in stops]
    return(words)

In [110]:
ds = DataSource()
train_data = pd.DataFrame(ds.load_data('dataset/train.crash'))
test_data = pd.DataFrame(ds.load_data('dataset/test.crash', is_train=False))
train_data['review'] = train_data['review'].fillna("none")
test_data['review'] = test_data['review'].fillna("none")

In [111]:
train_data.head()

Unnamed: 0,id,label,review
0,train_000000,0,"""Dung dc sp tot cam onshop ƒê√≥ng g√≥i s·∫£n ph·∫©m r..."
1,train_000001,0,""" Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi . Son m·ªãn nh∆∞n..."
2,train_000002,0,""" Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi nh∆∞ng k c√≥ h·ªôp..."
3,train_000003,1,""":(( M√¨nh h∆°i th·∫•t v·ªçng 1 ch√∫t v√¨ m√¨nh ƒë√£ k·ª≥ v..."
4,train_000004,1,"""L·∫ßn tr∆∞·ªõc m√¨nh mua √°o gi√≥ m√†u h·ªìng r·∫•t ok m√† ..."


In [77]:
import emoji

def extract_emojis(str):
    return [c for c in str if c in emoji.UNICODE_EMOJI]

good_df = train_data[train_data['label'] == 0]
good_comment = good_df['review'].values
good_emoji = []
for c in good_comment:
      good_emoji += extract_emojis(c)

good_emoji = np.unique(np.asarray(good_emoji))


bad_df = train_data[train_data['label'] == 1]
bad_comment = bad_df['review'].values

bad_emoji = []
for c in bad_comment:
    bad_emoji += extract_emojis(c)

bad_emoji = np.unique(np.asarray(bad_emoji))

In [124]:
model = './word2vec/wiki.vi.model.bin'
#Load word2vec model
if os.path.isfile(model):
    print ('Loading word2vec model ...')
if LooseVersion(gensim.__version__) >= LooseVersion("1.0.1"):
    from gensim.models import KeyedVectors
    word2vec_model = KeyedVectors.load_word2vec_format(model, binary=True)
else:
    from gensim.models import Word2Vec
    word2vec_model = Word2Vec.load_word2vec_format(model, binary=True)
word2vec_model.wv.syn0.shape



2019-02-20 16:00:47,010 : INFO : loading projection weights from ./word2vec/wiki.vi.model.bin


Loading word2vec model ...


2019-02-20 16:01:04,044 : INFO : loaded (231486, 400) matrix from ./word2vec/wiki.vi.model.bin
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


(231486, 400)

In [125]:
output = []
try:
    sim_list = word2vec_model.most_similar("th√≠ch")
    print(sim_list)
    #output = word2vec_model.most_similar('u' + '\"' + 'A' + '\"', topn=5)

    for wordsimilar in sim_list:
        # output[wordsimilar[0]] = wordsimilar[1]
        output.append(wordsimilar[0] + ' - '+ str(wordsimilar[1]))
except:
    print('except')
print(output)

2019-02-20 16:01:31,856 : INFO : precomputing L2-norms of word weight vectors


[('t·ªÖu', 0.5424543023109436), ('chu·ªông', 0.5406407117843628), ('tho√≤ng', 0.5046262741088867), ('ch·ªông', 0.4467487633228302), ('hydrophilic', 0.44052204489707947), ('√∞√°ng', 0.3866061270236969), ('karlspreis', 0.3651212751865387), ('thik', 0.35866597294807434), ('c√∫n', 0.3559001088142395), ('ghi', 0.3471173048019409)]
['t·ªÖu - 0.5424543023109436', 'chu·ªông - 0.5406407117843628', 'tho√≤ng - 0.5046262741088867', 'ch·ªông - 0.4467487633228302', 'hydrophilic - 0.44052204489707947', '√∞√°ng - 0.3866061270236969', 'karlspreis - 0.3651212751865387', 'thik - 0.35866597294807434', 'c√∫n - 0.3559001088142395', 'ghi - 0.3471173048019409']


In [126]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [127]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [130]:
import emoji
def extract_emojis(str):
    return [c for c in str if c in emoji.UNICODE_EMOJI]
emojis_vocab = []
for r in train_data['review']:
    emojis_vocab += extract_emojis(r)
emojis_vocab = np.unique(np.asarray(emojis_vocab))

In [131]:
def getEmojiBowFeatures(reviews,vocab):
    bow_emoji_features = []
    for r in reviews:
        emojis = extract_emojis(r)
        bag_vector = np.zeros(len(vocab))
        # print(emojis_bow)
        for e in emojis_bow:
            for i,emojii in enumerate(emojis):
                if emojii == e: 
                    bag_vector[i] += 1
        bow_emoji_features.append(bag_vector)
    return np.asarray(bow_emoji_features)

In [132]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(train_data.review, train_data.label, test_size=0.2,
    random_state=42)

In [134]:
num_features = 400
clean_train_reviews = []
for review in x_train:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=False))
bow_train_features = getEmojiBowFeatures(x_train, emojis_vocab)

In [138]:
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, word2vec_model, num_features)

Review 0 of 11260


  


KeyboardInterrupt: 

In [51]:
# Calculating average feature vactors for test set     
clean_test_reviews = []
for review in x_val:
    clean_test_reviews.append(review_wordlist(review))
bow_train_features = getEmojiBowFeatures(x_val, emojis_vocab)

In [53]:
testDataVecs = getAvgFeatureVecs(clean_test_reviews, word2vec_model, num_features)

Review 0 of 4827


  
  app.launch_new_instance()


Review 1000 of 4827
Review 2000 of 4827
Review 3000 of 4827
Review 4000 of 4827


In [55]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)

In [58]:
df = pd.DataFrame(trainDataVecs)
forest.fit(pd.DataFrame(trainDataVecs).fillna(0), y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [59]:
y_predict = forest.predict(pd.DataFrame(testDataVecs).fillna(0))

In [60]:
from sklearn.metrics import accuracy_score

accuracy_score(y_val, y_predict)

0.8288792210482702

In [61]:
from sklearn import svm

In [63]:
clf = svm.SVC(gamma='scale',verbose=True)
clf.fit(pd.DataFrame(trainDataVecs).fillna(0), y_train)


[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [64]:
y_predict = clf.predict(pd.DataFrame(testDataVecs).fillna(0))


In [65]:
accuracy_score(y_val, y_predict)

0.8713486637663145

In [120]:
test_data

Unnamed: 0,id,review
0,test_000000,"""Ch∆∞a d√πng th·ª≠ n√™n ch∆∞a bi·∫øt"""
1,test_000001,""" Kh√¥ng ƒë√°ng ti·ªÅnV√¨ ngay ƒë·ª£t sale n√™n m·ªõi mua ..."
2,test_000002,"""C√°m ∆°n shop. ƒê√≥ng g√≥i s·∫£n ph·∫©m r·∫•t ƒë·∫πp v√† ch·∫Ø..."
3,test_000003,"""V·∫£i ƒë·∫πp.phom oki lu√¥n.qu√° ∆∞ng"""
4,test_000004,"""Chu·∫©n h√†ng ƒë√≥ng g√≥i ƒë·∫πp"""
5,test_000005,""" ƒê√≥ng g√≥i s·∫£n ph·∫©m r·∫•t ƒë·∫πp v√† ch·∫Øc ch·∫Øn Shop ..."
6,test_000006,"""Sau khi ƒëoÃ£c xong cu√¥ÃÅn truy√™Ã£n thiÃÄ caÃâm xuÃÅ..."
7,test_000007,"""Ch·ªâ c·∫£m ·ª©ng khi g·∫ßn d√¢y ƒëi·ªán ·ªï c·∫Øm ko c√≥ v·∫≠t ..."
8,test_000008,"""T·ªáüò° S·∫£n ph·∫©m ƒë·ª©t ch·ªâ t√πm lumüò° R√°ch qu√° tr·ªùi c..."
9,test_000009,"""Shop Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m r·∫•t k√©m Shop ph·ª•c v..."


In [66]:
# Calculating average feature vactors for test set     
clean_real_test_reviews = []
for review in test_data['review']:
    clean_real_test_reviews.append(review_wordlist(review))
    
realTestDataVecs = getAvgFeatureVecs(clean_real_test_reviews, word2vec_model, num_features)

Review 0 of 10981


  
  app.launch_new_instance()


Review 1000 of 10981
Review 2000 of 10981
Review 3000 of 10981
Review 4000 of 10981
Review 5000 of 10981
Review 6000 of 10981
Review 7000 of 10981
Review 8000 of 10981
Review 9000 of 10981
Review 10000 of 10981


In [67]:
y_predict = clf.predict(pd.DataFrame(realTestDataVecs).fillna(0))


In [70]:
test_data['label'] = y_predict
test_data[['id','label']].to_csv('predictions.csv',index=False)