In [9]:
import numpy as np
import re
import pandas as pd
import nltk.data

In [10]:
class DataSource(object):

    def _load_raw_data(self,filename, is_train=True):
        
        a = []
        b = []
        
        regex = 'train_'
        if not is_train:
            regex = 'test_'
        
        with open(filename, 'r', encoding="utf8") as file:
            for line in file :
                if regex in line:
                    b.append(a)
                    a = [line]
                elif line!='\n':
                    a.append(line)
                    
        b.append(a)      
        
        return b[1:]
    
    
    def _create_row(self, sample, is_train=True):
        
        d = {}
        d['id'] = sample[0].replace('\n','')
        review = ""
        
        if is_train:
            for clause in sample[1:-1]:
                review+= clause.replace('\n','').strip()
            d['label'] = int(sample[-1].replace('\n',''))          
        else:         
            for clause in sample[1:]:
                review+= clause.replace('\n','').strip()
        
        d['review'] = review
        
        return d
    
    
    def load_data(self, filename, is_train=True):
        
        raw_data = self._load_raw_data(filename, is_train)
        lst = []
        
        for row in raw_data:
            lst.append(self._create_row(row, is_train))
            
        return lst
    

In [24]:
def review_wordlist(review):
    review_text = str(review)
    # 2. Removing non-letter.
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    # 3. Converting to lower case and splitting
    words = review_text.lower().split()
    return(words)

In [25]:
ds = DataSource()
train_data = pd.DataFrame(ds.load_data('dataset/train.crash'))
test_data = pd.DataFrame(ds.load_data('dataset/test.crash', is_train=False))

In [26]:
train_data

Unnamed: 0,id,label,review
0,train_000000,0,"""Dung dc sp tot cam onshop Đóng gói sản phẩm r..."
1,train_000001,0,""" Chất lượng sản phẩm tuyệt vời . Son mịn nhưn..."
2,train_000002,0,""" Chất lượng sản phẩm tuyệt vời nhưng k có hộp..."
3,train_000003,1,""":(( Mình hơi thất vọng 1 chút vì mình đã kỳ v..."
4,train_000004,1,"""Lần trước mình mua áo gió màu hồng rất ok mà ..."
5,train_000005,0,""" Chất lượng sản phẩm tuyệt vời có điều không ..."
6,train_000006,0,"""Đã nhận đc hàng rất nhanh mới đặt buổi tối mà..."
7,train_000007,1,"""Các siêu phẩm thấy cấu hình toàn tựa tựa nhau..."
8,train_000008,0,"""Hàng ship nhanh chất lượng tốt tư vấn nhiệt..."
9,train_000009,1,"""Đồng hồ đẹp nhưng 1 cái đứt dây 1 cái k chạy..."


In [27]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [28]:
import gensim
from distutils.version import LooseVersion, StrictVersion
import os
import codecs
global word2vec_model



In [29]:
model = './word2vec/wiki.vi.model.bin'

if os.path.isfile(model):
    print ('Loading word2vec model ...')
if LooseVersion(gensim.__version__) >= LooseVersion("1.0.1"):
    from gensim.models import KeyedVectors
    word2vec_model = KeyedVectors.load_word2vec_format(model, binary=True)
    print('loose')
else:
    from gensim.models import Word2Vec
    word2vec_model = Word2Vec.load_word2vec_format(model, binary=True)
    print('strict')



2019-02-19 02:05:44,993 : INFO : loading projection weights from ./word2vec/wiki.vi.model.bin


Loading word2vec model ...


2019-02-19 02:05:50,232 : INFO : loaded (231486, 400) matrix from ./word2vec/wiki.vi.model.bin


loose


In [40]:
word2vec_model.wv.syn0.shape

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


(231486, 400)

In [31]:
try:
    sim_list = word2vec_model.most_similar(query)
    print(sim_list)
    #output = word2vec_model.most_similar('u' + '\"' + 'A' + '\"', topn=5)
    output = []
    for wordsimilar in sim_list:
        # output[wordsimilar[0]] = wordsimilar[1]
        output.append(wordsimilar[0] + ' - '+ str(wordsimilar[1]))
except:
    print('except')
print(output)

2019-02-19 02:05:52,943 : INFO : precomputing L2-norms of word weight vectors


[('mèo', 0.660437822341919), ('thỏ', 0.6416544914245605), ('lợn', 0.5719792246818542), ('sói', 0.5173439979553223), ('cún', 0.4996679723262787), ('ngựa', 0.49387115240097046), ('cừu', 0.4883502721786499), ('gấu', 0.4873932898044586), ('chồn', 0.48664021492004395), ('dê', 0.4811408519744873)]
['mèo - 0.660437822341919', 'thỏ - 0.6416544914245605', 'lợn - 0.5719792246818542', 'sói - 0.5173439979553223', 'cún - 0.4996679723262787', 'ngựa - 0.49387115240097046', 'cừu - 0.4883502721786499', 'gấu - 0.4873932898044586', 'chồn - 0.48664021492004395', 'dê - 0.4811408519744873']


In [32]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [33]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [52]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(train_data.review, train_data.label, test_size=0.3,
    random_state=42)

In [67]:
num_features = 400
clean_train_reviews = []
for review in x_train:
    clean_train_reviews.append(review_wordlist(review))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, word2vec_model, num_features)

Review 0 of 11260


  
  app.launch_new_instance()


Review 1000 of 11260
Review 2000 of 11260
Review 3000 of 11260
Review 4000 of 11260
Review 5000 of 11260
Review 6000 of 11260
Review 7000 of 11260
Review 8000 of 11260
Review 9000 of 11260
Review 10000 of 11260
Review 11000 of 11260


In [66]:
# Calculating average feature vactors for test set     
clean_test_reviews = []
for review in x_val:
    clean_test_reviews.append(review_wordlist(review))
    
testDataVecs = getAvgFeatureVecs(clean_test_reviews, word2vec_model, num_features)

Review 0 of 4827


  
  app.launch_new_instance()


Review 1000 of 4827
Review 2000 of 4827
Review 3000 of 4827
Review 4000 of 4827


In [48]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)

In [77]:
trainDataVecs

array([[ 0.07030577, -0.5106581 ,  1.3010975 , ..., -0.6133558 ,
        -0.14455178, -0.02068147],
       [-0.3468156 ,  0.34537446,  0.58451986, ..., -0.6792826 ,
         0.5661435 , -1.1725346 ],
       [-0.89693165, -0.22562678,  0.73319334, ...,  0.6456311 ,
        -0.4851318 , -0.36625028],
       ...,
       [-0.2616497 , -0.25497115,  0.87429756, ..., -0.00272737,
         0.26674172, -0.16582051],
       [ 0.37995565, -0.06219149,  0.5032856 , ..., -0.19829117,
         0.10717956, -0.08671689],
       [ 0.2505056 , -0.19075295,  1.4641299 , ...,  0.09442744,
        -0.29018992, -0.15566795]], dtype=float32)

In [92]:
np.all(np.isfinite(trainDataVecs))
np.any(np.isnan(trainDataVecs))
df = pd.DataFrame(trainDataVecs)



True

In [97]:
df = df.fillna(0)

In [98]:
forest.fit(df, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [102]:
y_predict = forest.predict(pd.DataFrame(testDataVecs).fillna(0))

In [107]:
from sklearn.metrics import accuracy_score

accuracy_score(y_val, y_predict)

0.7681789931634556

In [114]:
from sklearn import svm

In [117]:
clf = svm.SVC(gamma='scale',verbose=True)
clf.fit(df, y_train)


[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [118]:
y_predict = clf.predict(pd.DataFrame(testDataVecs).fillna(0))


In [119]:
accuracy_score(y_val, y_predict)

0.7973896830329397

In [120]:
test_data

Unnamed: 0,id,review
0,test_000000,"""Chưa dùng thử nên chưa biết"""
1,test_000001,""" Không đáng tiềnVì ngay đợt sale nên mới mua ..."
2,test_000002,"""Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắ..."
3,test_000003,"""Vải đẹp.phom oki luôn.quá ưng"""
4,test_000004,"""Chuẩn hàng đóng gói đẹp"""
5,test_000005,""" Đóng gói sản phẩm rất đẹp và chắc chắn Shop ..."
6,test_000006,"""Sau khi đọc xong cuốn truyện thì cảm xú..."
7,test_000007,"""Chỉ cảm ứng khi gần dây điện ổ cắm ko có vật ..."
8,test_000008,"""Tệ😡 Sản phẩm đứt chỉ tùm lum😡 Rách quá trời c..."
9,test_000009,"""Shop Chất lượng sản phẩm rất kém Shop phục v..."


In [121]:
# Calculating average feature vactors for test set     
clean_test_reviews = []
for review in test_data['review']:
    clean_test_reviews.append(review_wordlist(review))
    
realTestDataVecs = getAvgFeatureVecs(clean_test_reviews, word2vec_model, num_features)

Review 0 of 10981


  
  app.launch_new_instance()


Review 1000 of 10981
Review 2000 of 10981
Review 3000 of 10981
Review 4000 of 10981
Review 5000 of 10981
Review 6000 of 10981
Review 7000 of 10981
Review 8000 of 10981
Review 9000 of 10981
Review 10000 of 10981


In [122]:
y_predict = clf.predict(pd.DataFrame(realTestDataVecs).fillna(0))


In [123]:
test_data['label'] = y_predict
test_data[['id','label']].to_csv('sample.csv',index=False)