In [163]:
import numpy as np
import re
import pandas as pd
import nltk.data
import gensim
from distutils.version import LooseVersion, StrictVersion
import os
import codecs
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
global word2vec_model
from deepai_nlp.tokenization.crf_tokenizer import CrfTokenizer
from deepai_nlp.tokenization.utils import preprocess_text
from sklearn.feature_extraction.text import TfidfVectorizer



https://colab.research.google.com/github/ngxbac/aivivn_phanloaisacthaibinhluan/blob/master/baseline_lgbm_tfidf.ipynb#scrollTo=ojmynVfZtFRE

In [164]:
class DataSource(object):
    def _load_raw_data(self,filename, is_train=True):
        a = []
        b = []
        regex = 'train_'
        if not is_train:
            regex = 'test_'
        with open(filename, 'r', encoding="utf8") as file:
            for line in file :
                if regex in line:
                    b.append(a)
                    a = [line]
                elif line!='\n':
                    a.append(line)       
        b.append(a)      
        return b[1:]
    
    def _create_row(self, sample, is_train=True):
        d = {}
        d['id'] = sample[0].replace('\n','')
        review = ""
        if is_train:
            for clause in sample[1:-1]:
                review+= clause.replace('\n','').strip()
            d['label'] = int(sample[-1].replace('\n',''))          
        else:         
            for clause in sample[1:]:
                review+= clause.replace('\n','').strip()
        d['review'] = review
        return d
    
    
    def load_data(self, filename, is_train=True):
        raw_data = self._load_raw_data(filename, is_train)
        lst = []
        for row in raw_data:
            lst.append(self._create_row(row, is_train))
        return lst

In [165]:
#Load stopwords
stopwords_file = 'vietnamese-stopwords.txt'
stopwords = []
with open(stopwords_file, 'r', encoding="utf8") as file:
    for line in file :
        stopwords.append(line.replace('\n','').strip())

In [166]:
mapping = {
    "ship": "vận chuyển",
    "shop": "cửa hàng",
    "sp": "sản phẩm",
    r"\bm\b": " mình",
    "mik": "mình",
    r"\bk\b": "không",
    r"\bkh\b": "không",
    r"\btl\b": "trả lời",
    r"\br\b": "rồi",
    "fb": "mạng xã hội ", # facebook
    "face": "mạng xã hội",
    "thanks": "cảm ơn",
    "thank": "cảm ơn",
    "tks": "cảm ơn", 
    r"\bdc\b": "được",
    r"\bok\b": "tốt",
    r"\bdt\b": "điện thoại",
    r"\bh\b": "giờ"
}

In [167]:
tokenizer = CrfTokenizer()
vietnamese_chars = "[^a-zA-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ]"
def review_wordlist(review, remove_stopwords= False):
    review_text = str(review)
    # 2. Removing non-letter.
    review_text = re.sub(vietnamese_chars," ",review_text)
    # 3. Converting to lower case and splitting
    review_text = review_text.lower()
    for key, value in mapping.items():
        review_text = re.sub(key,value,review_text)
    words = tokenizer.tokenize(review_text)
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords)     
        words = [w for w in words if not w in stops]
    return(words)

In [168]:
ds = DataSource()
train_data = pd.DataFrame(ds.load_data('dataset/train.crash'))
test_data = pd.DataFrame(ds.load_data('dataset/test.crash', is_train=False))
train_data['review'] = train_data['review'].fillna("none")
test_data['review'] = test_data['review'].fillna("none")

In [169]:
train_data.head()

Unnamed: 0,id,label,review
0,train_000000,0,"""Dung dc sp tot cam onshop Đóng gói sản phẩm r..."
1,train_000001,0,""" Chất lượng sản phẩm tuyệt vời . Son mịn nhưn..."
2,train_000002,0,""" Chất lượng sản phẩm tuyệt vời nhưng k có hộp..."
3,train_000003,1,""":(( Mình hơi thất vọng 1 chút vì mình đã kỳ v..."
4,train_000004,1,"""Lần trước mình mua áo gió màu hồng rất ok mà ..."


In [297]:
model = './word2vec/wiki.vi.model.bin'
#Load word2vec model
if os.path.isfile(model):
    print ('Loading word2vec model ...')
if LooseVersion(gensim.__version__) >= LooseVersion("1.0.1"):
    from gensim.models import KeyedVectors
    word2vec_model = KeyedVectors.load_word2vec_format(model, binary=True)
else:
    from gensim.models import Word2Vec
    word2vec_model = Word2Vec.load_word2vec_format(model, binary=True)
word2vec_model.wv.syn0.shape



Loading word2vec model ...


2019-02-22 05:52:13,180 : INFO : loading projection weights from ./word2vec/wiki.vi.model.bin
2019-02-22 05:52:17,748 : INFO : loaded (231486, 400) matrix from ./word2vec/wiki.vi.model.bin
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


(231486, 400)

In [173]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [174]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [248]:
import emoji
def extract_emojis(str):
    return [c for c in str if c in emoji.UNICODE_EMOJI]
emojis_vocab = []
for r in train_data['review']:
    emojis_vocab += extract_emojis(r)
emojis_vocab = np.unique(np.asarray(emojis_vocab))

In [256]:
def getEmojiBowFeatures(reviews,vocab):
    bow_emoji_features = []
    for r in reviews:
        emojis = extract_emojis(r)
        bag_vector = np.zeros(len(vocab))
        #print(emojis)
        for e in emojis:
            for i,emojii in enumerate(emojis_vocab):
                if emojii == e: 
                    bag_vector[i] += 1
        bow_emoji_features.append(bag_vector)
    return np.asarray(bow_emoji_features)

#getEmojiBowFeatures("💚💚😑💕💕", emojis_vocab)

In [298]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
num_features = 400#100
clean_train_reviews = []
for review in train_data.review:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=False))
bow_train_features = getEmojiBowFeatures(train_data.review, emojis_vocab)
bow_train_features = sc.fit_transform(bow_train_features)
#print(bow_train_features.shape)
length = np.asarray([len(r) for r in train_data.review])

In [299]:
X_train = getAvgFeatureVecs(clean_train_reviews, word2vec_model, num_features)

Review 0 of 16087


  
  app.launch_new_instance()


Review 1000 of 16087
Review 2000 of 16087
Review 3000 of 16087
Review 4000 of 16087
Review 5000 of 16087
Review 6000 of 16087
Review 7000 of 16087
Review 8000 of 16087
Review 9000 of 16087
Review 10000 of 16087
Review 11000 of 16087
Review 12000 of 16087
Review 13000 of 16087
Review 14000 of 16087
Review 15000 of 16087
Review 16000 of 16087


In [283]:
length = length.reshape(16087,1)

In [300]:
X_train = np.concatenate((X_train,bow_train_features),axis=1)
#X_train = np.concatenate((X_train,length),axis=1)

y_train = train_data.label


In [207]:
# Calculating average feature vactors for test set     
clean_test_reviews = []
for review in test_data.review:
    clean_test_reviews.append(review_wordlist(review))
bow_test_features = getEmojiBowFeatures(test_data.review, emojis_vocab)
bow_test_features = sc.fit_transform(bow_test_features)


In [229]:
X_test = getAvgFeatureVecs(clean_test_reviews, word2vec_model, num_features)


Review 0 of 10981


  del sys.path[0]
  app.launch_new_instance()


Review 1000 of 10981
Review 2000 of 10981
Review 3000 of 10981
Review 4000 of 10981
Review 5000 of 10981
Review 6000 of 10981
Review 7000 of 10981
Review 8000 of 10981
Review 9000 of 10981
Review 10000 of 10981


In [270]:
X_test = np.concatenate((X_test,bow_test_features),axis=1)
X_test = np.concatenate((X_test,length),axis=1)


ValueError: all the input arrays must have same number of dimensions

In [286]:
X_train.shape

(16087, 226)

In [287]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.2,
    random_state=42)

forest = RandomForestClassifier(n_estimators = 100)
forest.fit(pd.DataFrame(XX_train).fillna(0), yy_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [288]:
y_predict = forest.predict(pd.DataFrame(X_val).fillna(0))
accuracy_score(y_val, y_predict)

0.8104412678682411

In [303]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.2,
    random_state=42)
sc = StandardScaler()
#XX_train = sc.fit_transform(XX_train)
#X_test = sc.transform(X_test)
clf = svm.SVC(gamma='scale',verbose=True)
clf.fit(pd.DataFrame(XX_train).fillna(0), yy_train)
y_predict = clf.predict(pd.DataFrame(X_val).fillna(0))
accuracy_score(y_val, y_predict)

[LibSVM]

0.8505282784338098

In [302]:
XX_train.shape

(12869, 525)

In [None]:
import lightgbm as lgb

d_train = lgb.Dataset(df, label=y_train)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 80
params['min_data'] = 50
params['max_depth'] = 20
clf = lgb.train(params, d_train, 100)

In [216]:
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

In [215]:
y_train = train_data.label

In [292]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import lightgbm as lgb
import gc

print("Starting LightGBM. Train shape: {}, test shape: {}".format(X_train.shape, X_test.shape))

# Cross validation model
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)

# Create arrays and dataframes to store results
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])

# k-fold
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    print("Fold %s" % (n_fold))
    train_x, train_y = X_train[train_idx], y_train[train_idx]
    valid_x, valid_y = X_train[valid_idx], y_train[valid_idx]

    # set data structure
    lgb_train = lgb.Dataset(train_x,
                            label=train_y,
                            free_raw_data=False)
    lgb_test = lgb.Dataset(valid_x,
                           label=valid_y,
                           free_raw_data=False)

    params = {
        'objective' :'binary',
        'learning_rate' : 0.01,
        'num_leaves' : 76,
        'feature_fraction': 0.64, 
        'bagging_fraction': 0.8, 
        'bagging_freq':1,
        'boosting_type' : 'gbdt',
    }

    reg = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        valid_names=['train', 'valid'],
        num_boost_round=10000,
        verbose_eval=100,
        early_stopping_rounds=100,
        feval=lgb_f1_score
    )

    oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
    sub_preds += reg.predict(X_test, num_iteration=reg.best_iteration) / folds.n_splits

    del reg, train_x, train_y, valid_x, valid_y
    gc.collect()

Starting LightGBM. Train shape: (16087, 226), test shape: (10981, 350)
Fold 0


  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.448914	train's f1: 0.858152	valid's binary_logloss: 0.502944	valid's f1: 0.756535
[200]	train's binary_logloss: 0.347329	train's f1: 0.893712	valid's binary_logloss: 0.440639	valid's f1: 0.779528
[300]	train's binary_logloss: 0.284272	train's f1: 0.919402	valid's binary_logloss: 0.411849	valid's f1: 0.790167
[400]	train's binary_logloss: 0.23836	train's f1: 0.940111	valid's binary_logloss: 0.394463	valid's f1: 0.793017
[500]	train's binary_logloss: 0.202597	train's f1: 0.95693	valid's binary_logloss: 0.38381	valid's f1: 0.796013
[600]	train's binary_logloss: 0.173818	train's f1: 0.97022	valid's binary_logloss: 0.376507	valid's f1: 0.799858
[700]	train's binary_logloss: 0.150022	train's f1: 0.980539	valid's binary_logloss: 0.371869	valid's f1: 0.80427
[800]	train's binary_logloss: 0.130127	train's f1: 0.988078	valid's binary_logloss: 0.369203	valid's f1: 0.804826
[900]	train's binary_logloss: 

KeyboardInterrupt: 

In [57]:
threshold = 0.5
preds = (sub_preds > threshold).astype(np.uint8)

In [59]:
test_data['label'] = preds
test_data[['id','label']].to_csv('predictions.csv',index=False)

In [44]:
# Calculating average feature vactors for test set     
clean_real_test_reviews = []
for review in test_data['review']:
    clean_real_test_reviews.append(review_wordlist(review))
    
realTestDataVecs = getAvgFeatureVecs(clean_real_test_reviews, word2vec_model, num_features)


Review 0 of 10981


  
  app.launch_new_instance()


Review 1000 of 10981
Review 2000 of 10981
Review 3000 of 10981
Review 4000 of 10981
Review 5000 of 10981
Review 6000 of 10981
Review 7000 of 10981
Review 8000 of 10981
Review 9000 of 10981
Review 10000 of 10981


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [53]:
bow_train_features = getEmojiBowFeatures(test_data['review'], emojis_vocab)
realTestDataVecs = np.concatenate((realTestDataVecs,bow_train_features),axis=1)

In [54]:
X_test = realTestDataVecs

In [67]:
y_predict = clf.predict(pd.DataFrame(realTestDataVecs).fillna(0))


In [70]:
test_data['label'] = y_predict
test_data[['id','label']].to_csv('predictions.csv',index=False)