In [1]:
import numpy as np
import re
import pandas as pd
import nltk.data
import gensim
from distutils.version import LooseVersion, StrictVersion
import os
import codecs
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
global word2vec_model
from deepai_nlp.tokenization.crf_tokenizer import CrfTokenizer
from deepai_nlp.tokenization.utils import preprocess_text
from sklearn.feature_extraction.text import TfidfVectorizer





# Heading

https://colab.research.google.com/github/ngxbac/aivivn_phanloaisacthaibinhluan/blob/master/baseline_lgbm_tfidf.ipynb#scrollTo=ojmynVfZtFRE

In [2]:
class DataSource(object):
    def _load_raw_data(self,filename, is_train=True):
        a = []
        b = []
        regex = 'train_'
        if not is_train:
            regex = 'test_'
        with open(filename, 'r', encoding="utf8") as file:
            for line in file :
                if regex in line:
                    b.append(a)
                    a = [line]
                elif line!='\n':
                    a.append(line)       
        b.append(a)      
        return b[1:]
    
    def _create_row(self, sample, is_train=True):
        d = {}
        d['id'] = sample[0].replace('\n','')
        review = ""
        if is_train:
            for clause in sample[1:-1]:
                review+= clause.replace('\n','').strip()
            d['label'] = int(sample[-1].replace('\n',''))          
        else:         
            for clause in sample[1:]:
                review+= clause.replace('\n','').strip()
        d['review'] = review
        return d
    
    
    def load_data(self, filename, is_train=True):
        raw_data = self._load_raw_data(filename, is_train)
        lst = []
        for row in raw_data:
            lst.append(self._create_row(row, is_train))
        return lst

In [32]:

mapping = {
    "ship": "v·∫≠n chuy·ªÉn",
    "shop": "c·ª≠a h√†ng",
    "sp": "s·∫£n ph·∫©m",
    "m": " m√¨nh",
    "mik": "m√¨nh",
    "k": "kh√¥ng",
    "kh": "kh√¥ng",
    "tl": "tr·∫£ l·ªùi",
    "r": "r·ªìi",
    "fb": "m·∫°ng x√£ h·ªôi ", # facebook
    "face": "m·∫°ng x√£ h·ªôi",
    "thanks": "c·∫£m ∆°n",
    "thank": "c·∫£m ∆°n",
    "tks": "c·∫£m ∆°n", 
    "dc": "ƒë∆∞·ª£c",
    "ok": "t·ªët",
    "dt": "ƒëi·ªán tho·∫°i",
    "h": "gi·ªù",
    "hsd": "h·∫°n s·ª≠ d·ª•ng",
    "trc": "tr∆∞·ªõc",
    "oki": "t·ªët",
    "ad": "c·ª≠a h√†ng"
}
for i, m in enumerate(mapping):
    mapping[i] = m.strip().replace(' ','_')
#Load stopwords
stopwords_file = 'vietnamese-stopwords.txt'
stopwords = []
with open(stopwords_file, 'r', encoding="utf8") as file:
    for line in file :
        stopwords.append(line.replace('\n','').strip().replace(' ','_'))
tokenizer = CrfTokenizer()
vietnamese_chars = "[^a-zA-Z_√Ä√Å√Ç√É√à√â√ä√å√ç√í√ì√î√ï√ô√öƒÇƒêƒ®≈®∆†√†√°√¢√£√®√©√™√¨√≠√≤√≥√¥√µ√π√∫ƒÉƒëƒ©≈©∆°∆ØƒÇ·∫†·∫¢·∫§·∫¶·∫®·∫™·∫¨·∫Æ·∫∞·∫≤·∫¥·∫∂·∫∏·∫∫·∫º·ªÄ·ªÄ·ªÇ∆∞ƒÉ·∫°·∫£·∫•·∫ß·∫©·∫´·∫≠·∫Ø·∫±·∫≥·∫µ·∫∑·∫π·∫ª·∫Ω·ªÅ·ªÅ·ªÉ·ªÑ·ªÜ·ªà·ªä·ªå·ªé·ªê·ªí·ªî·ªñ·ªò·ªö·ªú·ªû·ª†·ª¢·ª§·ª¶·ª®·ª™·ªÖ·ªá·ªâ·ªã·ªç·ªè·ªë·ªì·ªï·ªó·ªô·ªõ·ªù·ªü·ª°·ª£·ª•·ªß·ª©·ª´·ª¨·ªÆ·ª∞·ª≤·ª¥√ù·ª∂·ª∏·ª≠·ªØ·ª±·ª≥·ªµ·ª∑·ªπ]"

RuntimeError: dictionary changed size during iteration

In [9]:
stopwords

['a_l√¥',
 'a_ha',
 'ai',
 'ai_ai',
 'ai_n·∫•y',
 'ai_ƒë√≥',
 'al√¥',
 'amen',
 'anh',
 'anh_·∫•y',
 'ba',
 'ba_ba',
 'ba_b·∫£n',
 'ba_c√πng',
 'ba_h·ªç',
 'ba_ng√†y',
 'ba_ng√¥i',
 'ba_tƒÉng',
 'bao_gi·ªù',
 'bao_l√¢u',
 'bao_nhi√™u',
 'bao_n·∫£',
 'bay_bi·∫øn',
 'bi·∫øt',
 'bi·∫øt_bao',
 'bi·∫øt_bao_nhi√™u',
 'bi·∫øt_ch·∫Øc',
 'bi·∫øt_ch·ª´ng_n√†o',
 'bi·∫øt_m√¨nh',
 'bi·∫øt_m·∫•y',
 'bi·∫øt_th·∫ø',
 'bi·∫øt_tr∆∞·ªõc',
 'bi·∫øt_vi·ªác',
 'bi·∫øt_ƒë√¢u',
 'bi·∫øt_ƒë√¢u_ch·ª´ng',
 'bi·∫øt_ƒë√¢u_ƒë·∫•y',
 'bi·∫øt_ƒë∆∞·ª£c',
 'bu·ªïi',
 'bu·ªïi_l√†m',
 'bu·ªïi_m·ªõi',
 'bu·ªïi_ng√†y',
 'bu·ªïi_s·ªõm',
 'b√†',
 'b√†_·∫•y',
 'b√†i',
 'b√†i_b√°c',
 'b√†i_b·ªè',
 'b√†i_c√°i',
 'b√°c',
 'b√°n',
 'b√°n_c·∫•p',
 'b√°n_d·∫°',
 'b√°n_th·∫ø',
 'b√¢y_b·∫©y',
 'b√¢y_ch·ª´',
 'b√¢y_gi·ªù',
 'b√¢y_nhi√™u',
 'b√®n',
 'b√©ng',
 'b√™n',
 'b√™n_b·ªã',
 'b√™n_c√≥',
 'b√™n_c·∫°nh',
 'b√¥ng',
 'b∆∞·ªõc',
 'b∆∞·ªõc_kh·ªèi',
 'b∆∞·ªõc_t·ªõi',
 'b∆∞·ªõc_ƒëi',
 'b·∫°n',
 'b·∫£n',
 'b·∫£n_b·ªô',
 'b·∫£n_ri√™ng',


In [29]:
tokenizer = CrfTokenizer()
def clean_text(review, char_reg, stopwords, mapping):
    review_text = str(review)
    # 2. Removing non-letter.
    review_text = re.sub(char_reg," ",review_text)
    # 3. Converting to lower case and splitting
    review_text = review_text.lower()
    # 4. Subtitute words
    #words = review_text.split()
    words = tokenizer.tokenize(review_text)
    print(words)
    # 5. Remove stopwords
    #words = [w.replace('_',' ') for w in words]
    stops = set(stopwords)  
    #print(stops)
    words = [w for w in words if not w in stops]
    print(words)
    for i,w in enumerate(words):
        if w in mapping:
            words[i]= mapping[w]
    review_text = ' '.join(words)
    return(review_text)

In [30]:
clean_text("ch∆∞a d√πng th·ª≠ n√™n ch∆∞a bi·∫øt",char_reg = vietnamese_chars, mapping =mapping, stopwords = stopwords)

Loading model from file C:\ProgramData\Anaconda3\lib\site-packages\deepai_nlp-0.0.1-py3.7.egg\deepai_nlp\models/pretrained_tokenizer.crfsuite
['ch∆∞a', 'd√πng', 'th·ª≠', 'n√™n', 'ch∆∞a', 'bi', 't']
['th·ª≠', 'bi', 't']


'th·ª≠ bi t'

In [17]:
ds = DataSource()
train_data = pd.DataFrame(ds.load_data('dataset/train.crash'))
test_data = pd.DataFrame(ds.load_data('dataset/test.crash', is_train=False))
#train_data['review'] = train_data['review'].fillna("none")
#test_data['review'] = test_data['review'].fillna("none")
df = pd.concat([train_data,test_data], axis=0, sort=False)
df['review_cleaned'] = df['review'].apply(lambda s: clean_text(s,char_reg = vietnamese_chars, mapping =mapping, stopwords = stopwords))
df['num_words'] = df['review'].apply(lambda s: len(s.split()))
df['num_unique_words'] = df['review'].apply(lambda s: len(set(w for w in s.split())))
df['words_vs_unique'] = df['num_unique_words'] / df['num_words'] * 100


Loading model from file C:\ProgramData\Anaconda3\lib\site-packages\deepai_nlp-0.0.1-py3.7.egg\deepai_nlp\models/pretrained_tokenizer.crfsuite


In [18]:
df.head(100)

Unnamed: 0,id,label,review,review_cleaned,num_words,num_unique_words,words_vs_unique
0,train_000000,0.0,"""Dung dc sp tot cam onshop ƒê√≥ng g√≥i s·∫£n ph·∫©m r...",dung ƒë∆∞·ª£c s·∫£n ph·∫©m tot cam onshop ƒë√≥ng_g√≥i s·∫£n...,21,19,90.476190
1,train_000001,0.0,""" Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi . Son m·ªãn nh∆∞n...",ch·∫•t_l∆∞·ª£ng s·∫£n_ph·∫©m tuy·ªát_v·ªùi son m·ªãn ƒë√°nh m√†u...,19,19,100.000000
2,train_000002,0.0,""" Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi nh∆∞ng k c√≥ h·ªôp...",ch·∫•t_l∆∞·ª£ng s·∫£n_ph·∫©m tuy·ªát_v·ªùi kh√¥ng h·ªôp kh√¥ng ...,19,15,78.947368
3,train_000003,1.0,""":(( M√¨nh h∆°i th·∫•t v·ªçng 1 ch√∫t v√¨ m√¨nh ƒë√£ k·ª≥ v...",h∆°i th·∫•t_v·ªçng ch√∫t k·ª≥_v·ªçng s√°ch hi_v·ªçng h·ªçc_t·∫≠...,114,92,80.701754
4,train_000004,1.0,"""L·∫ßn tr∆∞·ªõc m√¨nh mua √°o gi√≥ m√†u h·ªìng r·∫•t ok m√† ...",mua √°o_gi√≥ m√†u h·ªìng t·ªët ƒë·ª£t giao √°o_gi√≥ ch·∫•t v...,26,24,92.307692
5,train_000005,0.0,""" Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi c√≥ ƒëi·ªÅu kh√¥ng ...",ch·∫•t_l∆∞·ª£ng s·∫£n_ph·∫©m tuy·ªát_v·ªùi c·ª©ng_c√°p c·ªë_ƒë·ªãnh...,23,22,95.652174
6,train_000006,0.0,"""ƒê√£ nh·∫≠n ƒëc h√†ng r·∫•t nhanh m·ªõi ƒë·∫∑t bu·ªïi t·ªëi m√†...",ƒëc h√†ng t·ªëi tr∆∞a mai ƒë√≥ng_g√≥i s·∫£n_ph·∫©m ƒë·∫πp c·ª≠a...,31,29,93.548387
7,train_000007,1.0,"""C√°c si√™u ph·∫©m th·∫•y c·∫•u h√¨nh to√†n t·ª±a t·ª±a nhau...",si√™u ph·∫©m c·∫•u_h√¨nh to√†n t·ª±a t·ª±a ko ƒë·ªôt_ph√° n√¢n...,48,44,91.666667
8,train_000008,0.0,"""H√†ng ship nhanh ch·∫•t l∆∞·ª£ng t·ªët t∆∞ v·∫•n nhi·ªát...",h√†ng v·∫≠n chuy·ªÉn ch·∫•t_l∆∞·ª£ng t∆∞_v·∫•n nhi·ªát_t√¨nh v...,20,19,95.000000
9,train_000009,1.0,"""ƒê·ªìng h·ªì ƒë·∫πp nh∆∞ng 1 c√°i ƒë·ª©t d√¢y 1 c√°i k ch·∫°y...",ƒë·ªìng_h·ªì ƒë·∫πp ƒë·ª©t d√¢y kh√¥ng ch·∫°y mua ve s·ª≠a,16,14,87.500000


In [19]:
df.to_csv('cleaned_data.csv', encoding = "utf8", index = False)

In [None]:
train_df = df[~df['label'].isnull()]
test_df = df[df['label'].isnull()]

train_comments = train_df['comment'].fillna("none").values
test_comments = test_df['comment'].fillna("none").values

y_train = train_df['label'].values

In [24]:
X_train = []
for review in train_data.review:
    X_train.append(clean_text(review, char_reg = vietnamese_chars, stopwords = stopwords))
    

KeyboardInterrupt: 

In [None]:
X_train

In [None]:

X_train = tfidf.fit_transform(train_data.review)
X_test = tfidf.transform(test_comments)

In [299]:
X_train = getAvgFeatureVecs(clean_train_reviews, word2vec_model, num_features)

Review 0 of 16087


  
  app.launch_new_instance()


Review 1000 of 16087
Review 2000 of 16087
Review 3000 of 16087
Review 4000 of 16087
Review 5000 of 16087
Review 6000 of 16087
Review 7000 of 16087
Review 8000 of 16087
Review 9000 of 16087
Review 10000 of 16087
Review 11000 of 16087
Review 12000 of 16087
Review 13000 of 16087
Review 14000 of 16087
Review 15000 of 16087
Review 16000 of 16087


In [283]:
length = length.reshape(16087,1)

In [300]:
X_train = np.concatenate((X_train,bow_train_features),axis=1)
#X_train = np.concatenate((X_train,length),axis=1)

y_train = train_data.label


In [207]:
# Calculating average feature vactors for test set     
clean_test_reviews = []
for review in test_data.review:
    clean_test_reviews.append(review_wordlist(review))
bow_test_features = getEmojiBowFeatures(test_data.review, emojis_vocab)
bow_test_features = sc.fit_transform(bow_test_features)


In [229]:
X_test = getAvgFeatureVecs(clean_test_reviews, word2vec_model, num_features)


Review 0 of 10981


  del sys.path[0]
  app.launch_new_instance()


Review 1000 of 10981
Review 2000 of 10981
Review 3000 of 10981
Review 4000 of 10981
Review 5000 of 10981
Review 6000 of 10981
Review 7000 of 10981
Review 8000 of 10981
Review 9000 of 10981
Review 10000 of 10981


In [270]:
X_test = np.concatenate((X_test,bow_test_features),axis=1)
X_test = np.concatenate((X_test,length),axis=1)


ValueError: all the input arrays must have same number of dimensions

In [286]:
X_train.shape

(16087, 226)

In [287]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.2,
    random_state=42)

forest = RandomForestClassifier(n_estimators = 100)
forest.fit(pd.DataFrame(XX_train).fillna(0), yy_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [288]:
y_predict = forest.predict(pd.DataFrame(X_val).fillna(0))
accuracy_score(y_val, y_predict)

0.8104412678682411

In [303]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.2,
    random_state=42)
sc = StandardScaler()
#XX_train = sc.fit_transform(XX_train)
#X_test = sc.transform(X_test)
clf = svm.SVC(gamma='scale',verbose=True)
clf.fit(pd.DataFrame(XX_train).fillna(0), yy_train)
y_predict = clf.predict(pd.DataFrame(X_val).fillna(0))
accuracy_score(y_val, y_predict)

[LibSVM]

0.8505282784338098

In [302]:
XX_train.shape

(12869, 525)

In [None]:
import lightgbm as lgb

d_train = lgb.Dataset(df, label=y_train)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 80
params['min_data'] = 50
params['max_depth'] = 20
clf = lgb.train(params, d_train, 100)

In [216]:
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

In [215]:
y_train = train_data.label

In [292]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import lightgbm as lgb
import gc

print("Starting LightGBM. Train shape: {}, test shape: {}".format(X_train.shape, X_test.shape))

# Cross validation model
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)

# Create arrays and dataframes to store results
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])

# k-fold
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    print("Fold %s" % (n_fold))
    train_x, train_y = X_train[train_idx], y_train[train_idx]
    valid_x, valid_y = X_train[valid_idx], y_train[valid_idx]

    # set data structure
    lgb_train = lgb.Dataset(train_x,
                            label=train_y,
                            free_raw_data=False)
    lgb_test = lgb.Dataset(valid_x,
                           label=valid_y,
                           free_raw_data=False)

    params = {
        'objective' :'binary',
        'learning_rate' : 0.01,
        'num_leaves' : 76,
        'feature_fraction': 0.64, 
        'bagging_fraction': 0.8, 
        'bagging_freq':1,
        'boosting_type' : 'gbdt',
    }

    reg = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        valid_names=['train', 'valid'],
        num_boost_round=10000,
        verbose_eval=100,
        early_stopping_rounds=100,
        feval=lgb_f1_score
    )

    oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
    sub_preds += reg.predict(X_test, num_iteration=reg.best_iteration) / folds.n_splits

    del reg, train_x, train_y, valid_x, valid_y
    gc.collect()

Starting LightGBM. Train shape: (16087, 226), test shape: (10981, 350)
Fold 0


  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.448914	train's f1: 0.858152	valid's binary_logloss: 0.502944	valid's f1: 0.756535
[200]	train's binary_logloss: 0.347329	train's f1: 0.893712	valid's binary_logloss: 0.440639	valid's f1: 0.779528
[300]	train's binary_logloss: 0.284272	train's f1: 0.919402	valid's binary_logloss: 0.411849	valid's f1: 0.790167
[400]	train's binary_logloss: 0.23836	train's f1: 0.940111	valid's binary_logloss: 0.394463	valid's f1: 0.793017
[500]	train's binary_logloss: 0.202597	train's f1: 0.95693	valid's binary_logloss: 0.38381	valid's f1: 0.796013
[600]	train's binary_logloss: 0.173818	train's f1: 0.97022	valid's binary_logloss: 0.376507	valid's f1: 0.799858
[700]	train's binary_logloss: 0.150022	train's f1: 0.980539	valid's binary_logloss: 0.371869	valid's f1: 0.80427
[800]	train's binary_logloss: 0.130127	train's f1: 0.988078	valid's binary_logloss: 0.369203	valid's f1: 0.804826
[900]	train's binary_logloss: 

KeyboardInterrupt: 

In [57]:
threshold = 0.5
preds = (sub_preds > threshold).astype(np.uint8)

In [59]:
test_data['label'] = preds
test_data[['id','label']].to_csv('predictions.csv',index=False)

In [44]:
# Calculating average feature vactors for test set     
clean_real_test_reviews = []
for review in test_data['review']:
    clean_real_test_reviews.append(review_wordlist(review))
    
realTestDataVecs = getAvgFeatureVecs(clean_real_test_reviews, word2vec_model, num_features)


Review 0 of 10981


  
  app.launch_new_instance()


Review 1000 of 10981
Review 2000 of 10981
Review 3000 of 10981
Review 4000 of 10981
Review 5000 of 10981
Review 6000 of 10981
Review 7000 of 10981
Review 8000 of 10981
Review 9000 of 10981
Review 10000 of 10981


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [53]:
bow_train_features = getEmojiBowFeatures(test_data['review'], emojis_vocab)
realTestDataVecs = np.concatenate((realTestDataVecs,bow_train_features),axis=1)

In [54]:
X_test = realTestDataVecs

In [67]:
y_predict = clf.predict(pd.DataFrame(realTestDataVecs).fillna(0))


In [70]:
test_data['label'] = y_predict
test_data[['id','label']].to_csv('predictions.csv',index=False)