In [1]:
import pandas as pd #导入Pandas
import numpy as np #导入Numpy
import jieba #导入结巴分词
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
import warnings
warnings.filterwarnings("ignore")

import re
from collections import Counter, defaultdict
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import metrics

Using TensorFlow backend.


In [2]:
def build_dataset(words, vocabulary_size = 5000):
    from collections import Counter
    count = [['UNK', -1]]
    count.extend(Counter(words).most_common(vocabulary_size - 1))
    w_dictionary = {}
    for word, _ in count:
        w_dictionary[word] = len(w_dictionary)
    da = list()
    unk_count = 0
    for word in words:
        if word in w_dictionary:
            index = w_dictionary[word]
        else:
            index = 0
            unk_count += 1
        da.append(index)
    count[0][1] = unk_count
    reverse_dictionary = {zip(w_dictionary.values(), w_dictionary.keys())}
    return da, count, w_dictionary, reverse_dictionary

def rmsel(true_label,pred):
    rmse = np.sqrt(metrics.mean_squared_error(true_label, pred))
    return rmse / (1 + rmse)

In [3]:
train = pd.read_csv('../input/train_first.csv')
predict = pd.read_csv('../input/predict_first.csv')
predict['Score'] = -1

data = pd.concat([train, predict])
data.head()

Unnamed: 0,Id,Discuss,Score
0,201e8bf2-77a2-3a98-9fcf-4ce03914e712,好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的,5
1,f4d51947-eac4-3005-9d3c-2f32d6068a2d,新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！,4
2,74aa7ae4-03a4-394c-bee0-5702d3a3082a,庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去,4
3,099661c2-4360-3c49-a2fe-8c783764f7db,个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...,5
4,97ca672d-e558-3542-ba7b-ee719bba1bab,迪斯尼一日游,5


In [4]:
stop_word = []
stop_words_path = '../input/stop_word.txt'
with open(stop_words_path,encoding='utf-8') as f:
    for line in f.readlines():
        stop_word.append(line.strip())
stop_word.append(' ')

def clean_str(stri):
    stri = re.sub(r'[a-zA-Z0-9]+','',stri)
    cut_str = jieba.cut(stri.strip())
    list_str = [word for word in cut_str if word not in stop_word]
    return list_str

In [5]:
data['words'] = data['Discuss'].apply(lambda x : clean_str(x))
data.head()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.342 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,Id,Discuss,Score,words
0,201e8bf2-77a2-3a98-9fcf-4ce03914e712,好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的,5,"[好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]"
1,f4d51947-eac4-3005-9d3c-2f32d6068a2d,新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！,4,"[新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]"
2,74aa7ae4-03a4-394c-bee0-5702d3a3082a,庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去,4,"[庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]"
3,099661c2-4360-3c49-a2fe-8c783764f7db,个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...,5,"[觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄..."
4,97ca672d-e558-3542-ba7b-ee719bba1bab,迪斯尼一日游,5,"[迪斯尼, 一日游]"


In [6]:
# 回归转多分类
y = np_utils.to_categorical(train['Score'])
print(y)

[[0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]


In [7]:
d2v_train = data['words'].copy()
d2v_train.head()

0             [好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]
1               [新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]
2      [庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]
3    [觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...
4                                           [迪斯尼, 一日游]
Name: words, dtype: object

In [8]:
all_words = []
for i in d2v_train:
    all_words.extend(i)
print(all_words[0:100])

['好大', '一个', '游乐', '公园', '已经', '次', '感觉', '玩够', '第三', '第四次', '新', '中国', '成立', '举行', '中国', '人', '来说', '重要', '深刻', '意义', '庐山', '瀑布', '有名', '多个', '瀑布', '最', '好看', '非', '三叠', '泉莫属', '推荐', '一去', '觉得', '颐和园', '北京', '最值', '一起', '地方', '相比', '下', '门票', '最贵', '故宫', '雄伟', '气势磅礴', '颐和园', '宁静', '波光粼粼', '美', '迪斯尼', '一日游', '方便', '看水', '看山', '感受', '古人', '智慧结晶', '秋景', '美丽', '如画', '红黄绿', '相间', '身体', '状况不佳', '人', '来说', '走平路', '接受', '赞', '唯一', '糟点', '周未', '周边游', '景点', '服务', '不错', '排队', '太长', '好玩', '项目', '人', '晚上', '烟火', '一定', '真的', '不错', '做好', '攻', '绍兴', '护城河', '夜游', '感觉', '不错', '一日游', '不错', '选择', '有趣', '荡气回肠', '年', '留下来']


In [9]:
da, count, w_dictionary, reverse_dictionary = build_dataset(all_words, vocabulary_size = len(all_words))
print(count[0:100])

[['UNK', 0], ('好', 19050), ('不错', 17257), ('人', 15168), ('一个', 12281), ('地方', 12238), ('景区', 12129), ('值得', 9652), ('景点', 8966), ('感觉', 8526), ('风景', 8448), ('走', 7870), ('比较', 7247), ('美', 6662), ('里面', 6581), ('门票', 6463), ('景色', 6431), ('真的', 6185), ('看到', 5918), ('说', 5553), ('时间', 5227), ('北京', 5084), ('玩', 5045), ('特别', 4888), ('西湖', 4781), ('方便', 4780), ('最', 4778), ('元', 4773), ('中', 4715), ('喜欢', 4635), ('看看', 4624), ('挺', 4623), ('坐', 4209), ('建筑', 4204), ('中国', 4155), ('小', 4141), ('下', 4133), ('一定', 4020), ('再', 3988), ('历史', 3819), ('爬', 3815), ('小时', 3790), ('一去', 3756), ('里', 3754), ('觉得', 3675), (',', 3653), ('旅游', 3590), ('公园', 3570), ('建议', 3551), ('后', 3473), ('游客', 3455), ('适合', 3426), ('黄山', 3413), ('太', 3394), ('买', 3359), ('有点', 3338), ('特色', 3304), ('年', 3292), ('晚上', 3289), ('吃', 3279), ('位于', 3222), ('很大', 3211), ('想', 3115), ('推荐', 3104), ('一下', 3061), ('一次', 3051), ('现在', 2897), ('排队', 2859), ('游玩', 2835), ('需要', 2673), ('杭州', 2612), ('导游', 2537), ('索道', 252

In [10]:
hhh = pd.DataFrame(pd.Series(all_words).value_counts()) #统计词的出现次数
hhh.head()

hhh['id']=list(range(1,len(hhh)+1))
hhh.head()

Unnamed: 0,0,id
好,19050,1
不错,17257,2
人,15168,3
一个,12281,4
地方,12238,5


In [11]:
def get_sent(x, dictionary):
    encode = []
    for i in x:
        if i in dictionary: encode.append(dictionary[i])
        else: encode.append(0)
    return encode
    
data['sent'] = data['words'].apply(lambda x : get_sent(x, w_dictionary))
data.head()

Unnamed: 0,Id,Discuss,Score,words,sent
0,201e8bf2-77a2-3a98-9fcf-4ce03914e712,好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的,5,"[好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]","[1500, 4, 872, 47, 74, 1228, 9, 21246, 3196, 1..."
1,f4d51947-eac4-3005-9d3c-2f32d6068a2d,新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！,4,"[新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]","[408, 34, 3512, 1714, 34, 3, 238, 308, 726, 505]"
2,74aa7ae4-03a4-394c-bee0-5702d3a3082a,庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去,4,"[庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]","[577, 108, 359, 2299, 108, 26, 157, 1143, 2239..."
3,099661c2-4360-3c49-a2fe-8c783764f7db,个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...,5,"[觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...","[44, 367, 21, 50896, 102, 5, 873, 36, 15, 7858..."
4,97ca672d-e558-3542-ba7b-ee719bba1bab,迪斯尼一日游,5,"[迪斯尼, 一日游]","[1306, 344]"


In [12]:
train_df = data[data['Score'] != -1]
predict_df = data[data['Score'] == -1]
del predict_df['Score']

train_df.head()

Unnamed: 0,Id,Discuss,Score,words,sent
0,201e8bf2-77a2-3a98-9fcf-4ce03914e712,好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的,5,"[好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]","[1500, 4, 872, 47, 74, 1228, 9, 21246, 3196, 1..."
1,f4d51947-eac4-3005-9d3c-2f32d6068a2d,新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！,4,"[新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]","[408, 34, 3512, 1714, 34, 3, 238, 308, 726, 505]"
2,74aa7ae4-03a4-394c-bee0-5702d3a3082a,庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去,4,"[庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]","[577, 108, 359, 2299, 108, 26, 157, 1143, 2239..."
3,099661c2-4360-3c49-a2fe-8c783764f7db,个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...,5,"[觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...","[44, 367, 21, 50896, 102, 5, 873, 36, 15, 7858..."
4,97ca672d-e558-3542-ba7b-ee719bba1bab,迪斯尼一日游,5,"[迪斯尼, 一日游]","[1306, 344]"


In [13]:
predict_df.head()

Unnamed: 0,Id,Discuss,words,sent
0,9a1caf96-681e-3c11-b588-43ac742d7fd2,快乐之旅,"[快乐, 之旅]","[619, 409]"
1,82b450db-65c2-351c-84fb-761d76582680,岛上看日落的地方，视野很开阔，非常漂亮,"[岛上, 日落, 地方, 视野, 开阔, 漂亮]","[405, 732, 5, 1218, 1022, 97]"
2,2eec4606-590c-3fa2-b846-7f92441c54a6,很有鲁迅风味 很喜欢这样有文化的地方,"[鲁迅, 风味, 喜欢, 文化, 地方]","[696, 1196, 29, 73, 5]"
3,509f9a68-ac41-35ff-9d2e-2fc12f73ed7f,去乌鲁木齐还能不去天山天池吗，哈哈哈～,"[乌鲁木齐, 天山, 天池, 哈哈哈]","[7697, 3021, 378, 622]"
4,395f4b22-1c5f-328a-a19d-5065e0530cbc,非常满意，直接拿身份证刷机入园就行了，不用排队买票，比较节约时间,"[满意, 直接, 身份证, 刷机, 入园, 就行了, 不用, 排队, 买票, 比较, 节约,...","[443, 103, 265, 77562, 1112, 2370, 172, 67, 19..."


In [14]:
maxlen = 10
print("Pad sequences (samples x time)")

train_df['sent'] = list(sequence.pad_sequences(train_df['sent'], maxlen=maxlen))
predict_df['sent'] = list(sequence.pad_sequences(predict_df['sent'], maxlen=maxlen))

Pad sequences (samples x time)


In [23]:
nfolds = 5
def training(train_df, train_label, test_df):
    X = np.array(list(train_df['sent']))
    y = np.array(np_utils.to_categorical(train_label))
    T = np.array(list(test_df['sent']))
    folds = list(StratifiedKFold(n_splits=nfolds, random_state=2018, shuffle=True).split(X, train_label.values))
    
    S_train = np.zeros((X.shape[0], 1)) # 训练样本数 * 模型个数
    S_test = np.zeros((T.shape[0], 1))  # 测试集样本数 * 模型个数
    S_test_n = np.zeros((T.shape[0], len(folds))) # 测试集样本数 * n_folds
    
    error = []
    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = X[train_idx] # 训练集特征
        y_train = y[train_idx] # 训练集标签

        X_holdout = X[test_idx] # 待预测的输入
        y_holdout = y[test_idx]
        
        print('Build model...')
        model = Sequential()
        model.add(Embedding(len(w_dictionary) + 1, 256))
        model.add(LSTM(256)) # try using a GRU instead, for fun
        model.add(Dropout(0.5))
        model.add(Dense(6))
        model.add(Activation('softmax'))
        
        model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
        model.fit(X_train, y_train, batch_size=32, nb_epoch=2,validation_data=(X_holdout,y_holdout))
        
        y_true = [np.argmax(i) for i in list(y_holdout)]
        predictions = list(model.predict(X_holdout,batch_size=32))
        y_pred = [np.sum(i * [0, 1, 2, 3, 4, 5]) for i in predictions]
        print('rmse: {}'.format(rmsel(y_true, y_pred)) )
        error.append(rmsel(y_true, y_pred))
        
        submission = list(model.predict(T, batch_size=32))
        sub_pred = [np.sum(i * [0, 1, 2, 3, 4, 5]) for i in submission]
        
        S_train[test_idx] = np.array(y_pred).reshape(-1, 1)
        S_test_n[:, j] = np.array(sub_pred)
        
    S_test[:] = S_test_n.mean(1).reshape(-1, 1)
    return S_train, S_test, round(np.mean(error), 5)

S_train, S_test, error = training(train_df[0:10], train_df['Score'][0:10], predict_df)

Build model...
Train on 7 samples, validate on 3 samples
Epoch 1/2
Epoch 2/2
rmse: 0.6819533514783339
Build model...
Train on 8 samples, validate on 2 samples
Epoch 1/2
Epoch 2/2
rmse: 0.6631396020648007
Build model...
Train on 8 samples, validate on 2 samples
Epoch 1/2
Epoch 2/2
rmse: 0.6710814729784078
Build model...
Train on 9 samples, validate on 1 samples
Epoch 1/2
Epoch 2/2
rmse: 0.705824230022326
Build model...
Train on 8 samples, validate on 2 samples
Epoch 1/2
Epoch 2/2
rmse: 0.6605599464491422


In [24]:
train_out = train_df[['Id']]
train_out['lstm_len_10'] = S_train
train_out.to_csv('../models/__models__/train_lstm_len_10.csv', index = False)

test_out = predict_df['Id']
test_out['lstm_len_10'] = S_test

test_out.to_csv('../models/__models__/test_lstm_len_10.csv', index = False)

print('error: {}'.format(error))

error: 0.67651
