In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing import text, sequence
from sklearn.model_selection import StratifiedShuffleSplit

# Seed NumPy's global random generator so that later draws from np.random
# (e.g. np.random.normal) are repeatable from one run to the next.
np.random.seed(42)

Using TensorFlow backend.


In [2]:
# Directory prefixes, relative to the notebook. Each ends with '/' so a file
# name can be appended by plain string concatenation.
INPUT_PATH = '../input/'
# CACHE_PATH holds the processed CSVs read below and the .npz / .npy files written later.
CACHE_PATH = '../cache/'
OUTPUT_PATH = '../output/'

In [3]:
# Load the data: the processed training table first, then the table to predict on.
df_train, df_predict = (
    pd.read_csv(CACHE_PATH + csv_name)
    for csv_name in ('train_processed_all.csv', 'predict_processed_all.csv')
)

In [4]:
# Keep only the rows that have a 'Score' label and renumber them 0..n-1.
df_train = df_train.dropna(subset=['Score']).reset_index(drop=True)

In [5]:
# Share of training rows per score value (count of each score / number of rows).
df_train['Score'].value_counts() / len(df_train)

5.0    0.605600
4.0    0.280627
3.0    0.099018
2.0    0.009191
1.0    0.005564
Name: Score, dtype: float64

In [6]:
# Vocabulary cap: passed as `num_words` to the Keras Tokenizer and used as the
# upper bound on word indices when the embedding matrix is built.
MAX_FEATURES = 20000
# Sequence length: passed as `maxlen` to pad_sequences.
MAX_LEN = 300

In [7]:
# Target: the 'Score' column as floats.
train_y = df_train['Score'].astype(float)
# Inputs: the 'Discuss' text of both tables, cast to str so the tokenizer
# only ever receives strings.
train_X = df_train['Discuss'].astype(str)
predict_X = df_predict['Discuss'].astype(str)

In [8]:
# Fit the vocabulary on the training texts only, then turn both text
# collections into integer sequences padded/truncated to MAX_LEN.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_X)
train_X = sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_X), maxlen=MAX_LEN
)
predict_X = sequence.pad_sequences(
    tokenizer.texts_to_sequences(predict_X), maxlen=MAX_LEN
)

In [9]:
# Sanity check: display the shape of the padded prediction sequences.
predict_X.shape

(50000, 300)

In [10]:
# Stratified split of the data set
def split_data_set(X,y,seed=42):
    """Split X and y into a training part and a 5% validation part, stratified on y.

    Args:
        X: samples; a NumPy array or a pandas Series/DataFrame.
        y: labels used for stratification; a NumPy array or a pandas Series.
        seed: random_state handed to StratifiedShuffleSplit.

    Returns:
        Tuple (train_X, val_X, train_y, val_y); each element has the same
        container type as the corresponding input.
    """
    def _take(data, positions):
        # The splitter yields row positions. Plain [] on a pandas object looks
        # up labels (Series) or columns (DataFrame), which only matches
        # positions when the index happens to be 0..n-1, so use .iloc there.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return data[positions]

    split = StratifiedShuffleSplit(n_splits=1,test_size=0.05,random_state=seed)
    # n_splits=1, so the generator produces exactly one (train, val) pair.
    train_index, val_index = next(split.split(X, y))
    return (_take(X, train_index), _take(X, val_index),
            _take(y, train_index), _take(y, val_index))

In [11]:
# Carve a stratified validation split out of the training data, then cache
# all arrays (including the padded prediction set) in a single .npz archive.
X_train, X_val, y_train, y_val = split_data_set(train_X, train_y)
np.savez(
    CACHE_PATH + 'data.npz',
    X_train=X_train, X_val=X_val,
    y_train=y_train, y_val=y_val,
    X_test=predict_X
)

In [12]:
# Number of components per word vector; used as the column count of the
# embedding matrix.
EMBEDDING_SIZE = 300
# Text file of pre-trained word vectors, read from the cache directory.
EMBEDDING_FILE = CACHE_PATH + 'chinese_vector_300.txt'

In [13]:
def get_coefs(word, *arr):
    """Pair a token with its vector components converted to float32.

    Args:
        word: the token (first field of a parsed embedding row).
        *arr: the remaining fields; anything NumPy can cast to float32.

    Returns:
        Tuple (word, vector) where vector is a float32 ndarray built from arr.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

In [15]:
# Build {word: float32 vector} from the embedding text file.
# The encoding is given explicitly so that reading the tokens does not depend
# on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    embedding_index = dict(
        get_coefs(*fields)
        for fields in map(str.split, f)
        # Keep only rows of the form "<word> v1 ... vN" with N == EMBEDDING_SIZE.
        # This skips blank lines (which would call get_coefs with no word) and
        # header or truncated rows whose vectors could not be stacked with the rest.
        if len(fields) == EMBEDDING_SIZE + 1
    )

In [16]:
def get_embedding_matrix(embedding_index):
    """Build the initial embedding weight matrix for the tokenizer vocabulary.

    Reads the module-level `tokenizer`, `MAX_FEATURES` and `EMBEDDING_SIZE`.

    Row i holds the pre-trained vector of the word whose tokenizer index is i
    when that word is present in `embedding_index`. Every other row keeps a
    random draw from a normal distribution with the mean and standard
    deviation of all pre-trained values.

    Args:
        embedding_index: mapping word -> vector of length EMBEDDING_SIZE.

    Returns:
        ndarray of shape (nb_words, EMBEDDING_SIZE), where
        nb_words = min(MAX_FEATURES, len(tokenizer.word_index) + 1).
    """
    word_index = tokenizer.word_index
    # Keras Tokenizer indices start at 1 (0 is reserved), so a vocabulary of V
    # words needs V + 1 rows. Sizing by V alone makes the highest index fall
    # outside the matrix when the vocabulary is smaller than MAX_FEATURES.
    nb_words = min(MAX_FEATURES, len(word_index) + 1)
    # Random initial matrix matching the statistics of the pre-trained vectors
    vectors = np.asarray(list(embedding_index.values()))
    embedding_mean, embedding_std = vectors.mean(), vectors.std()
    embedding_matrix = np.random.normal(embedding_mean, embedding_std, (nb_words, EMBEDDING_SIZE))
    # Overwrite the rows of words that have a pre-trained vector
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [17]:
# Assemble the embedding weight matrix and cache it as a .npy file.
embedding_matrix = get_embedding_matrix(embedding_index)
np.save(
    CACHE_PATH + 'embedding_matrix.npy',
    embedding_matrix
)