In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

import jieba
import re
from sklearn.model_selection import StratifiedKFold,KFold
import random

import fasttext

In [2]:
# Locations of the labelled training file and of the file to be scored.
data_path = './YNU.EDU2018-ScenicWord/train_first.csv'
test_data_path = './YNU.EDU2018-ScenicWord/predict_first.csv'

# Both CSVs carry their column names in the first row.
df = pd.read_csv(data_path, encoding='utf8', header=0)
test_df = pd.read_csv(test_data_path, encoding='utf8', header=0)

# 数据清洗

In [3]:
# Remove rows whose 'Discuss' text repeats an earlier row, keeping the first
# occurrence. This mutates df in place, so the original index labels of the
# surviving rows are kept (no reset_index here).
df.drop_duplicates(subset='Discuss', keep='first',inplace=True)

In [4]:
# Stop-word list used by clean_str: one entry per line of the file, with
# surrounding whitespace stripped.
stop_words_path = 'stopWordList.txt'
stop_word = []
with open(stop_words_path, encoding='utf8') as f:
    stop_word.extend(line.strip() for line in f)
# A single space is treated as a stop word too, so space tokens are dropped.
stop_word.append(' ')

def clean_str(stri):
    """Normalise one review text into space-separated tokens.

    Runs of ASCII letters and digits are removed, the stripped remainder is
    segmented with ``jieba.cut``, tokens present in the module-level
    ``stop_word`` list are discarded, and the surviving tokens are joined
    with single spaces.

    Parameters
    ----------
    stri : str
        Raw text.

    Returns
    -------
    str
        The cleaned, tokenised text (may be the empty string).
    """
    without_ascii = re.sub(r'[a-zA-Z0-9]+', '', stri)
    tokens = jieba.cut(without_ascii.strip())
    return ' '.join(tok for tok in tokens if tok not in stop_word)

In [None]:
# NOTE(review): this cell repeats the stop-word loading already done in the
# cell that defines clean_str; running it rebuilds the same list.
stop_words_path = 'stopWordList.txt'
stop_word = []
with open(stop_words_path, encoding='utf8') as f:
    stop_word.extend(line.strip() for line in f)
# A single space is treated as a stop word too.
stop_word.append(' ')

In [5]:
# Tokenise and clean the review text of both the training and the test frame.
df['Discuss'] = df['Discuss'].map(clean_str)
test_df['Discuss'] = test_df['Discuss'].map(clean_str)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/s5/cxqfsb816f36h8gp706x5kjm0000gn/T/jieba.cache
Loading model cost 1.200 seconds.
Prefix dict has been built succesfully.


In [6]:
def fillnull(x):
    """Replace an empty string with the placeholder token '空白'.

    Any value that does not compare equal to ``''`` is returned unchanged.
    """
    return '空白' if x == '' else x

In [7]:
# Texts that became empty after cleaning get the placeholder token.
df['Discuss'] = df['Discuss'].map(fillnull)
test_df['Discuss'] = test_df['Discuss'].map(fillnull)

# 下采样，数据增强

In [22]:
# df_1 = df.loc[df['Score']==1, :]
# df_2 = df.loc[df['Score']==2, :]

# df = pd.concat([df,df_1,df_2],ignore_index=True)

In [8]:
# Index labels of the training rows, grouped by their Score value.
score_5_idx = df.index[(df['Score'] == 5).values].tolist()
score_4_idx = df.index[(df['Score'] == 4).values].tolist()
score_3_idx = df.index[(df['Score'] == 3).values].tolist()
score_2_idx = df.index[(df['Score'] == 2).values].tolist()
score_1_idx = df.index[(df['Score'] == 1).values].tolist()

In [9]:
def spilt_sample(sample,n=4):
    """Split ``sample`` into ``n`` consecutive chunks of equal length.

    Parameters
    ----------
    sample : sequence
        Any sliceable sequence (e.g. a list of index labels).
    n : int, default 4
        Number of chunks to produce.

    Returns
    -------
    list
        ``n`` slices of ``sample``, each of length ``len(sample) // n``.
        When the length is not a multiple of ``n`` the trailing
        ``len(sample) % n`` elements are not part of any chunk.

    Raises
    ------
    ZeroDivisionError
        If ``n`` is 0.
    """
    # Integer floor division instead of int(1/n * len): the float product can
    # land just below the exact quotient (1/49*49 == 0.9999999999999999) and
    # then truncate to a chunk length that is one too small.
    sub_lenth = len(sample) // n
    return [sample[i * sub_lenth:(i + 1) * sub_lenth] for i in range(n)]

In [10]:
# The three majority classes (scores 5, 4 and 3) are each cut into chunks.
score_5_sample = spilt_sample(score_5_idx)
score_4_sample = spilt_sample(score_4_idx)
score_3_sample = spilt_sample(score_3_idx)

# Each training subset takes one chunk of every majority class plus all rows
# scored 2 and 1, then shuffles the combined index list in place.
df1_idx = score_5_sample[0] + score_4_sample[0] + score_3_sample[0] + score_2_idx + score_1_idx
random.shuffle(df1_idx)

df2_idx = score_5_sample[1] + score_4_sample[1] + score_3_sample[1] + score_2_idx + score_1_idx
random.shuffle(df2_idx)

df3_idx = score_5_sample[2] + score_4_sample[2] + score_3_sample[2] + score_2_idx + score_1_idx
random.shuffle(df3_idx)

df4_idx = score_5_sample[3] + score_4_sample[3] + score_3_sample[3] + score_2_idx + score_1_idx
random.shuffle(df4_idx)

In [11]:
# Materialise the four training subsets and shuffle their rows once more.
df1 = df.loc[df1_idx, :].sample(frac=1)
df2 = df.loc[df2_idx, :].sample(frac=1)
df3 = df.loc[df3_idx, :].sample(frac=1)
df4 = df.loc[df4_idx, :].sample(frac=1)

# 辅助函数，评测函数

In [12]:
def fasttext_data(data,label):
    """Write samples to ``train.txt`` in fastText supervised-training format.

    Every output line is the sample text, a tab, and ``__label__`` followed
    by the label converted with ``int``.

    Parameters
    ----------
    data : sequence of str
        Sample texts, indexable by position ``0 .. len(label) - 1``.
    label : sequence of numbers
        One label per sample.

    Returns
    -------
    str
        The path of the written file, always ``'train.txt'`` (relative to
        the current working directory; an existing file is overwritten).
    """
    lines = [data[i] + "\t__label__" + str(int(label[i])) for i in range(len(label))]
    # Explicit UTF-8: the texts are Chinese and the platform default encoding
    # is not guaranteed to be able to encode them.
    with open('train.txt', 'w', encoding='utf8') as f:
        for line in lines:
            f.write(line)
            f.write('\n')
    return 'train.txt'

def get_predict(pred):
    """Collapse per-class weights into one score per row.

    Each row of ``pred`` is multiplied element-wise by the score values
    ``[1, 2, 3, 4, 5]`` and summed.

    Parameters
    ----------
    pred : iterable of array-like
        One length-5 row per sample.

    Returns
    -------
    numpy.ndarray
        One summed value per row of ``pred``.
    """
    weights = np.array([1, 2, 3, 4, 5])
    return np.array([np.sum(row * weights) for row in pred])

def rmsel(true_label,pred):
    """Score predictions as ``1 / (1 + RMSE)``.

    Parameters
    ----------
    true_label : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, same length as ``true_label``.

    Returns
    -------
    float
        ``1 / (1 + rmse)`` where ``rmse`` is the square root of the summed
        squared differences divided by ``len(true_label)``; 1.0 means a
        perfect match and the value decreases as the error grows.
    """
    actual = np.array(true_label)
    residual = actual - np.array(pred)
    rmse = np.sqrt(np.sum(residual * residual) / len(actual))
    return 1 / (1 + rmse)

# 交叉验证，预测

In [13]:
def fast_cv(df):
    """Run 5-fold fastText training on ``df`` and predict the test set.

    For every fold a classifier is trained on the fold's training part
    (written to disk through ``fasttext_data``), the ``rmsel`` score on the
    held-out part is printed, and the module-level ``test_df`` is scored.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame with a 'Discuss' text column and a 'Score' column.

    Returns
    -------
    numpy.ndarray
        For each row of ``test_df``, the prediction averaged over the folds.
    """
    def expected_scores(proba_result):
        # Every entry is a sequence of (label, probability) pairs; the
        # prediction is the probability-weighted sum of the integer labels.
        return [sum([int(sco) * proba for sco, proba in pairs]) for pairs in proba_result]

    df = df.reset_index(drop=True)
    X = df['Discuss'].values
    y = df['Score'].values
    splitter = KFold(n_splits=5, shuffle=True, random_state=2017)

    fold_test_preds = []
    for train_index, test_index in list(splitter.split(X, y)):
        y_test = y[test_index]
        train_file = fasttext_data(X[train_index], y[train_index])

        classifier = fasttext.supervised(train_file,'model.model',lr=0.01,dim=128,label_prefix="__label__")

        # Validation score on the held-out part of this fold.
        held_out_texts = df.loc[test_index,'Discuss'].tolist()
        held_out_pred = expected_scores(classifier.predict_proba(held_out_texts,k=5))
        print(rmsel(y_test,held_out_pred))

        # Prediction for the real test set with this fold's model.
        test_result = classifier.predict_proba(test_df['Discuss'].tolist(),k=5)
        fold_test_preds.append(expected_scores(test_result))

    return np.mean(np.array(fold_test_preds), axis=0)

In [14]:
# Cross-validate on the first down-sampled subset; the printed values are the
# per-fold rmsel scores, the result is the fold-averaged test prediction.
test_pred1 = fast_cv(df1)

0.5159818233439385
0.5107258832584286
0.5154059219529723
0.5126544899661535
0.5117747128599764


In [15]:
# Same procedure on the second down-sampled subset.
test_pred2 = fast_cv(df2)

0.5130471396054775
0.5164329659090641
0.5088290295262752
0.5090103346795625
0.5118612888354229


In [16]:
# Same procedure on the third down-sampled subset.
test_pred3 = fast_cv(df3)

0.5134732474077905
0.516805905973566
0.5145942325833824
0.5085949394977739
0.5100220581413013


In [17]:
# Same procedure on the fourth down-sampled subset.
test_pred4 = fast_cv(df4)

0.5109065178925176
0.5086759353224652
0.5171876267346569
0.5026640705846672
0.5187304797353075


# 提交

In [18]:
# Submission frame: the test ids next to the fold-averaged prediction of each
# of the four down-sampled models. It is built directly from its columns;
# a zero-filled placeholder frame would have every column overwritten anyway.
sub_df = pd.DataFrame({
    'Id': test_df['Id'].values,
    'fast1': test_pred1,
    'fast2': test_pred2,
    'fast3': test_pred3,
    'fast4': test_pred4,
})

In [19]:
# Ensemble of the four down-sampled models. The prediction columns are
# selected explicitly so that 'Id' never takes part in the row mean: a numeric
# Id would silently skew it, and a non-numeric Id makes a whole-frame mean
# raise on pandas versions that no longer drop non-numeric columns.
sub_df['mean'] = sub_df[['fast1', 'fast2', 'fast3', 'fast4']].mean(axis=1)

In [20]:
# Cross-validate on the full (not down-sampled) training frame and store its
# fold-averaged test prediction as a separate column.
test_pred = fast_cv(df)
sub_df['mean2'] = test_pred

0.5790714226277386
0.5781726996518566
0.5767399630845129
0.582091618025645
0.5804736020315329


In [21]:
# Summary statistics of the prediction columns, to compare the models.
sub_df.describe()

Unnamed: 0,fast1,fast2,fast3,fast4,mean,mean2
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,4.266991,4.269612,4.276665,4.277559,4.272707,4.401865
std,0.331754,0.327128,0.329702,0.328225,0.328536,0.300022
min,2.058204,2.144532,2.064844,2.206251,2.118458,1.385547
25%,4.03711,4.046097,4.051563,4.051563,4.046754,4.240236
50%,4.289064,4.290822,4.298633,4.29961,4.294141,4.416798
75%,4.524999,4.520704,4.528126,4.532032,4.523755,4.606641
max,4.932812,4.910547,4.930859,4.917188,4.913867,4.989454


In [22]:
# Predictions above 4.7 are snapped to the top score 5; all other values are
# kept as predicted.
pred = sub_df['mean2'].values
sub_df['mean2'] = pred = np.where(pred > 4.7, 5, pred)

In [23]:
# Write the submission file: only the Id and the final prediction column,
# with header=None and index=False (i.e. no column names and no row index
# are requested in the output).
sub_df[['Id','mean2']].to_csv('fastsub2.csv',header=None,index=False)