In [1]:
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
import polars as pl#和pandas类似,但是处理大型数据集有更好的性能.
#用于对一组元素计数,一个存在默认值的字典,访问不存在的值时抛出的是默认值
from collections import Counter,defaultdict
import re#用于正则表达式提取
from scipy.stats import skew, kurtosis#统计分析和概率分布导入偏度和峰度
import gc#垃圾回收模块

#model
from lightgbm import LGBMRegressor#导入lgbm回归器
from catboost import CatBoostRegressor#catboost回归器

#KFold是直接分成k折,StratifiedKFold还要考虑每种类别的占比
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import MinMaxScaler#用最大值和最小值进行归一化操作(x-min)/(max-min)

#设置随机种子,保证模型可以复现
import random
seed=2023
np.random.seed(seed)
random.seed(seed)

import warnings#避免一些可以忽略的报错
warnings.filterwarnings('ignore')#filterwarnings()方法是用于设置警告过滤器的方法，它可以控制警告信息的输出方式和级别。

In [2]:
import sys
sys.version

'3.11.5 (main, Sep 11 2023, 08:31:25) [Clang 14.0.6 ]'

In [3]:
#交叉验证的折数
num_folds = 5
#融合模型的权重
blending_weights = {
    'lgbm': 0.3,
    'catboost': 0.3,
    'lightautoml': 0.3
}

In [6]:
## Do it on train
INPUT = "/kaggle/input/linking-writing-processes-to-writing-quality"
train_logs=pd.read_csv("train_logs.csv")
print(f"len(train_logs):{len(train_logs)}")
train_logs=train_logs.sort_values(by=['id', 'down_time'])
# 重置索引
train_logs = train_logs.reset_index(drop=True)
# 根据'id'列进行分组，并为每个分组添加一个递增的序列
train_logs['event_id'] = train_logs.groupby('id').cumcount() + 1

train_scores=pd.read_csv("train_scores.csv")

# DO it on test
test_logs=pd.read_csv("test_logs.csv")
print(f"len(test_logs):{len(test_logs)}")
test_logs=test_logs.sort_values(by=['id', 'down_time'])
# 重置索引
test_logs = test_logs.reset_index(drop=True)
# 根据'id'列进行分组，并为每个分组添加一个递增的序列
test_logs['event_id'] = test_logs.groupby('id').cumcount() + 1
test_logs.to_csv("test_logs.csv",index=None)

len(train_logs):8405898
len(test_logs):6


In [7]:
#统计‘q’ '.' ‘ ’的一个函数.
def getEssays(df):
    #获取传入的df的这几列.
    textInputDf = df[['id', 'activity', 'cursor_position', 'text_change','down_event']]
    #取出activity不等于'Nonproduction'的那些数据
    textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']
    #统计每个id的出现次数,不排序
    valCountsArr = textInputDf['id'].value_counts(sort=False).values
    #最后的下标
    lastIndex = 0
    #创建一个新的序列对象.
    essay_df = pd.DataFrame(columns=['essay'])
    #index是第几个id,valCount是出现次数
    for index, valCount in enumerate(valCountsArr):
        #取出第i个id的['activity', 'cursor_position', 'text_change']
        currTextInput = textInputDf[['activity', 'cursor_position', 'text_change','down_event']].iloc[lastIndex : lastIndex + valCount]
        #跳到下一个id的index
        lastIndex += valCount
        essayText = ""
        previousText = ""
        for Input in currTextInput.values:
            if Input[3] != 'z' or Input[0] != 'Remove/Cut':
                previousText = essayText
            #input[0]是这个id的activity
            if Input[0] == 'Replace':
                #text_change按照' => '分开 replaceTxt:[' qqq qqqqq ', ' ']
                replaceTxt = Input[2].split(' => ')#应该是A=>B的操作
                #input[1]是鼠标位置,是一个数字 鼠标位置-len()
                #这是一个字符串的转换操作,由replaceTxt[0]转成replaceTxt[1] 
                essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] +essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
                continue
            if Input[0] == 'Paste':#粘贴
                #print(f"input[2]:{Input[2]}") #input[2]:qqqqqqqqqqq 
                essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
                continue
            if Input[0] == 'Remove/Cut':#删除剪切 在Input[1]的位置删除Input[2]
                if Input[3] == 'z':
                    essayText = previousText
                else:
                    essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
                continue
            #如果是Move from
            if "M" in Input[0]:
                #[284, 292] To [282, 290] 把[284, 292]这8行移动到[282,290]
                croppedTxt = Input[0][10:]
                #from和to的4个数字分开.
                splitTxt = croppedTxt.split(' To ')
                valueArr = [item.split(', ') for item in splitTxt]
                moveData = (int(valueArr[0][0][1:]), 
                            int(valueArr[0][1][:-1]), 
                            int(valueArr[1][0][1:]), 
                            int(valueArr[1][1][:-1]))
                #行号不相等,如果相等,等于什么都没有做
                if moveData[0] != moveData[2]:
                    #行号小于 
                    if moveData[0] < moveData[2]:
                        essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] +\
                        essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                    #行号大于
                    else:
                        essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] +\
                        essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
                continue
            #相当于是个check    
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
        #对应id对应论文
        essay_df = essay_df.append({'essay': essayText}, ignore_index=True)
    #id
    essay_df["id"] =  textInputDf['id'].unique()
    # return pd.DataFrame(essaySeries, columns=['essay']).reset_index().rename(columns={"index":'id'})
    return essay_df.reset_index(drop=True)

In [8]:
#获取数据中第25%的数值
def q1(x):
    return x.quantile(0.25)
#获取数据中第75%的数值
def q3(x):
    return x.quantile(0.75)
AGGREGATIONS = ['count', 'mean', 'std', 'min', 'max', 'first', 'last', 'sem', q1, 'median', q3, 'skew', kurtosis, 'sum']

#将论文转成单词
def split_essays_into_words(df):
    essay_df = df
    #对空格,\n,句号问号感叹号进行匹配,得到一个拆分后的列表.
    essay_df['word'] = essay_df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!',x))
    # essay1 [1,2,3] essay2[4,5] ->5行 essay1 1  // essay1 2 // essay1 3 // essay2 1 // essay2 2
    essay_df = essay_df.explode('word')
    #求出每个单词的长度
    essay_df['word_len'] = essay_df['word'].apply(lambda x: len(x))
    #去掉单词长度为0的数据
    essay_df = essay_df[essay_df['word_len'] != 0]
    return essay_df 

#计算word_len的统计学变量,并计算>=word_len的词数
def compute_word_aggregations(word_df):
    #根据id计算单词长度的统计学变量
    word_agg_df = word_df[['id','word_len']].groupby(['id']).agg(AGGREGATIONS)
    #比如('mean','word_len')->'mean_word_len'
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    for word_l in [5, 6, 7, 8, 9, 10, 11, 12]:
        #ge 就是Latex里>=的符号,筛选出word_len>=word_l的行,根据id进行统计,提取每个计数的第0行
        word_agg_df[f'word_len_ge_{word_l}_count'] = word_df[word_df['word_len'] >= word_l].groupby(['id']).count().iloc[:, 0]
        #如果有缺失值就填充为0
        word_agg_df[f'word_len_ge_{word_l}_count'] = word_agg_df[f'word_len_ge_{word_l}_count'].fillna(0)
    #重置索引
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df

#explore the punctuations
def punctuations_agg(df):
    essay_df = df.copy()
    essay_df['between_punc'] = essay_df['essay'].apply(lambda x: re.split(r'[,\n.?!]', x))
    essay_df = essay_df.explode('between_punc')
    essay_df['between_punc_len'] = essay_df['between_punc'].apply(lambda x: len(x))
    punc_agg_df = essay_df[['id','between_punc_len']].groupby(['id']).agg(AGGREGATIONS)
    punc_agg_df.columns = ['_'.join(x) for x in punc_agg_df.columns]
    punc_agg_df['id'] = punc_agg_df.index
    punc_agg_df.reset_index(drop=True, inplace=True)
    return punc_agg_df

def punctuations(df):
    essay_df = df.copy()
    punc_df = pd.DataFrame()
    punc_df['exclamation_count'] = essay_df['essay'].apply(lambda x: x.count("!"))
    punc_df['period_count'] = essay_df['essay'].apply(lambda x: x.count("."))
    punc_df['question_count'] = essay_df['essay'].apply(lambda x: x.count("?"))
    punc_df['comma_per_sent'] = essay_df['essay'].apply(lambda x: x.count(",") / len(re.split(r'[\n.?!]', x)))
    return punc_df

#将传入的论文df转成句子
def split_essays_into_sentences(df):
    essay_df = df#传入的df就是论文的df
    #对句子按照. ? !进行拆分. 得到一个拆分后的列表.
    essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',x))
    # essay1 [1,2,3] essay2[4,5] ->5行 essay1 1  // essay1 2 // essay1 3 // essay2 1 // essay2 2
    essay_df = essay_df.explode('sent')
    #将换行符'\n'变成空白字符 strip 去除行头和行尾的空白字符.
    essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n','').strip())
    #统计一下每个句子的长度 
    essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
    #求一下每个句子单词的个数.
    essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
    #去掉那些句子长度为0的数据
    essay_df = essay_df[essay_df.sent_len!=0].reset_index(drop=True)
    return essay_df

def compute_sentence_aggregations(df):
    #统计句子长度的统计学变量和每个句子词数的统计学变量
    sent_agg_df = pd.concat(
        [df[['id','sent_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    )
    #比如('mean','sent_len')->'mean_sent_len'
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index

    # New features intoduced here: https://www.kaggle.com/code/mcpenguin/writing-processes-to-quality-baseline-v2
    for sent_l in [50, 60, 75, 100]:
        #ge 就是Latex里>=的符号,筛选出sent_len>=sent_l的行,根据id进行统计,提取每个计数的第0行
        sent_agg_df[f'sent_len_ge_{sent_l}_count'] = df[df['sent_len'] >= sent_l].groupby(['id']).count().iloc[:, 0]
        #如果有缺失值就填充为0
        sent_agg_df[f'sent_len_ge_{sent_l}_count'] = sent_agg_df[f'sent_len_ge_{sent_l}_count'].fillna(0)
    #重置索引
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    #一句话里词的个数的count,其实就是有多少句话,也就是sent_len的count.重复了,故去掉.
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    #sent_len_count其实就是有多少句话,故rename.
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
    return sent_agg_df

#将论文根据换行符划分为段落.(每段有多少句话为什么没有统计?)
def split_essays_into_paragraphs(df):
    essay_df = df
    #按照'\n'划分成段落 [1,2,3]
    essay_df['paragraph'] = essay_df['essay'].apply(lambda x: x.split('\n'))
    #[论文1 [段落1 段落2,……]->[论文1 段落1 // 论文1 段落2]
    essay_df = essay_df.explode('paragraph')
    #统计段落的长度
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x)) 
    #统计每个段落的词数
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
    #将段落长度为0的数据去掉.
    essay_df = essay_df[essay_df.paragraph_len!=0].reset_index(drop=True)
    return essay_df

#对段落的长度和词数用统计学变量,和上面句子的代码一致.
def compute_paragraph_aggregations(df):
    paragraph_agg_df = pd.concat(
        [df[['id','paragraph_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    ) 
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
    return paragraph_agg_df

In [9]:
print("train_essays")
# train_essays = pd.read_csv('yunsu_essay.csv')
train_essays = getEssays(train_logs)
# train_essays.to_csv("yunsu_essay.csv",index=False)
print("train_punc_agg_df")
train_punc_agg_df = punctuations_agg(train_essays)
train_punc_agg_df = pd.concat([train_punc_agg_df,punctuations(train_essays)],axis=1)
print("train_word_agg_df")
train_word_agg_df = compute_word_aggregations(split_essays_into_words(train_essays))
print("train_sent_agg_df")
train_sent_agg_df = compute_sentence_aggregations(split_essays_into_sentences(train_essays))
print("train_paragraph_agg_df")
train_paragraph_agg_df = compute_paragraph_aggregations(split_essays_into_paragraphs(train_essays))

print("test_essays")
test_essays = getEssays(test_logs)
test_essays_copy=test_essays.copy()
print("train_punc_agg_df")
test_punc_agg_df = punctuations_agg(test_essays)
test_punc_agg_df = pd.concat([test_punc_agg_df,punctuations(test_essays)],axis=1)
print("test_word_agg_df")
test_word_agg_df = compute_word_aggregations(split_essays_into_words(test_essays))
print("test_sent_agg_df")
test_sent_agg_df = compute_sentence_aggregations(split_essays_into_sentences(test_essays))
print("test_paragraph_agg_df")
test_paragraph_agg_df = compute_paragraph_aggregations(split_essays_into_paragraphs(test_essays))

train_essays
train_punc_agg_df
train_word_agg_df
train_sent_agg_df
train_paragraph_agg_df
test_essays
train_punc_agg_df
test_word_agg_df
test_sent_agg_df
test_paragraph_agg_df


In [10]:
class Preprocessor:#数据预处理的一个类
    
    def __init__(self):
        
        self.activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste','Move From']#这是activity的一列
        self.events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
              'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']#down_event中选出一些重要的
        self.text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']#text_change中选出一些重要的
        self.punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                        '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']#down_event中的一些标点符号
        self.gaps = [1, 2, 3, 5, 10, 20, 50, 100]#滞后项
        
        #这里是用于存储每个activity的idf值
        self.idf = defaultdict(float)#创建了一个float类型的字典,如果访问不存在,默认值为0.0
    
    #统计df对象中activity的count
    def activity_counts(self, df):
        #对每个id的所有activity组合成一个列表
        tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
        #创建一个空列表
        ret = list()
        for li in tmp_df['activity'].values:#取出一个人的activity列表
            items = list(Counter(li).items())#转成[(activity1:count1),(activity2:count2),……]
            di = dict()#一个空字典
            #每个activity初始化为0
            for k in self.activities:
                di[k] = 0
            #统计每个activity的count
            for item in items:
                k, v = item[0], item[1]#k:activity v:count
                if k in di:
                    di[k] = v
            #加上这个人的每个activity的count
            ret.append(di)
        #转成pandas类型
        ret = pd.DataFrame(ret)
        #给表格的每列换个名字
        cols = [f'activity_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        #每列元素求和,文章中出现的总次数
        #add up the sum in each row
        cnts = ret.sum(1)

        #前面是词袋模型,这里转成tf-idf模型
        for col in cols:#activity_i_count
            if col in self.idf.keys():#字典里如果已经有这个key了
                idf = self.idf[col]
            else:#不在这个字典里
                #计算idf=log(数据量/(某列和+1))
                idf = np.log(df.shape[0] / (ret[col].sum() + 1))
                self.idf[col] = idf#将col的idf加入字典
            #ret[col] / cnts :一个entry的某种类型的总输入次数 / 一个entry该特征大类中所有类型,为什么取log再加1不知道
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret#tf-idf

    #这个是event的tf-idf模型,这里可能有down_event和up_event,故colname单独设置
    def event_counts(self, df, colname):
        tmp_df = df.groupby('id').agg({colname: list}).reset_index()
        ret = list()
        for li in tmp_df[colname].values:
            items = list(Counter(li).items())
            di = dict()
            for k in self.events:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        # a list of dictionary => df
        # columns = possible values for event column
        ret = pd.DataFrame(ret)
        cols = [f'{colname}_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf
            
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret

    #text_change的tf-idf模型
    def text_change_counts(self, df):
        tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
        ret = list()
        for li in tmp_df['text_change'].values:
            items = list(Counter(li).items())
            di = dict()
            for k in self.text_changes:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'text_change_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf
            
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf
            
        return ret
    #统计标点之类的出现的次数,不过这次是直接将它们相加做统计的.(可能这样比tf-idf好?)
    def match_punctuations(self, df):
        tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
        ret = list()
        for li in tmp_df['down_event'].values:
            cnt = 0
            items = list(Counter(li).items())
            for item in items:
                k, v = item[0], item[1]
                if k in self.punctuations:#只要在这张表里,就相加
                    cnt += v
            ret.append(cnt)
        ret = pd.DataFrame({'punct_cnt': ret})
        return ret


    def get_input_words(self, df):
        #~是取反的布尔值 取出text_change 中不包含 => 且不是Nochange的
        tmp_df = df[(~df['text_change'].str.contains('=>'))&(df['text_change'] != 'NoChange')].reset_index(drop=True)
        #在drop掉包含 => 和Nochange之后 按id打包成列表
        tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
        #将列表连接成一个整体
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
        #用正则表达式子匹配一个或者多个'q'字符
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
        #统计len,也就是统计text_change中有多少个有q的字符
        tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
        #求均值,方差,最大值,取到np.nan就设置为0
        tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df.drop(['text_change'], axis=1, inplace=True)
        return tmp_df
    
    #对df做特征工程!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    def make_feats(self, df):
        print("Starting to engineer features")
        #创建一个只有id一列的表格
        feats = pd.DataFrame({'id': df['id'].unique().tolist()})
        #做时序上的特征工程
        print("Engineering time data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            #利用up_time的shift创造action_time_gap
            df[f'up_time_shift{gap}'] = df.groupby('id')['up_time'].shift(gap)
            df[f'action_time_gap{gap}'] = df['down_time'] - df[f'up_time_shift{gap}']
        df.drop(columns=[f'up_time_shift{gap}' for gap in self.gaps], inplace=True)

        #对cursor_position做特征工程,这个就是自己-自己
        print("Engineering cursor position data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            df[f'cursor_position_shift{gap}'] = df.groupby('id')['cursor_position'].shift(gap)
            df[f'cursor_position_change{gap}'] = df['cursor_position'] - df[f'cursor_position_shift{gap}']
            #取了绝对值,鼠标向前移动也是移动了.
            df[f'cursor_position_abs_change{gap}'] = np.abs(df[f'cursor_position_change{gap}'])
        df.drop(columns=[f'cursor_position_shift{gap}' for gap in self.gaps], inplace=True)

        #对word_count做类似的特征工程,词数减少也是移动了.
        print("Engineering word count data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            df[f'word_count_shift{gap}'] = df.groupby('id')['word_count'].shift(gap)
            df[f'word_count_change{gap}'] = df['word_count'] - df[f'word_count_shift{gap}']
            df[f'word_count_abs_change{gap}'] = np.abs(df[f'word_count_change{gap}'])
        df.drop(columns=[f'word_count_shift{gap}' for gap in self.gaps], inplace=True)
        
        print("Engineering statistical summaries for features")
        #需要对哪些特征做哪些统计变量,这些都是大佬统计好的,就不做修改了.
        feats_stat = [
            ('event_id', ['max']),
            ('down_time',['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum']),
            ('up_time',['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum']),
            ('action_time', ['max', 'min', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt,'last', 'first','median']),
            ('activity', ['nunique']),
            ('down_event', ['nunique']),
            ('up_event', ['nunique']),
            ('text_change', ['nunique']),
            ('cursor_position', ['nunique', 'max', 'quantile', 'sem', 'mean', 'std', 'min','last', 'first',  'median', 'sum']),
            ('word_count', ['nunique', 'max', 'quantile', 'sem', 'mean', 'std', 'min', 'last', 'first','median', 'sum'])]
        #滞后特征的统计变量用for循环进行添加
        for gap in self.gaps:
            feats_stat.extend([
                (f'action_time_gap{gap}', ['max', 'min', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt]),
                (f'cursor_position_change{gap}', ['max', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt]),
                (f'word_count_change{gap}', ['max', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt])
            ])
        
        pbar = feats_stat
        for item in pbar:
            colname, methods = item[0], item[1]#取出某列特征和需要进行的统计学的量'max'
            for method in methods:
                #转成能放入agg的方法
                if isinstance(method, str):
                    method_name = method
                else:
                    method_name = method.__name__
                #添加到feats里.
                tmp_df = df.groupby(['id']).agg({colname: method}).reset_index().rename(columns={colname: f'{colname}_{method_name}'})
                feats = feats.merge(tmp_df, on='id', how='left')

        #调用方法求activity的tf-idf
        print("Engineering activity counts data")
        tmp_df = self.activity_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        #调用方法求down_event和up_event的tf-idf
        print("Engineering event counts data")
        tmp_df = self.event_counts(df, 'down_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        tmp_df = self.event_counts(df, 'up_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering text change counts data")
        tmp_df = self.text_change_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering punctuation counts data")
        tmp_df = self.match_punctuations(df)
        feats = pd.concat([feats, tmp_df], axis=1)

        # input words
        print("Engineering input words data")
        tmp_df = self.get_input_words(df)
        feats = pd.merge(feats, tmp_df, on='id', how='left')

        # compare feats
        print("Engineering ratios data")
        feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
        feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
        feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
        #休息时间的占比
        feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']

        
        print("Engineer R/P burst (new)")
        def helper(selected_burst): 
            selected_burst = selected_burst.reset_index(drop=True)
            selected_burst[0] = False
            end_indices = []
            start_indices = []
            for i in range(1,len(selected_burst)): 
                if selected_burst[i] == False and selected_burst[i-1] == True:
                    end_indices.append(i)
                elif selected_burst[i] == True and selected_burst[i-1] == False:
                    start_indices.append(i)
            return [end_indices[i] - start_indices[i] for i in range(len(end_indices))]
           
        #df is edited in the process, not drop anything orignal
        tmp_df = df.copy()
        display(tmp_df)
        tmp_df = tmp_df[tmp_df['activity'].isin(['Remove/Cut','Input'])].reset_index(drop=True)
        R_burst = tmp_df['activity'] == 'Input' 
        P_burst = tmp_df['action_time_gap1'] < 2000
        valCountsArr = tmp_df['id'].value_counts(sort=False).items()
        last_index = 0
        burst_df = pd.DataFrame(columns=['id', 'P_burst', 'R_burst', 'PR_burst'])
        for id, count in valCountsArr:
            selected_P = P_burst.iloc[last_index:last_index + count]
            selected_R = R_burst.iloc[last_index:last_index + count]
            selected_P_len = helper(selected_P)
            selected_R_len = helper(selected_R)
            selected_PR_len = helper(selected_P & selected_R)
            burst_df = burst_df.append({'id': id, 
                                        'P_burst': selected_P_len, 
                                        'R_burst': selected_R_len, 
                                        'PR_burst': selected_PR_len}, 
                                    ignore_index=True)
            last_index += count
            
            
        def help_agg_burst(burst_df, colname):
            tmp_df = burst_df[['id',colname]].explode(colname)
            tmp_df[colname] = pd.to_numeric(tmp_df[colname], errors='coerce')
            tmp_df_agg = tmp_df.groupby(['id']).agg(AGGREGATIONS)
            tmp_df_agg.columns = ['_'.join(x) for x in tmp_df_agg.columns]
            return tmp_df_agg

        burst_df["P_burst"] = burst_df["P_burst"].apply(lambda x : [num for num in x if num >= 5])
        burst_df["R_burst"] = burst_df["R_burst"].apply(lambda x : [num for num in x if num >= 5])
        burst_df["PR_burst"] = burst_df["PR_burst"].apply(lambda x : [num for num in x if num >= 5])
        burst_df = pd.concat([help_agg_burst(burst_df, "P_burst"),help_agg_burst(burst_df, "R_burst"),help_agg_burst(burst_df, "PR_burst")],axis=1).reset_index()
        feats = pd.merge(feats, burst_df, on='id', how='left')
        
        print("Thinking Time (new)")
        thinking_time = df[df.activity == 'Input'].groupby(['id']).first().reset_index()
        feats = pd.concat([feats,thinking_time['down_time']],axis=1)
        feats.rename(columns={"down_time":"thinking_time"},inplace=True)
        
        
        print("wpm (new)")
        wpm_tmp_df = df.copy()
        def helper_wpm(group):
            return group['word_count'].iloc[-1] / group['up_time'].iloc[-1] * 60000
        wpm_df = wpm_tmp_df[['word_count','up_time','id']].groupby(['id']).apply(helper_wpm).reset_index()
        wpm_df.rename(columns = {0:"wpm"},inplace=True)
        feats = pd.merge(feats, wpm_df, on='id', how='left')
        
        
        print("Done!")
        return feats

preprocessor = Preprocessor()
print("Engineering features for training data")

train_logs_copy = train_logs.copy()
test_logs_copy = test_logs.copy()
train_feats = preprocessor.make_feats(train_logs_copy)
print("-"*25)
print("Engineering features for test data")
test_feats = preprocessor.make_feats(test_logs_copy)

Engineering features for training data
Starting to engineer features
Engineering time data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering cursor position data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering word count data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering statistical summaries for features
Engineering activity counts data
Engineering event counts data
Engineering text change counts data
Engineering punctuation counts data
Engineering input words data
Engineering ratios data
Engineer R/P burst (new)


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,...,word_count_change5,word_count_abs_change5,word_count_change10,word_count_abs_change10,word_count_change20,word_count_abs_change20,word_count_change50,word_count_abs_change50,word_count_change100,word_count_abs_change100
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,...,,,,,,,,,,
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,...,,,,,,,,,,
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,...,,,,,,,,,,
3,001519c8,4,106686,106777,91,Input,q,q,q,1,...,,,,,,,,,,
4,001519c8,5,107196,107323,127,Input,q,q,q,2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,...,0.0,0.0,-1.0,1.0,-1.0,1.0,2.0,2.0,11.0,11.0
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,10.0,10.0
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,10.0,10.0
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,10.0,10.0


Thinking Time (new)
wpm (new)
Done!
-------------------------
Engineering features for test data
Starting to engineer features
Engineering time data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering cursor position data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering word count data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering statistical summaries for features
Engineering activity counts data
Engineering event counts data
Engineering text change counts data
Engineering punctuation counts data
Engineering input words data
Engineering ratios data
Engineer R/P burst (new)


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,...,word_count_change5,word_count_abs_change5,word_count_change10,word_count_abs_change10,word_count_change20,word_count_abs_change20,word_count_change50,word_count_abs_change50,word_count_change100,word_count_abs_change100
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,...,,,,,,,,,,
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,...,,,,,,,,,,
2,2222bbbb,1,290502,290548,46,Input,q,q,q,1,...,,,,,,,,,,
3,2222bbbb,2,711956,712023,67,Input,q,q,q,0,...,,,,,,,,,,
4,4444cccc,1,184996,185052,56,Input,q,q,q,1,...,,,,,,,,,,
5,4444cccc,2,635547,635641,94,Input,Space,Space,,0,...,,,,,,,,,,


Thinking Time (new)
wpm (new)
Done!


In [11]:
data = []

for logs in [train_logs, test_logs]:
    #up_time向后移动并且用down_time填充缺失的位置
    logs['up_time_lagged'] = logs.groupby('id')['up_time'].shift(1).fillna(logs['down_time'])
    #(down_time减上一个时刻的up_time) /1000是单位转换
    logs['time_diff'] = abs(logs['down_time'] - logs['up_time_lagged']) / 1000

    #按照id打包time_diff
    group = logs.groupby('id')['time_diff']
    #延迟时间的max,min,median
    largest_lantency = group.max()
    smallest_lantency = group.min()
    median_lantency = group.median()
    #down_time的first /1000是做单位转换吧
    initial_pause = logs.groupby('id')['down_time'].first() / 1000
    #分层次求和
    pauses_half_sec = group.apply(lambda x: ((x > 0.5) & (x <= 1)).sum())
    pauses_1_sec = group.apply(lambda x: ((x > 1) & (x <= 1.5)).sum())
    pauses_1_half_sec = group.apply(lambda x: ((x > 1.5) & (x <= 2)).sum())
    pauses_2_sec = group.apply(lambda x: ((x > 2) & (x <= 3)).sum())
    pauses_3_sec = group.apply(lambda x: (x > 3).sum())

    data.append(pd.DataFrame({
        'id': logs['id'].unique(),
         #延迟
        'largest_lantency': largest_lantency,
        'smallest_lantency': smallest_lantency,
        'median_lantency': median_lantency,
        'initial_pause': initial_pause,
        'pauses_half_sec': pauses_half_sec,
        'pauses_1_sec': pauses_1_sec,
        'pauses_1_half_sec': pauses_1_half_sec,
        'pauses_2_sec': pauses_2_sec,
        'pauses_3_sec': pauses_3_sec,
    }).reset_index(drop=True))

train_eD592674, test_eD592674 = data

gc.collect()#手动触发垃圾回收,强制回收由垃圾回收器标记为未使用的内存

train_feats = train_feats.merge(train_eD592674, on='id', how='left')
test_feats = test_feats.merge(test_eD592674, on='id', how='left')
train_feats = train_feats.merge(train_scores, on='id', how='left')

In [12]:
#将论文的特征加上
train_feats=train_feats.merge(train_word_agg_df,on='id', how='left')
train_feats=train_feats.merge(train_sent_agg_df,on='id', how='left')
train_feats=train_feats.merge(train_paragraph_agg_df,on='id', how='left')
train_feats=train_feats.merge(train_punc_agg_df,on='id', how='left')
## replace with actual data (new)
train_feats['word_len_count'] = train_logs.groupby('id')['word_count'].last().reset_index(drop=True)
#将论文的特征加上
test_feats=test_feats.merge(test_word_agg_df,on='id', how='left')
test_feats=test_feats.merge(test_sent_agg_df,on='id', how='left')
test_feats=test_feats.merge(test_paragraph_agg_df,on='id', how='left')
test_feats=test_feats.merge(test_punc_agg_df,on='id', how='left')
test_feats['word_len_count'] = test_logs.groupby('id')['word_count'].last().reset_index(drop=True)

In [13]:
#找到只有唯一值的列,删掉
keys=train_feats.keys().values
unique_cols=[key for key in keys if train_feats[key].nunique()<2]
print(f"unique_cols:{unique_cols}")
train_feats = train_feats.drop(columns=unique_cols)
test_feats = test_feats.drop(columns=unique_cols)

unique_cols:['cursor_position_min', 'word_count_change1_quantile', 'word_count_change2_quantile', 'activity_5_count', 'smallest_lantency']


In [14]:
best_features = train_feats.drop(['score'],axis=1).keys().values 
train_scores = train_feats[['id','score']]
X_y = pd.merge(train_feats[best_features], train_scores, on='id', how='left')
X_y.replace([np.inf, -np.inf], np.nan, inplace=True)

In [15]:
#这里创建了lgbm,cat,SVR模型
def make_model():
    
    #大佬找好的参数,这里不做改动
    params = {'reg_alpha': 0.007678095440286993, 
               'reg_lambda': 0.34230534302168353, 
               'colsample_bytree': 0.627061253588415, 
               'subsample': 0.854942238828458, 
               'learning_rate': 0.038697981947473245, 
               'num_leaves': 22, 
               'max_depth': 37, 
               'min_child_samples': 18,
               'random_state': seed,
               'n_estimators': 150,
               "objective": "regression",
               "metric": "rmse",
               'force_col_wise': True,
               "verbosity": 0,
              }
    
    model1 = LGBMRegressor(**params)
    
    model2 = CatBoostRegressor(iterations=1000,
                                 learning_rate=0.1,
                                 depth=6,
                                 eval_metric='RMSE',
                                 random_seed = seed,
                                 bagging_temperature = 0.2,
                                 od_type='Iter',
                                 metric_period = 50,
                                 od_wait=20,
                                 verbose=False)
    
    
    models = []
    models.append((model1, 'lgbm'))
    models.append((model2, 'catboost'))
    
    return models

In [None]:
import joblib 
# solution 1 with gan
# models_and_errors_dict = joblib.load("/kaggle/input/lgbm-cb-gan-pkl/lgbm_and_cb_model.pkl")

# solution 2 without gan
models_and_errors_dict = joblib.load("/kaggle/input/lgbm-and-cb-pkl/lgbm_and_cb_model.pkl")

In [2]:
conda 

3.11.5 (main, Sep 11 2023, 08:31:25) [Clang 14.0.6 ]


In [3]:
!pip install lightautoml==0.3.8b1

[31mERROR: Ignored the following versions that require a different python version: 0.3.0 Requires-Python >=3.6.1,<3.10; 0.3.1 Requires-Python >=3.6.1,<3.10; 0.3.2 Requires-Python >=3.6.1,<3.10; 0.3.3 Requires-Python >=3.6.1,<3.10; 0.3.4 Requires-Python >=3.6.1,<3.10; 0.3.5 Requires-Python >=3.6.1,<3.10; 0.3.6 Requires-Python >=3.6.1,<3.10; 0.3.7 Requires-Python >=3.6.1,<3.10; 0.3.7.1 Requires-Python >=3.6.1,<3.10; 0.3.7.2 Requires-Python >=3.6.1,<3.10; 0.3.7.3 Requires-Python >=3.6.1,<3.10; 0.3.8b1 Requires-Python >=3.6.1,<3.11[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement lightautoml==0.3.8b1 (from versions: 0.2.1, 0.2.2, 0.2.3, 0.2.4, 0.2.5, 0.2.6, 0.2.7, 0.2.8, 0.2.10, 0.2.11, 0.2.12, 0.2.13, 0.2.14, 0.2.15, 0.2.16)[0m[31m
[0m[31mERROR: No matching distribution found for lightautoml==0.3.8b1[0m[31m
[0m

In [2]:
brew update


SyntaxError: invalid syntax (2649740877.py, line 1)

In [23]:
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

N_THREADS = 4
num_folds = 5
# N_FOLDS = 5 redundant
RANDOM_STATE = 2033 
TEST_SIZE = 0.2 
TIMEOUT = 3600
TARGET_NAME = 'score'

from sklearn.metrics import mean_squared_error
from math import sqrt

# task = Task('reg')

# train_data = pd.read_csv("yunsu_train_feats.csv")

# roles = {
# 'target': TARGET_NAME, 'drop': ['id']
# }


# automl = TabularAutoML(
#     task = task,
#     cpu_limit = N_THREADS,
#     reader_params = {'n_jobs': N_THREADS, 'cv': num_folds, 'random_state': RANDOM_STATE, 'device': 'gpu'},
#     timeout = TIMEOUT
# )

# out_of_fold_predictions = automl.fit_predict(train_data, roles = roles, verbose = 1)
# test_predictions = automl.predict(test_feats)
# print(f'Prediction for test_data:\n{test_predictions}\nShape = {test_predictions.shape}')
# joblib.dump((automl, out_of_fold_predictions.data[:,0]),"automl_model_and_predictions_yunsu.pkl")

from sklearn.metrics import mean_squared_error
automl, oof_prediction = joblib.load("/kaggle/input/lightauoml-roy-pkl/Roy.pkl")
lightautoml_rmse = mean_squared_error(X_y['score'].values, oof_prediction,squared=False)
print(f'OOF score (RMSE): {lightautoml_rmse}')
models_and_errors_dict['lightautoml'] = [(automl, lightautoml_rmse, None, None,oof_prediction)]

ModuleNotFoundError: No module named 'lightautoml'

In [None]:
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((y_true-y_pred)**2))
target = pd.DataFrame(X_y['score'],columns=['score'])

lgb_oof_pred=models_and_errors_dict['lgbm'][4][4]
cat_oof_pred=models_and_errors_dict['catboost'][4][4]
lightautoml_oof_pred=models_and_errors_dict['lightautoml'][0][4]
margin=100
# 重运行就崩了，呜呜
# target=target.values
lgbm_RMSE = RMSE(target['score'],lgb_oof_pred)
cat_RMSE = RMSE(target['score'],cat_oof_pred)
lightautoml_RMSE = RMSE(target['score'],lightautoml_oof_pred)
print(
    f"""
    The CV RMSE of lgbm: {lgbm_RMSE}
    The CV RMSE of cb: {cat_RMSE}
    The CV RMSE of lightautoml:{lightautoml_RMSE}
    """
)
current_RMSE=RMSE(target['score'],(lgb_oof_pred+cat_oof_pred)/2)
best_i=0
best_j=0
for i in range(0,margin):
    for j in range(0,margin-i):
        #o=1000-i-j
            blend_oof_pred=(i*lgb_oof_pred+j*cat_oof_pred+(margin-i-j)*lightautoml_oof_pred)/margin
            if RMSE(target['score'],blend_oof_pred)<current_RMSE:
                print(f"current_RMSE:{current_RMSE}")
                current_RMSE=RMSE(target['score'],blend_oof_pred)
                best_i=i
                best_j=j

#找到最好的参数之后
blending_weights['lgbm']=best_i/margin
blending_weights['catboost']=best_j/margin
blending_weights['lightautoml']=(margin-best_i-best_j)/margin
print(f"blending_weights:{blending_weights}")

In [None]:
from sklearn.preprocessing import StandardScaler
y_hats = dict()

#设置submission_df,id和score
submission_df = pd.DataFrame(test_feats['id'])
submission_df['score'] = 3.5#如果报错,将预测结果设置为3.5

#取出test_feats中所有列
X_unseen = test_feats.copy()[best_features]
X_unseen.drop(columns=['id'], inplace=True)
X_unseen.replace([np.inf, -np.inf], np.nan, inplace=True)
X_unseen.loc[:, X_unseen.isna().all()] = 0

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
X_unseen_imputed = imputer.fit_transform(X_unseen)
X_unseen_imputed = pd.DataFrame(X_unseen_imputed, columns=X_unseen.columns)
X_unseen = X_unseen_imputed


for model_name, model_info in models_and_errors_dict.items():
    print(f'\n--- {model_name} ---\n')
    
    #复制是因为有的要归一化
    X_unseen_copy = X_unseen.copy()
    y_hats[model_name] = []#某个model的预测结果

    for ix, (trained_model, error, imputer, uhhh ,oof_pred) in enumerate(model_info, start=1):
        print(f"Using model {ix} with error {error}")

        if model_name == "lightautoml":
            y_hats[model_name].append(trained_model.predict(X_unseen_copy).data[:,0])
        else: y_hats[model_name].append(trained_model.predict(X_unseen_copy))
        
    #如果有值的话,求平均,赋值给submission_df
    if y_hats[model_name]:
        y_hat_avg = np.mean(y_hats[model_name], axis=0)
        submission_df['score_' + model_name] = y_hat_avg
    print("Done.")
    
print("blending")
blended_score=np.zeros((len(test_essays_copy)))
for k, v in blending_weights.items():
    blended_score += submission_df['score_' + k] * v
print(f"blended_score:{blended_score}")

In [None]:
#数值型变量的几列
num_cols = ['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count']
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']

#df表格的colname列统计values的count.
def count_by_values(df, colname, values):
    #maintain_order=True保持原有顺序
    fts = df.select(pl.col('id').unique(maintain_order=True))
    for i, value in enumerate(values):
        #根据每个id判断colname是不是value并统计个数,rename成colname_i_cnt
        tmp_df = df.group_by('id').agg(pl.col(colname).is_in([value]).sum().alias(f'{colname}_{i}_cnt'))
        #加上这个特征
        fts  = fts.join(tmp_df, on='id', how='left') 
    return fts

def dev_feats(df):
    
    print("< Count by values features >")
    
    #统计activity,text_change,down_event,up_event这几个类别型变量的count
    feats = count_by_values(df, 'activity', activities)
    feats = feats.join(count_by_values(df, 'text_change', text_changes), on='id', how='left') 
    feats = feats.join(count_by_values(df, 'down_event', events), on='id', how='left') 
    feats = feats.join(count_by_values(df, 'up_event', events), on='id', how='left') 

    print("< Input words stats features >")
    #不含有'=>'且有变化的行
    temp = df.filter((~pl.col('text_change').str.contains('=>')) & (pl.col('text_change') != 'NoChange'))
    #按照id将text_change连接成一个长字符串,然后匹配'q+'的字符串
    temp = temp.group_by('id').agg(pl.col('text_change').str.concat('').str.extract_all(r'q+'))
    #统计输入的词数,词长度的均值,最大值,方差,中位数,偏斜度.
    temp = temp.with_columns(input_word_count = pl.col('text_change').list.lengths(),
                             input_word_length_mean = pl.col('text_change').apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_max = pl.col('text_change').apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_std = pl.col('text_change').apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_median = pl.col('text_change').apply(lambda x: np.median([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_skew = pl.col('text_change').apply(lambda x: skew([len(i) for i in x] if len(x) > 0 else 0)))
    #将text_change这列去掉,加入特征.
    temp = temp.drop('text_change')
    feats = feats.join(temp, on='id', how='left') 

    print("< Numerical columns features >")

    #对action_time求和,对数值型变量求均值,方差,中位数,最小值,最大值,50%的数字
    temp = df.group_by("id").agg(pl.sum('action_time').suffix('_sum'), pl.mean(num_cols).suffix('_mean'), pl.std(num_cols).suffix('_std'),
                                 pl.median(num_cols).suffix('_median'), pl.min(num_cols).suffix('_min'), pl.max(num_cols).suffix('_max'),
                                 pl.quantile(num_cols, 0.5).suffix('_quantile'))
    feats = feats.join(temp, on='id', how='left') 


    print("< Categorical columns features >")
    #类别型变量求了n_unique,加入特征.
    temp  = df.group_by("id").agg(pl.n_unique(['activity', 'down_event', 'up_event', 'text_change']))
    feats = feats.join(temp, on='id', how='left') 

    print("< Idle time features >")
    #这里就是论文中的特征.(https://files.eric.ed.gov/fulltext/ED592674.pdf)
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.group_by("id").agg(inter_key_largest_lantency = pl.max('time_diff'),
                                   inter_key_median_lantency = pl.median('time_diff'),
                                   mean_pause_time = pl.mean('time_diff'),
                                   std_pause_time = pl.std('time_diff'),
                                   total_pause_time = pl.sum('time_diff'),
                                   pauses_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 0.5) & (pl.col('time_diff') < 1)).count(),
                                   pauses_1_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1) & (pl.col('time_diff') < 1.5)).count(),
                                   pauses_1_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1.5) & (pl.col('time_diff') < 2)).count(),
                                   pauses_2_sec = pl.col('time_diff').filter((pl.col('time_diff') > 2) & (pl.col('time_diff') < 3)).count(),
                                   pauses_3_sec = pl.col('time_diff').filter(pl.col('time_diff') > 3).count(),)
    feats = feats.join(temp, on='id', how='left') 
    
    print("< P-bursts features >")
    #找到df中activity为‘Input’和‘Remove/cut’的行,并且是time_diff<2的行
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.with_columns(pl.col('time_diff')<2)
    #然后统计连续出现的个数的(统计学变量)
    temp = temp.with_columns(pl.when(pl.col("time_diff") & pl.col("time_diff").is_last()).then(pl.count()).over(pl.col("time_diff").rle_id()).alias('P-bursts'))
    temp = temp.drop_nulls()#删除包含缺失值的行
    temp = temp.group_by("id").agg(pl.mean('P-bursts').suffix('_mean'), pl.std('P-bursts').suffix('_std'), pl.count('P-bursts').suffix('_count'),
                                   pl.median('P-bursts').suffix('_median'), pl.max('P-bursts').suffix('_max'),
                                   pl.first('P-bursts').suffix('_first'), pl.last('P-bursts').suffix('_last'))
    feats = feats.join(temp, on='id', how='left') 

    print("< R-bursts features >")
    #取出数据中为'Remove/cut'
    temp = df.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.with_columns(pl.col('activity').is_in(['Remove/Cut']))
    #统计'Remove/cut'连续出现的次数(的统计学变量)
    temp = temp.with_columns(pl.when(pl.col("activity") & pl.col("activity").is_last()).then(pl.count()).over(pl.col("activity").rle_id()).alias('R-bursts'))
    temp = temp.drop_nulls()#删除包含缺失值的行
    temp = temp.group_by("id").agg(pl.mean('R-bursts').suffix('_mean'), pl.std('R-bursts').suffix('_std'), 
                                   pl.median('R-bursts').suffix('_median'), pl.max('R-bursts').suffix('_max'),
                                   pl.first('R-bursts').suffix('_first'), pl.last('R-bursts').suffix('_last'))
    feats = feats.join(temp, on='id', how='left')
    
    return feats

In [None]:
AGGREGATIONS = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']

def word_feats(df):
    essay_df = df
    df['word'] = df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!',x))
    df = df.explode('word')
    df['word_len'] = df['word'].apply(lambda x: len(x))
    df = df[df['word_len'] != 0]

    word_agg_df = df[['id','word_len']].groupby(['id']).agg(AGGREGATIONS)
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df


def sent_feats(df):
    df['sent'] = df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',x))
    df = df.explode('sent')
    df['sent'] = df['sent'].apply(lambda x: x.replace('\n','').strip())
    # Number of characters in sentences
    df['sent_len'] = df['sent'].apply(lambda x: len(x))
    # Number of words in sentences
    df['sent_word_count'] = df['sent'].apply(lambda x: len(x.split(' ')))
    df = df[df.sent_len!=0].reset_index(drop=True)

    sent_agg_df = pd.concat([df[['id','sent_len']].groupby(['id']).agg(AGGREGATIONS), 
                             df[['id','sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1)
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
    return sent_agg_df

def parag_feats(df):
    df['paragraph'] = df['essay'].apply(lambda x: x.split('\n'))
    df = df.explode('paragraph')
    # Number of characters in paragraphs
    df['paragraph_len'] = df['paragraph'].apply(lambda x: len(x)) 
    # Number of words in paragraphs
    df['paragraph_word_count'] = df['paragraph'].apply(lambda x: len(x.split(' ')))
    df = df[df.paragraph_len!=0].reset_index(drop=True)
    
    paragraph_agg_df = pd.concat([df[['id','paragraph_len']].groupby(['id']).agg(AGGREGATIONS), 
                                  df[['id','paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1) 
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
    return paragraph_agg_df

#论文的长度除以(‘Input’和‘Remove/Cut’)的按键个数.
def product_to_keys(logs, essays):
    essays['product_len'] = essays.essay.str.len()#论文的长度
    #logs中每个id ‘Input’和‘Remove/Cut’的数据
    tmp_df = logs[logs.activity.isin(['Input', 'Remove/Cut'])].groupby(['id']).agg({'activity': 'count'}).reset_index().rename(columns={'activity': 'keys_pressed'})
    essays = essays.merge(tmp_df, on='id', how='left')
    #论文的长度除以(‘Input’和‘Remove/Cut’)的按键个数.
    essays['product_to_keys'] = essays['product_len'] / essays['keys_pressed']
    return essays[['id', 'product_to_keys']]
#统计每秒有几个['Input', 'Remove/Cut']的行为.
def get_keys_pressed_per_second(logs):
    #logs中为['Input', 'Remove/Cut']的event_id的个数
    temp_df = logs[logs['activity'].isin(['Input', 'Remove/Cut'])].groupby(['id']).agg(keys_pressed=('event_id', 'count')).reset_index()
    #每个id最小的down_time和最大的up_time
    temp_df_2 = logs.groupby(['id']).agg(min_down_time=('down_time', 'min'), max_up_time=('up_time', 'max')).reset_index()
    #按照id融合在一起
    temp_df = temp_df.merge(temp_df_2, on='id', how='left')
    #每秒有几个event_id
    temp_df['keys_per_second'] = temp_df['keys_pressed'] / ((temp_df['max_up_time'] - temp_df['min_down_time']) / 1000)
    return temp_df[['id', 'keys_per_second']]

In [None]:
#传入训练数据dataX和datay,model,测试数据test_X
#就是一个简单的k折交叉验证,不过模型只有1个训练5次.训练完就得到测试集的预测结果.
def evaluate(data_x, data_y, model, random_state=seed, n_splits=5, test_x=None):
    #StratifiedKFold还要考虑每种类别的占比
    skf    = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    test_y = np.zeros((len(test_x), n_splits))#5折的预测结果
    for i, (train_idx, valid_idx) in enumerate(skf.split(data_x, data_y.astype(str))):
        train_x = data_x.iloc[train_idx]
        train_y = data_y[train_idx]
        valid_x = data_x.iloc[valid_idx]
        valid_y = data_y[valid_idx]
        model.fit(train_x, train_y)
        test_y[:, i] = model.predict(test_x)
    return np.mean(test_y, axis=1)


train_logs    = pl.scan_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
train_feats   = dev_feats(train_logs)
train_feats   = train_feats.collect().to_pandas()

print('< Essay Reconstruction >')
train_logs             = train_logs.collect().to_pandas()
train_essays           = pd.read_csv('/kaggle/input/writing-quality-challenge-constructed-essays/train_essays_fast.csv')
train_feats            = train_feats.merge(word_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(sent_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(parag_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(get_keys_pressed_per_second(train_logs), on='id', how='left')
train_feats            = train_feats.merge(product_to_keys(train_logs, train_essays), on='id', how='left')

#找到只有唯一值的列,删掉
keys=train_feats.keys().values
unique_cols=[key for key in keys if train_feats[key].nunique()<2]
print(f"unique_cols:{unique_cols}")
train_feats = train_feats.drop(columns=unique_cols)

print('< Mapping >')
train_scores   = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
data           = train_feats.merge(train_scores, on='id', how='left')
x              = data.drop(['id', 'score'], axis=1)
y              = data['score'].values

print(f'Number of features: {len(x.columns)}')

print('< Testing Data >')
test_logs   = pl.scan_csv('/kaggle/working/test_logs.csv')
test_feats  = dev_feats(test_logs)
test_feats  = test_feats.collect().to_pandas()

test_logs             = test_logs.collect().to_pandas()
test_essays           = test_essays_copy
test_feats            = test_feats.merge(word_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(get_keys_pressed_per_second(test_logs), on='id', how='left')
test_feats            = test_feats.merge(product_to_keys(test_logs, test_essays), on='id', how='left')

test_feats = test_feats.drop(columns=unique_cols)

test_ids = test_feats['id'].values
testin_x = test_feats.drop(['id'], axis=1)

print('< Learning and Evaluation >')
lgbm_params = {'n_estimators': 1024,
         'learning_rate': 0.006,
         'metric': 'rmse',
         'random_state': seed,
         'force_col_wise': True,
         'verbosity': 0,}
solution = LGBMRegressor(**lgbm_params)
y_pred_lgb   = evaluate(x.copy(), y.copy(), solution, test_x=testin_x.copy()) 
y_pred_lgb

In [None]:
y_pred = blended_score*0.6+ y_pred_lgb*0.4#将两种预测结果进行一个加权融合

submission = pd.DataFrame({'id': test_ids, 'score': y_pred})
submission.to_csv('submission.csv', index=False)
submission.head()