In [1]:
from typing import List, Optional

import optuna
import pandas as pd
import xgboost as xgb
from tqdm import tqdm

In [2]:
ACTIVITY_NAMES = [
    'nonproduction',
    'input',
    'remove/cut',
    'paste',
    'replace',
    'move'
]
ACTIVITY2IDX = dict(zip(ACTIVITY_NAMES, range(len(ACTIVITY_NAMES))))
TEXT_CHANGE_NAMES = ['alphanum', 'other']
TEXT_CHANGE2IDX = dict(zip(TEXT_CHANGE_NAMES, range(len(TEXT_CHANGE_NAMES))))
ACTION_TIME_COLS = [
    'action_time_max',
    'action_time_mean',
    'action_time_std'
]
WORD_COUNT_NAMES = ['word_count_max', 'word_count_last']


def process_activity(data: pd.Series) -> List[float]:
    """Count activity frequencies

    Parameters
    ----------
    data : pd.Series
        activity data

    Returns
    -------
    List[float]
        activity frequencies
    """

    activity = data.apply(lambda x: 'move' if 'Move' in x else x.lower())
    activity2freq = activity.value_counts(normalize=True)
    res = [0] * len(ACTIVITY_NAMES)
    for name, freq in activity2freq.items():
        res[ACTIVITY2IDX[name]] = freq

    return res


def process_text_change(data: pd.Series) -> List[float]:
    text_change = data.apply(lambda x: 'alphanum' if x == 'q' else 'other')
    text_change2freq = text_change.value_counts(normalize=True)
    res = [0] * len(TEXT_CHANGE_NAMES)
    for name, freq in text_change2freq.items():
        res[TEXT_CHANGE2IDX[name]] = freq

    return res


def prepare_data(
    df: pd.DataFrame,
    labels: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """Prepare feature matrix and (optional) labels for a given portion of data

    Parameters
    ----------
    df : pd.DataFrame
        Events DataFrame
    labels : Optional[pd.DataFrame], optional
        Labels DataFrame, by default None

    Returns
    -------
    Tuple[np.ndarray, Optional[np.ndarray]]
        X, y
    """

    data = []
    n_groups = df['id'].nunique()
    for log_id, group in tqdm(df.groupby('id'), total=n_groups):
        tmp = []

        # id
        tmp.append(log_id)

        # action_time
        tmp.extend([
            group['action_time'].max(),
            group['action_time'].mean(),
            group['action_time'].std()
        ])

        # activity
        tmp.extend(process_activity(data=group['activity']))

        # text_change
        tmp.extend(process_text_change(data=group['text_change']))

        # word_count
        tmp.append(group['word_count'].max())
        tmp.append(group.tail(1)['word_count'].values[0])

        # log size
        tmp.append(len(group))

        data.append(tmp)
    res = pd.DataFrame(
        data=data,
        columns=[
            'id', 
            *ACTION_TIME_COLS, 
            *ACTIVITY_NAMES, 
            *TEXT_CHANGE_NAMES, 
            *WORD_COUNT_NAMES, 
            'log_size'
        ]
    )
    if labels is not None:
        res = res.merge(labels, on='id', how='left')

    return res

# Load training/test data

In [3]:
df_logs = pd.read_csv('../input/linking-writing-processes-to-writing-quality/train_logs.csv')
df_logs_test = pd.read_csv('../input/linking-writing-processes-to-writing-quality/test_logs.csv')
df_labels = pd.read_csv('../input/linking-writing-processes-to-writing-quality/train_scores.csv')
df_logs.shape, df_labels.shape, df_logs_test.shape

((8405898, 11), (2471, 2), (6, 11))

# Train data

In [4]:
df_agg = prepare_data(df=df_logs, labels=df_labels)
print(df_agg.shape)
df_agg.head(n=10)

100%|██████████| 2471/2471 [00:03<00:00, 654.33it/s]


(2471, 16)


Unnamed: 0,id,action_time_max,action_time_mean,action_time_std,nonproduction,input,remove/cut,paste,replace,move,alphanum,other,word_count_max,word_count_last,log_size,score
0,001519c8,2259,116.246774,91.797374,0.04693,0.786077,0.163082,0.0,0.002738,0.001173,0.758702,0.241298,256,255,2557,3.5
1,0022f953,1758,112.221271,55.431189,0.103504,0.789731,0.105949,0.000407,0.000407,0.0,0.691932,0.308068,323,320,2454,3.5
2,0042269b,3005,101.837766,82.383766,0.042311,0.849855,0.106141,0.0,0.001692,0.0,0.787476,0.212524,404,404,4136,6.0
3,0059420b,806,121.848329,113.768226,0.063625,0.838046,0.097044,0.000643,0.000643,0.0,0.736504,0.263496,206,206,1556,2.0
4,0075873a,701,123.943896,62.082013,0.028447,0.767286,0.204267,0.0,0.0,0.0,0.775978,0.224022,252,252,2531,4.0
5,0081af50,1102,81.404342,40.653054,0.034374,0.811398,0.152872,0.0,0.001357,0.0,0.783356,0.216644,275,275,2211,2.0
6,0093f095,501,109.717847,37.018331,0.019263,0.896884,0.083853,0.0,0.0,0.0,0.806799,0.193201,242,241,1765,4.5
7,009e23ab,803,90.755631,41.934952,0.065873,0.839354,0.094348,0.0,0.000425,0.0,0.728432,0.271568,308,307,2353,4.0
8,00e048f1,613,75.011356,33.244615,0.025868,0.899685,0.074448,0.0,0.0,0.0,0.782334,0.217666,223,223,1585,3.5
9,00e1f05a,11017,93.343215,198.89669,0.029134,0.785203,0.184769,0.0,0.000894,0.0,0.778559,0.221441,739,739,7826,4.5


# Test data

In [5]:
df_agg_test = prepare_data(df=df_logs_test)
print(df_agg_test.shape)
df_agg_test.head(n=10)

100%|██████████| 3/3 [00:00<00:00, 1241.29it/s]

(3, 15)





Unnamed: 0,id,action_time_max,action_time_mean,action_time_std,nonproduction,input,remove/cut,paste,replace,move,alphanum,other,word_count_max,word_count_last,log_size
0,0000aaaa,87,86.0,1.414214,0,1.0,0,0,0,0,0.0,1.0,0,0,2
1,2222bbbb,67,56.5,14.849242,0,1.0,0,0,0,0,1.0,0.0,1,1,2
2,4444cccc,94,75.0,26.870058,0,1.0,0,0,0,0,0.5,0.5,1,1,2


In [6]:
columns = df_agg.columns
features_names = columns[1:-1]
features_names

Index(['action_time_max', 'action_time_mean', 'action_time_std',
       'nonproduction', 'input', 'remove/cut', 'paste', 'replace', 'move',
       'alphanum', 'other', 'word_count_max', 'word_count_last', 'log_size'],
      dtype='object')

In [7]:
X_train = df_agg[features_names].values
y_train = df_agg['score'].values
X_train.shape, y_train.shape

((2471, 14), (2471,))

In [8]:
X_test = df_agg_test[features_names].values
X_test.shape

(3, 14)

# Train model

In [9]:
n_trials = 30

clf = xgb.XGBRegressor(eval_metric='rmse')
param_distributions = {
    'n_estimators': optuna.distributions.IntDistribution(2, 100),
    'max_depth': optuna.distributions.IntDistribution(2, 10),
    'learning_rate': optuna.distributions.FloatDistribution(0, 1)
}
optuna_search = optuna.integration.OptunaSearchCV(
    estimator=clf, 
    param_distributions=param_distributions,
    n_trials=n_trials
)
optuna_search.fit(X_train, y_train)
y_test = optuna_search.predict(X_test)

  optuna_search = optuna.integration.OptunaSearchCV(
[I 2023-10-25 05:38:44,656] A new study created in memory with name: no-name-3b6bac77-e53d-41bc-acd6-012bf0edf042
[I 2023-10-25 05:38:44,935] Trial 0 finished with value: 0.42938848424669196 and parameters: {'n_estimators': 41, 'max_depth': 5, 'learning_rate': 0.45681386970781623}. Best is trial 0 with value: 0.42938848424669196.
[I 2023-10-25 05:38:45,609] Trial 1 finished with value: 0.466680780436326 and parameters: {'n_estimators': 80, 'max_depth': 8, 'learning_rate': 0.15390429435582698}. Best is trial 1 with value: 0.466680780436326.
[I 2023-10-25 05:38:45,733] Trial 2 finished with value: 0.401450291868039 and parameters: {'n_estimators': 13, 'max_depth': 8, 'learning_rate': 0.16962792226678025}. Best is trial 1 with value: 0.466680780436326.
[I 2023-10-25 05:38:46,504] Trial 3 finished with value: 0.3116780899182718 and parameters: {'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.7790305883725153}. Best is trial 1 wit

In [10]:
df_agg_test['score'] = y_test
df_agg_test.head()

Unnamed: 0,id,action_time_max,action_time_mean,action_time_std,nonproduction,input,remove/cut,paste,replace,move,alphanum,other,word_count_max,word_count_last,log_size,score
0,0000aaaa,87,86.0,1.414214,0,1.0,0,0,0,0,0.0,1.0,0,0,2,1.252428
1,2222bbbb,67,56.5,14.849242,0,1.0,0,0,0,0,1.0,0.0,1,1,2,1.845953
2,4444cccc,94,75.0,26.870058,0,1.0,0,0,0,0,0.5,0.5,1,1,2,1.252428


In [11]:
df_agg_test[['id', 'score']].to_csv('submission.csv', index=False)