In [1]:
import math
from typing import List, Optional

import optuna
import pandas as pd
import xgboost as xgb
from tqdm import tqdm

In [2]:
ACTIVITY_NAMES = [
    'nonproduction',
    'input',
    'remove/cut',
    'paste',
    'replace',
    'move'
]
ACTIVITY2IDX = dict(zip(ACTIVITY_NAMES, range(len(ACTIVITY_NAMES))))
TEXT_CHANGE_NAMES = ['alphanum', 'other']
TEXT_CHANGE2IDX = dict(zip(TEXT_CHANGE_NAMES, range(len(TEXT_CHANGE_NAMES))))
ACTION_TIME_COLS = [
    'action_time_max_log',
    'action_time_mean_log',
    'action_time_std_log'
]


def process_activity(data: pd.Series) -> List[float]:
    """Count activity frequencies

    Parameters
    ----------
    data : pd.Series
        activity data

    Returns
    -------
    List[float]
        activity frequencies
    """

    activity = data.apply(lambda x: 'move' if 'Move' in x else x.lower())
    activity2freq = activity.value_counts(normalize=True)
    res = [0] * len(ACTIVITY_NAMES)
    for name, freq in activity2freq.items():
        res[ACTIVITY2IDX[name]] = freq

    return res


def process_text_change(data: pd.Series) -> List[float]:
    text_change = data.apply(lambda x: 'alphanum' if x == 'q' else 'other')
    text_change2freq = text_change.value_counts(normalize=True)
    res = [0] * len(TEXT_CHANGE_NAMES)
    for name, freq in text_change2freq.items():
        res[TEXT_CHANGE2IDX[name]] = freq

    return res


def prepare_data(
    df: pd.DataFrame,
    labels: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """Prepare feature matrix and (optional) labels for a given portion of data

    Parameters
    ----------
    df : pd.DataFrame
        Events DataFrame
    labels : Optional[pd.DataFrame], optional
        Labels DataFrame, by default None

    Returns
    -------
    Tuple[np.ndarray, Optional[np.ndarray]]
        X, y
    """

    data = []
    n_groups = df['id'].nunique()
    for log_id, group in tqdm(df.groupby('id'), total=n_groups):
        tmp = []

        # id
        tmp.append(log_id)

        # action_time
        tmp.extend([
            math.log(group['action_time'].max() + 1),
            math.log(group['action_time'].mean() + 1),
            math.log(group['action_time'].std() + 1)
        ])

        # activity
        tmp.extend(process_activity(data=group['activity']))

        # text_change
        tmp.extend(process_text_change(data=group['text_change']))

        data.append(tmp)
    res = pd.DataFrame(
        data=data,
        columns=['id', *ACTION_TIME_COLS, *ACTIVITY_NAMES, *TEXT_CHANGE_NAMES]
    )
    if labels is not None:
        res = res.merge(labels, on='id', how='left')

    return res

# Load training/test data

In [3]:
df_logs = pd.read_csv('../input/linking-writing-processes-to-writing-quality/train_logs.csv')
df_logs_test = pd.read_csv('../input/linking-writing-processes-to-writing-quality/test_logs.csv')
df_labels = pd.read_csv('../input/linking-writing-processes-to-writing-quality/train_scores.csv')
df_logs.shape, df_labels.shape, df_logs_test.shape

((8405898, 11), (2471, 2), (6, 11))

# Train data

In [4]:
df_agg = prepare_data(df=df_logs, labels=df_labels)
print(df_agg.shape)
df_agg.head(n=10)

100%|██████████| 2471/2471 [00:03<00:00, 718.48it/s]


(2471, 13)


Unnamed: 0,id,action_time_max_log,action_time_mean_log,action_time_std_log,nonproduction,input,remove/cut,paste,replace,move,alphanum,other,score
0,001519c8,7.72312,4.764281,4.530418,0.04693,0.786077,0.163082,0.0,0.002738,0.001173,0.758702,0.241298,3.5
1,0022f953,7.472501,4.729344,4.033022,0.103504,0.789731,0.105949,0.000407,0.000407,0.0,0.691932,0.308068,3.5
2,0042269b,8.008366,4.633153,4.423454,0.042311,0.849855,0.106141,0.0,0.001692,0.0,0.787476,0.212524,6.0
3,0059420b,6.693324,4.81095,4.742915,0.063625,0.838046,0.097044,0.000643,0.000643,0.0,0.736504,0.263496,2.0
4,0075873a,6.553933,4.827865,4.144436,0.028447,0.767286,0.204267,0.0,0.0,0.0,0.775978,0.224022,4.0
5,0081af50,7.005789,4.411638,3.729375,0.034374,0.811398,0.152872,0.0,0.001357,0.0,0.783356,0.216644,2.0
6,0093f095,6.2186,4.706985,3.638068,0.019263,0.896884,0.083853,0.0,0.0,0.0,0.806799,0.193201,4.5
7,009e23ab,6.689599,4.519129,3.759686,0.065873,0.839354,0.094348,0.0,0.000425,0.0,0.728432,0.271568,4.0
8,00e048f1,6.419995,4.330883,3.533529,0.025868,0.899685,0.074448,0.0,0.0,0.0,0.782334,0.217666,3.5
9,00e1f05a,9.307286,4.546939,5.297801,0.029134,0.785203,0.184769,0.0,0.000894,0.0,0.778559,0.221441,4.5


# Test data

In [5]:
df_agg_test = prepare_data(df=df_logs_test)
print(df_agg_test.shape)
df_agg_test.head(n=10)

100%|██████████| 3/3 [00:00<00:00, 1309.49it/s]

(3, 12)





Unnamed: 0,id,action_time_max_log,action_time_mean_log,action_time_std_log,nonproduction,input,remove/cut,paste,replace,move,alphanum,other
0,0000aaaa,4.477337,4.465908,0.881374,0,1.0,0,0,0,0,0.0,1.0
1,2222bbbb,4.219508,4.051785,2.763122,0,1.0,0,0,0,0,1.0,0.0
2,4444cccc,4.553877,4.330733,3.327553,0,1.0,0,0,0,0,0.5,0.5


In [6]:
columns = df_agg.columns
features_names = columns[1:-1]
features_names

Index(['action_time_max_log', 'action_time_mean_log', 'action_time_std_log',
       'nonproduction', 'input', 'remove/cut', 'paste', 'replace', 'move',
       'alphanum', 'other'],
      dtype='object')

In [7]:
X_train = df_agg[features_names].values
y_train = df_agg['score'].values
X_train.shape, y_train.shape

((2471, 11), (2471,))

In [8]:
X_test = df_agg_test[features_names].values
X_test.shape

(3, 11)

# Train model

In [9]:
n_trials = 30

clf = xgb.XGBRegressor(eval_metric='rmse')
param_distributions = {
    'n_estimators': optuna.distributions.IntDistribution(2, 100),
    'max_depth': optuna.distributions.IntDistribution(2, 10),
    'learning_rate': optuna.distributions.FloatDistribution(0, 1)
}
optuna_search = optuna.integration.OptunaSearchCV(
    estimator=clf, 
    param_distributions=param_distributions,
    n_trials=n_trials
)
optuna_search.fit(X_train, y_train)
y_test = optuna_search.predict(X_test)

  optuna_search = optuna.integration.OptunaSearchCV(
[I 2023-10-25 05:18:51,549] A new study created in memory with name: no-name-336f6fd1-bbc5-4b5f-b3be-0231716226c5
[I 2023-10-25 05:18:52,220] Trial 0 finished with value: 0.04144282997931044 and parameters: {'n_estimators': 57, 'max_depth': 10, 'learning_rate': 0.39044410899239623}. Best is trial 0 with value: 0.04144282997931044.
[I 2023-10-25 05:18:52,824] Trial 1 finished with value: 0.08289603699019395 and parameters: {'n_estimators': 84, 'max_depth': 7, 'learning_rate': 0.36690593016034145}. Best is trial 1 with value: 0.08289603699019395.
[I 2023-10-25 05:18:53,038] Trial 2 finished with value: 0.059735732342631076 and parameters: {'n_estimators': 72, 'max_depth': 3, 'learning_rate': 0.6069799700136622}. Best is trial 1 with value: 0.08289603699019395.
[I 2023-10-25 05:18:53,213] Trial 3 finished with value: 0.18586751354292705 and parameters: {'n_estimators': 35, 'max_depth': 4, 'learning_rate': 0.30093550176915673}. Best is t

In [10]:
df_agg_test['score'] = y_test
df_agg_test.head()

Unnamed: 0,id,action_time_max_log,action_time_mean_log,action_time_std_log,nonproduction,input,remove/cut,paste,replace,move,alphanum,other,score
0,0000aaaa,4.477337,4.465908,0.881374,0,1.0,0,0,0,0,0.0,1.0,2.26124
1,2222bbbb,4.219508,4.051785,2.763122,0,1.0,0,0,0,0,1.0,0.0,1.940591
2,4444cccc,4.553877,4.330733,3.327553,0,1.0,0,0,0,0,0.5,0.5,2.446241


In [11]:
df_agg_test[['id', 'score']].to_csv('submission.csv', index=False)