In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields one (directory, subdirectories, files) triple per directory under
# '/kaggle/input'. The subdirectory list is ignored because walk descends into
# those directories on its own; every file is printed with its full path.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/you-are-bot/sample_submission.csv
/kaggle/input/you-are-bot/train.json
/kaggle/input/you-are-bot/test.json
/kaggle/input/you-are-bot/ytrain.csv
/kaggle/input/you-are-bot/ytest.csv


In [96]:
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer

In [53]:
def load_train_data(data_file: str, labels_file: str):
    """Assemble the per-participant training table from dialogs and labels.

    Every dialog in ``data_file`` contributes exactly two rows, first for
    participant 0 and then for participant 1, in the order the dialogs appear
    in the JSON object.

    Args:
        data_file: Path to a JSON file mapping each dialog id to a list of
            message dicts with "participant_index" ("0" or "1") and "text".
        labels_file: Path to a CSV with "dialog_id", "participant_index" and
            "is_bot" columns.

    Returns:
        DataFrame with columns "text" (the participant's messages joined by a
        single space, "" if there are none), "participant_index" (0 or 1) and
        "is_bot".

    Raises:
        KeyError: if a dialog has no participant-0 row in ``labels_file``.
    """
    label_table = pd.read_csv(labels_file)
    # Only participant 0's rows are consulted; participant 1's label is derived
    # below as the complement (this presumes one bot and one human per dialog).
    first_rows = label_table[label_table["participant_index"] == 0]
    first_is_bot = dict(zip(first_rows["dialog_id"], first_rows["is_bot"]))

    with open(data_file, "r", encoding="utf-8") as handle:
        dialogs = json.load(handle)

    texts, speakers, targets = [], [], []
    for dialog_id, turns in dialogs.items():
        # Message-level participant indices are strings in the JSON payload.
        spoken = {
            tag: [turn["text"] for turn in turns if turn["participant_index"] == tag]
            for tag in ("0", "1")
        }

        label_0 = int(first_is_bot[dialog_id])
        joined = [" ".join(spoken["0"]), " ".join(spoken["1"])]

        texts.extend(joined)
        speakers.extend([0, 1])
        targets.extend([label_0, 1 - label_0])

    return pd.DataFrame(
        {"text": texts, "participant_index": speakers, "is_bot": targets}
    )


def load_test_data(data_file: str, labels_file: str):
    """Build one row of concatenated message text per requested test participant.

    Args:
        data_file: Path to a JSON file mapping each dialog id to a list of
            message dicts with "participant_index" (a string such as "0") and
            "text".
        labels_file: Path to a CSV with "ID", "dialog_id" and
            "participant_index" columns. One output row is produced per CSV
            row, in the same order.

    Returns:
        DataFrame with columns "ID", "text" (the participant's messages joined
        by a single space, "" if there are none) and "participant_index".
        The index is stored as an int so the column has the same dtype as the
        one produced by ``load_train_data`` and can feed the same model
        without relying on implicit string-to-number coercion.

    Raises:
        KeyError: if a dialog id listed in ``labels_file`` is missing from
            ``data_file``.
    """
    df_info = pd.read_csv(labels_file)

    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    all_texts = []
    ids = []
    part_indicies = []

    # Zip only the columns that are needed instead of iterrows(), which builds
    # a full Series per row and may change value dtypes along the way.
    rows = zip(df_info["ID"], df_info["dialog_id"], df_info["participant_index"])
    for row_id, dialog_id, participant_index in rows:
        # Messages carry the participant index as a string, so compare as text.
        part_key = str(participant_index)
        messages = data[dialog_id]

        texts = [
            m["text"] for m in messages if m["participant_index"] == part_key
        ]
        all_texts.append(" ".join(texts))
        ids.append(row_id)
        part_indicies.append(int(participant_index))

    df = pd.DataFrame({"ID": ids, "text": all_texts, "participant_index": part_indicies})
    return df

In [117]:
def main(full=False, submission_name="preds", data_dir="/kaggle/input/you-are-bot"):
    """Train a TF-IDF + logistic-regression bot detector and write a submission CSV.

    Args:
        full: If True, fit on every training row and skip validation. If False,
            hold out a stratified 20% of the rows, fit on the remaining 80%,
            and print accuracy, ROC AUC and log loss on the held-out part.
            In both cases the fitted model then scores the test set, so with
            ``full=False`` the submission comes from the 80% model.
        submission_name: Stem of the output file; predictions are written to
            "<submission_name>.csv" in the current working directory.
        data_dir: Directory holding train.json, ytrain.csv, test.json and
            ytest.csv.

    Returns:
        None. The side effects are the printed metrics and the written CSV
        with columns "ID" and "is_bot" (predicted probability of class 1).
    """
    train_df = load_train_data(
        "{}/train.json".format(data_dir),
        "{}/ytrain.csv".format(data_dir),
    )
    X = train_df[["text", "participant_index"]]
    y = train_df["is_bot"]

    features = ColumnTransformer(
        transformers=[
            # A scalar column name (not a list) makes ColumnTransformer pass the
            # column as 1-D, which is what TfidfVectorizer expects. The previous
            # DataFrame.squeeze() collapsed a one-row frame into a bare string.
            # The TF-IDF matrix is also left sparse instead of being densified;
            # metrics may differ from a dense fit in the last digits.
            ("text", TfidfVectorizer(), "text"),
            # "passthrough" forwards the column unchanged without a lambda,
            # which also keeps the fitted pipeline picklable.
            ("identity", "passthrough", ["participant_index"]),
        ]
    )
    pipe = Pipeline(
        [
            ("vectorizer", features),
            ("model", LogisticRegression(random_state=42)),
        ]
    )

    if full:
        pipe.fit(X, y)
    else:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        pipe.fit(X_train, y_train)

        val_pred = pipe.predict(X_val)
        val_proba = pipe.predict_proba(X_val)
        print("Val Accuracy:", accuracy_score(y_val, val_pred))
        print("Val ROC AUC:", roc_auc_score(y_val, val_proba[:, 1]))
        print("Val Log Loss:", log_loss(y_val, val_proba))

    submit_df = load_test_data(
        "{}/test.json".format(data_dir),
        "{}/ytest.csv".format(data_dir),
    )
    # Give participant_index the same integer dtype the model was trained on,
    # whatever representation the loader returned.
    X_submit = submit_df[["text", "participant_index"]].astype(
        {"participant_index": int}
    )
    test_proba = pipe.predict_proba(X_submit)[:, 1]

    preds_df = pd.DataFrame({"ID": submit_df["ID"], "is_bot": test_proba})
    preds_df.to_csv("{}.csv".format(submission_name), index=False)

In [118]:
# Hold-out run: fits on 80% of the training rows, prints the validation metrics,
# and writes preds.csv from that model.
main()

Val Accuracy: 0.8253968253968254
Val ROC AUC: 0.856970087881964
Val Log Loss: 0.45388555272547276


In [119]:
# Final run: refits on all training rows (no metrics are printed) and overwrites
# preds.csv, since the default submission_name is reused.
main(full=True)

In [120]:
# Sanity check: read back the submission file that main() wrote into the working directory.
pd.read_csv("/kaggle/working/preds.csv")

Unnamed: 0,ID,is_bot
0,af36ac2aa9734738bbd533db8e5fb43a_0,0.097071
1,cdc2c5c605144c8e8dd5e9ea3d1352fc_0,0.134767
2,ed19efdedcb24600aea67c968aba5520_0,0.262444
3,f2ea031960cf4454b4596d94cbee021e_0,0.284162
4,d948808cda4944cd838f88308a9ecd8b_0,0.065628
...,...,...
671,23ce3b6cf164467386e2b34db908dbc3_1,0.768632
672,4dad8117d3c946ef9c021aac9e5ded02_1,0.735312
673,8e822ce1089741febae586c5fef99124_1,0.856258
674,56201a8ac9c64665aa6d236dbc79daf4_1,0.933101
