In [None]:
# Run some setup code for this notebook.
import pandas as pd

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2


In [None]:
def load_data(path):
    """Read a JSON file and lift the nested ``user`` records into top-level columns.

    Args:
        path: Anything ``pd.read_json`` accepts (file path or file-like object).

    Returns:
        A DataFrame holding every original column except ``user``, followed by
        one column per key found in the ``user`` records. Names that occur both
        at the top level and inside ``user`` are kept twice (no de-duplication).
    """
    raw = pd.read_json(path)
    # Each entry of "user" is turned into a Series, so the keys become columns.
    user_fields = raw["user"].apply(pd.Series)
    remainder = raw.drop("user", axis=1)
    return pd.concat([remainder, user_fields], axis=1)


# Load both JSON files through load_data. `df` comes from train.json and
# `df2fill` from test.json (the frame the final predictions are made for).
df = load_data("./data/train.json")
df2fill = load_data("./data/test.json")

# Sanity check: (rows, columns) of each frame, then a preview of the first rows.
print(df.shape, df2fill.shape)
df.head()

## 处理自然语言信息

In [None]:
from transformers import BertTokenizer, BertModel, BertConfig
import numpy as np

import os

# Directory holding the pretrained BERT config, vocabulary and weights.
# The location can be overridden with the BERT_MODEL_PATH environment variable,
# so the notebook is not tied to one machine; the default is the original path.
MODEL_PATH = os.environ.get("BERT_MODEL_PATH", "D:/models/bert-case-based/")

# All three objects are loaded from the same directory.
config = BertConfig.from_pretrained(MODEL_PATH)
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertModel.from_pretrained(MODEL_PATH)


def str_embedding(s: str, max_len: int):
    """Embed one string with the module-level BERT ``tokenizer`` and ``model``.

    The text is tokenized, truncated/padded to exactly ``max_len`` tokens, run
    through the model, and the first element of the model output is flattened
    into a 1-D vector (presumably the last hidden state of shape
    (1, max_len, hidden) - confirm against the transformers version in use).

    Args:
        s: Text to embed.
        max_len: Token length the input is truncated or padded to.

    Returns:
        pd.Series with the flattened output for this string.
    """
    # Local import, in line with the function-scope imports used in this notebook.
    import torch

    encoded = tokenizer(
        s,
        max_length=max_len,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # Pure inference: disabling autograd avoids building a graph for every row.
    with torch.no_grad():
        output = model(
            encoded["input_ids"], attention_mask=encoded["attention_mask"]
        )[0]
    return pd.Series(output.squeeze(0).numpy().flatten())

def str_entropy(s: str):
    """Return the Shannon entropy, in bits, of the character distribution of ``s``.

    Every distinct character contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency in ``s``. An empty string yields ``0.0``.
    """
    from collections import Counter

    total = len(s)
    weighted_logs = (
        (occurrences / total) * np.log2(occurrences / total)
        for occurrences in Counter(s).values()
    )
    # Subtracting from 0.0 (rather than negating) keeps the zero result as +0.0.
    return 0.0 - sum(weighted_logs, 0.0)


# Text-feature settings read by process_str: column name -> (mxlen, pca_dim).
#   mxlen   - passed to str_embedding as max_len (token length of the input)
#   pca_dim - number of PCA components kept from the embedding
# An mxlen of -1 means the column gets no embedding features; it still gets
# the "<column>_len" and "<column>_ent" columns.
arg_map = {
    "name": (-1, -1),
    "screen_name": (-1, -1),
    "location": (5, 5),
    "description": (25, 7),
}


def process_str(d: pd.DataFrame):
    """Append text-derived feature columns for every column listed in ``arg_map``.

    For each configured column ``c`` the result gains:
      * ``c_len`` - ``len`` of each value,
      * ``c_ent`` - ``str_entropy`` of each value,
      * ``c0 .. c{pca_dim-1}`` - only when ``mxlen != -1``: the ``str_embedding``
        vectors reduced by a PCA fitted on all rows of ``d``.

    The frame shape is printed before the first and after every column as a
    progress trace.

    Args:
        d: Frame containing the text columns named in ``arg_map``.

    Returns:
        A new DataFrame with the feature columns appended; the row index of
        ``d`` is preserved.
    """
    # Imported once here instead of on every loop pass.
    from sklearn.decomposition import PCA

    print(d.shape)
    for name, (mxlen, pca_dim) in arg_map.items():
        text = d[name]
        length_col = text.apply(len).rename(name + "_len")
        entropy_col = text.apply(str_entropy).rename(name + "_ent")
        d = pd.concat([d, length_col, entropy_col], axis=1)

        if mxlen != -1:
            embedded = text.apply(str_embedding, args=(mxlen,))
            reduced = pd.DataFrame(
                PCA(n_components=pca_dim).fit_transform(embedded),
                # Reuse d's index: concat(axis=1) aligns on labels, so a fresh
                # RangeIndex would misalign whenever d is not indexed 0..n-1.
                index=d.index,
                columns=[name + str(i) for i in range(pca_dim)],
            )
            d = pd.concat([d, reduced], axis=1)
        print(d.shape)
    return d


def process2str(a: pd.DataFrame, b: pd.DataFrame):
    """Run ``process_str`` on ``a`` and ``b`` stacked together, then split them again.

    Both frames are processed as one so that they share the same fitted
    transformations inside ``process_str``.

    Args:
        a: First frame; its rows come first in the stacked frame.
        b: Second frame.

    Returns:
        ``(a_features, b_features)``: the first ``len(a)`` rows and the
        remaining rows of the processed frame. The rows carry the renumbered
        0..n-1 index of the stacked frame.
    """
    n_first = len(a)
    combined = pd.concat([a, b], ignore_index=True)
    featured = process_str(combined)
    # Independent copies: later cells modify these frames in place, which on
    # two slices of one parent frame triggers pandas' chained-assignment
    # warnings and ties both results to a shared buffer.
    return featured.iloc[:n_first].copy(), featured.iloc[n_first:].copy()


df, df2fill = process2str(df, df2fill)

## 数据预处理

In [None]:
def preprocess(d: pd.DataFrame):
    """Turn a raw account frame into model-ready columns.

    Steps, in order:
      1. Drop identifier/time-zone columns and the raw text/URL columns.
      2. Drop the first remaining column (presumably the record-level
         ``created_at`` that duplicates the user-level one - confirm against
         the JSON layout).
      3. Convert ``created_at`` to days since the Unix epoch (whole seconds
         divided by 86400).
      4. Replace each hex colour column by three integer columns
         ``<name>_r``, ``<name>_g``, ``<name>_b`` appended at the end.
      5. Lower-case ``lang``.

    Args:
        d: Raw frame. It is not modified.

    Returns:
        A new DataFrame with the transformations applied.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a colour value is not a hexadecimal string.
    """
    identifier_cols = ["id", "id_str", "utc_offset", "time_zone"]
    text_and_url_cols = [
        "name",
        "screen_name",
        "location",
        "description",
        "url",
        "entities",
        "profile_background_image_url",
        "profile_background_image_url_https",
        "profile_image_url",
        "profile_image_url_https",
        "profile_banner_url",
    ]
    # Non-inplace drop, so the caller's frame keeps all of its columns.
    d = d.drop(columns=identifier_cols + text_and_url_cols)

    # Remove the first column by position. It cannot be dropped by name because
    # that name may occur twice in the frame.
    d = d.iloc[:, 1:].copy()

    seconds_per_day = 24 * 60 * 60
    # Timestamp.value is nanoseconds since the epoch.
    d["created_at"] = pd.to_datetime(d["created_at"]).apply(
        lambda ts: ts.value // 10**9 / seconds_per_day
    )

    def col2rgb(s: str):
        """Split a hex colour string such as "FF8000" into its R, G, B integers."""
        x = int(s, base=16)
        return pd.Series([x // (256 * 256), (x // 256) % 256, x % 256])

    for name in [
        "profile_background_color",
        "profile_link_color",
        "profile_sidebar_border_color",
        "profile_sidebar_fill_color",
        "profile_text_color",
    ]:
        d_rgb = d[name].apply(col2rgb)
        d_rgb.columns = [name + "_r", name + "_g", name + "_b"]
        d = pd.concat([d.drop([name], axis=1), d_rgb], axis=1)

    # .str.lower() leaves missing values missing instead of raising on them.
    d["lang"] = d["lang"].str.lower()

    return d


# Binary target: the "human" indicator column of the one-hot encoded label.
df_label = pd.get_dummies(df["label"])["human"]
# Remove the raw label column from both frames so it is not used as a feature.
df.drop(["label"], axis=1, inplace=True)
df2fill.drop(["label"], axis=1, inplace=True)


def dummy2(a: pd.DataFrame, b: pd.DataFrame):
    """One-hot encode ``a`` and ``b`` together so both get the same dummy columns.

    The frames are stacked, passed through ``pd.get_dummies`` once, and split
    back at ``len(a)``.

    Returns:
        ``(a_encoded, b_encoded)`` with identical column sets; each part keeps
        the row labels it had in its input frame.
    """
    stacked = pd.concat([a, b])
    encoded = pd.get_dummies(stacked)
    split_at = len(a)
    first_part = encoded[:split_at]
    second_part = encoded[split_at:]
    return first_part, second_part


# Apply the same column-level preprocessing to both frames.
df = preprocess(df)
df2fill = preprocess(df2fill)

# One-hot encode both frames together so they end up with identical columns.
df, df2fill = dummy2(df, df2fill)

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only added a stray "None" line after each report.
df.info()
df2fill.info()

In [None]:
# Preview the first rows of the preprocessed, one-hot encoded frame.
df.head()

## 数据探索

In [None]:
# Class balance of the target: number of rows per value of the "human" indicator.
df_label.value_counts()

In [None]:
# Numeric columns that the following cells summarise, transform and plot.
continuous_attrs = [
    "created_at",
    "followers_count",
    "friends_count",
    "listed_count",
    "favourites_count",
    "statuses_count",
]

# Summary statistics of these columns in `df`.
df[continuous_attrs].describe()

In [None]:
# Rescale the continuous columns of both frames identically:
#   1. log10(1 + x) on every column in continuous_attrs,
#   2. square the (already logged) statuses_count,
#   3. raise the (already logged) favourites_count to the power 1.35.
# NOTE: the frames are overwritten in place, so running this cell twice applies
# the transforms twice.
for frame in (df, df2fill):
    for col in continuous_attrs:
        frame[col] = np.log10(1 + frame[col])
    frame["statuses_count"] = np.square(frame["statuses_count"])
    frame["favourites_count"] = np.power(frame["favourites_count"], 1.35)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

# One histogram (with KDE) per continuous attribute on a 2x3 grid, coloured by label.
fig, axs_hist = plt.subplots(2, 3, figsize=(12, 8))

# axs_hist.flat walks the grid row by row, matching the order of continuous_attrs.
for ax, s in zip(axs_hist.flat, continuous_attrs):
    sns.histplot(data=df, x=s, hue=df_label, kde=True, ax=ax)

# savefig does not create missing directories; make sure the target exists.
Path("image").mkdir(parents=True, exist_ok=True)
plt.savefig("image/hist.png", dpi=300, bbox_inches="tight", facecolor="white")

In [None]:
from pathlib import Path

# Annotated correlation matrix of the continuous attributes.
plt.figure(figsize=(7, 5))
sns.heatmap(df[continuous_attrs].corr(), annot=True)
# savefig does not create missing directories; make sure the target exists.
Path("image").mkdir(parents=True, exist_ok=True)
plt.savefig("image/heatmap.png", dpi=300, bbox_inches="tight", facecolor="white")

## 模型训练

In [None]:
# Convert the label Series to a plain NumPy array before it is split in make_data().
df_label = np.array(df_label)
# NOTE(review): `global` at module level has no effect - these names are already
# module globals. The lines only list the variables that make_data() assigns.
global X_train, X_test, y_train, y_test
global df2fill_scaled


def make_data(random_state=None):
    """Create a fresh train/test split and the scaled prediction matrix.

    Assigns the module-level names ``X_train``, ``X_test``, ``y_train``,
    ``y_test`` and ``df2fill_scaled``:

      1. 80/20 split of ``df`` / ``df_label``.
      2. Outlier removal on the training part with ``EllipticEnvelope``
         (5% contamination). Flagged rows are dropped together with their
         labels.
      3. ``MinMaxScaler`` fitted on the cleaned training data and applied to
         the training part, the test part and ``df2fill``.

    Earlier, flagged rows were set entirely to NaN and KNN-imputed. A row with
    no observed feature has no defined distance to any neighbour, so the
    imputer filled it with the per-column training mean. Every flagged row
    therefore became the same mean vector while keeping its original label,
    which only added label noise.

    Args:
        random_state: Optional seed forwarded to the split and the outlier
            detector. ``None`` (default) keeps the previous unseeded behaviour.

    Returns:
        None. Results are published through the module-level names above.
    """
    from sklearn.covariance import EllipticEnvelope
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    global X_train, X_test, y_train, y_test
    global df2fill_scaled

    X_train, X_test, y_train, y_test = map(
        np.array,
        train_test_split(df, df_label, test_size=0.2, random_state=random_state),
    )

    detector = EllipticEnvelope(contamination=0.05, random_state=random_state)
    detector.fit(X_train)
    # predict() marks outliers with -1; keep features and labels in lockstep.
    inlier_mask = detector.predict(X_train) != -1
    X_train = X_train[inlier_mask]
    y_train = y_train[inlier_mask]

    # The scaler is fitted on training data only, then reused everywhere else.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    df2fill_scaled = scaler.transform(df2fill.values)


make_data()

### 逻辑回归

In [None]:
def LR_solve():
    """Fit logistic regression (saga solver, up to 1000 iterations) on the current split.

    Reads the module-level ``X_train``/``y_train`` for fitting and returns the
    value of ``score`` on ``X_test``/``y_test``.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(solver="saga", max_iter=1000)
    classifier.fit(X_train, y_train)
    return classifier.score(X_test, y_test)


LR_solve()

### 支持向量机

In [None]:
def LSVC_solve():
    """Fit a linear support-vector classifier (up to 2000 iterations) on the current split.

    Reads the module-level ``X_train``/``y_train`` for fitting and returns the
    value of ``score`` on ``X_test``/``y_test``.
    """
    from sklearn.svm import LinearSVC

    classifier = LinearSVC(max_iter=2000)
    classifier.fit(X_train, y_train)
    return classifier.score(X_test, y_test)


LSVC_solve()

### 随机森林

In [None]:
def RF_solve():
    """Fit a 256-tree random forest on the current split.

    Reads the module-level ``X_train``/``y_train`` for fitting and returns the
    value of ``score`` on ``X_test``/``y_test``.
    """
    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(n_estimators=256)
    forest.fit(X_train, y_train)
    return forest.score(X_test, y_test)


RF_solve()

### 梯度提升树

In [None]:
def GB_solve():
    """Fit a gradient-boosting classifier with 256 estimators on the current split.

    Reads the module-level ``X_train``/``y_train`` for fitting and returns the
    value of ``score`` on ``X_test``/``y_test``.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    booster = GradientBoostingClassifier(n_estimators=256)
    booster.fit(X_train, y_train)
    return booster.score(X_test, y_test)


GB_solve()

## 模型比较

In [None]:
# Display name -> function that trains the model on the current split and
# returns its score on the held-out part.
methods = {
    "Logistic Regression": LR_solve,
    "Linear SVC": LSVC_solve,
    "Random Forest": RF_solve,
    "Gradient Boosting": GB_solve,
}
accs = {name: [] for name in methods}

# Five independent re-splits; every model is scored on each of them. The loop
# variable is not called `iter`, which would shadow the builtin for the rest
# of the session.
for _round in range(5):
    make_data()
    for name, func in methods.items():
        accs[name].append(func())
        print(accs[name][-1], end=" ")
    print()

# Build the summary in one go instead of appending rows to an empty frame,
# which gives the Accuracy column a proper float dtype.
compare = pd.DataFrame(
    {
        "Model": list(methods),
        "Accuracy": [np.average(accs[name]) * 100 for name in methods],
    }
)

compare.sort_values(by="Accuracy", ascending=False)

## 模型验证

In [None]:

from sklearn.ensemble import RandomForestClassifier

# Draw a fresh split, then fit the random forest that the next cell uses for
# the predictions on df2fill.
make_data()
RF_model = RandomForestClassifier(n_estimators=256)
RF_model.fit(X_train, y_train)
# Score of this particular fit on the held-out part of the split.
print(RF_model.score(X_test, y_test))

In [None]:
# Predict the target for the scaled df2fill features produced by make_data();
# a truthy value is later written out as "human", a falsy one as "bot".
pred = RF_model.predict(df2fill_scaled)
pred

In [None]:
import json

# Read with an explicit UTF-8 encoding: without it open() uses the platform
# default, which can fail or mis-decode non-ASCII text in the JSON.
with open("./data/rawtest.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Predictions are matched to records purely by position, so the counts must agree.
if len(data) != len(pred):
    raise ValueError(
        f"rawtest.json has {len(data)} records but there are {len(pred)} predictions"
    )

for record, is_human in zip(data, pred):
    record["label"] = "human" if is_human else "bot"

In [None]:
# with open("./data/test.json", "w") as file:
#     json.dump(data, file, indent=4)