In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install gensim==4.2.0

In [19]:
import os
import tqdm
from typing import Dict, List
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import lightgbm as lgb

from gensim.models import FastText, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import datapath
from gensim import utils

import warnings
warnings.filterwarnings("ignore")

In [None]:
!unzip "drive/MyDrive/ml_project/texts.zip" -d "drive/MyDrive/ml_project/texts"

In [3]:
PROJECT_DIR = Path("drive/MyDrive/ml_project/")

In [4]:
!ls $PROJECT_DIR

data  model.txt  predict.csv  texts  texts.zip


In [5]:
csv_dir = PROJECT_DIR / "data/"
train_path = csv_dir / "train_groups.csv"
test_path = csv_dir / "test_groups.csv"
texts_dir = PROJECT_DIR / "texts/parsed"

In [6]:
train_groups = pd.read_csv(train_path)
print(f"train shape: {train_groups.shape}")

test_df = pd.read_csv(test_path)
print(f"test shape: {test_df.shape}")

train_groups.head()

train shape: (11690, 4)
test shape: (16627, 3)


Unnamed: 0,pair_id,group_id,doc_id,target
0,1,1,15731,0
1,2,1,14829,0
2,3,1,15764,0
3,4,1,17669,0
4,5,1,14852,0


In [7]:
val_size = 0.2
group_thres = int(len(train_groups["group_id"].unique()) * (1 - val_size))
group_thres

103

In [8]:
train_df = train_groups.loc[train_groups["group_id"] <= group_thres]
val_df = train_groups.loc[train_groups["group_id"] > group_thres]

# Создание нового датасета

In [None]:
tfidf = TfidfVectorizer(input='filename').fit_transform(group_texts).todense()
counts = CountVectorizer(input='filename').fit_transform(group_texts).todense()

# getting top_n lowest and highest features
counts_features = get_top_features(counts, top=n_counts)
words_features = get_top_features(tfidf, top=n_words)

# getting top_n lowest and highest cosine similarities between documents
counts_cosine_features = get_top_cosine_features(counts, top=top_docs)
tfidf_cosine_features = get_top_cosine_features(tfidf, top=top_docs)

# row-wise proportion of zeros in tfidf
tfidf_zeros_proportion = np.apply_along_axis(zeros_prop, 1, tfidf)

features = np.hstack([
    counts_features,
    words_features,

    counts_cosine_features,
    tfidf_cosine_features,
    tfidf_zeros_proportion.reshape(-1, 1)
])

In [65]:
def get_texts_fnames(folder: Path) -> Dict[int, str]:
    files = (fname for fname in os.listdir(folder) if fname.endswith('.txt'))
    res = {}
    for fname in tqdm.tqdm(files):
        doc_id_str = fname.split('.')[0]
        res[int(doc_id_str)] = folder / fname
    return res


def zeros_prop(arr):
    new_arr = (arr > 1e-14).astype(int)
    return np.sum(new_arr == 0) / len(new_arr)


def get_top_features(matrix: np.ndarray, top: int):
    rows_sorted = np.sort(matrix, axis=1)
    top_features = np.hstack([rows_sorted[:, :top], rows_sorted[:, -top:]])
    return top_features


def get_top_cosine_features(matrix: np.ndarray, top: int):
    cosine_sim = cosine_similarity(matrix)
    cosine_sim -= np.eye(matrix.shape[0])
    return get_top_features(cosine_sim, top)


class IterFiles:
    def __init__(self, files: List):
        self.files = files
    def __iter__(self):
        for file in self.files:
            with open(file, 'r', encoding='utf-8') as f:
                line = f.readline()
            yield line.split()


def get_dataset(
    df: pd.DataFrame, 
    texts: Dict[int, str],
    top_docs: int=20,
    n_counts: int=100,
    n_words: int=1000,
    has_target: bool=True
):
    X, y, groups = [], [], []
    for group_id in tqdm.tqdm(df["group_id"].unique()):
        group = df.loc[df["group_id"] == group_id]
        n_docs = group.shape[0]

        # vectorizers inference
        group_texts = [texts[doc_id] for doc_id in group["doc_id"]]
        tfidf = TfidfVectorizer(input='filename').fit_transform(group_texts).todense()
        counts = CountVectorizer(input='filename').fit_transform(group_texts).todense()

        # getting top_n lowest and highest features
        counts_features = get_top_features(counts, top=n_counts)
        words_features = get_top_features(tfidf, top=n_words)

        # getting top_n lowest and highest cosine similarities between documents
        counts_cosine_features = get_top_cosine_features(counts, top=top_docs)
        tfidf_cosine_features = get_top_cosine_features(tfidf, top=top_docs)

        # row-wise proportion of zeros in tfidf
        tfidf_zeros_proportion = np.apply_along_axis(zeros_prop, 1, tfidf)

        # doc2vec features
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(IterFiles(group_texts))]
        model = Doc2Vec(documents, vector_size=500, window=10, min_count=1, workers=8, epochs=2)
        doc2vec_features = np.vstack([model.docvecs[i] for i in range(n_docs)])

        doc2vec_cosine_features = get_top_cosine_features(counts, top=top_docs)

        features = np.hstack([
            counts_features,
            counts_cosine_features,

            words_features,
            tfidf_cosine_features,
            tfidf_zeros_proportion.reshape(-1, 1),

            doc2vec_features,
            doc2vec_cosine_features
        ])

        X.append(np.copy(features))
        groups.append(np.copy([group_id] * n_docs))
        if has_target:
            y.append(group["target"])

    X = np.vstack(X)
    groups = np.hstack(groups)

    if has_target:
        y = np.hstack(y)
        print(X.shape, y.shape, groups.shape)
        return X, y, groups
    else:
        print(X.shape, groups.shape)
        return X, groups

In [66]:
doc_to_text = get_texts_fnames(texts_dir)

28026it [00:00, 40697.04it/s]


In [67]:
X_train, y_train, _ = get_dataset(train_df, doc_to_text)

100%|██████████| 103/103 [09:27<00:00,  5.51s/it]

(9309, 2821) (9309,) (9309,)





In [68]:
X_val, y_val, _ = get_dataset(val_df, doc_to_text)

100%|██████████| 26/26 [01:47<00:00,  4.15s/it]

(2381, 2821) (2381,) (2381,)





In [69]:
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [77]:
params = {
    'num_leaves': 31,
    'objective': 'binary',
    'metric': ['auc']
}

num_round = 100

bst = lgb.train(
    params, 
    train_data, 
    num_round, 
    valid_sets=[val_data], 
    # callbacks=[lgb.early_stopping(stopping_rounds=5)]
)

[1]	valid_0's auc: 0.818649
[2]	valid_0's auc: 0.8497
[3]	valid_0's auc: 0.8631
[4]	valid_0's auc: 0.870112
[5]	valid_0's auc: 0.879689
[6]	valid_0's auc: 0.883112
[7]	valid_0's auc: 0.887322
[8]	valid_0's auc: 0.891125
[9]	valid_0's auc: 0.895862
[10]	valid_0's auc: 0.897496
[11]	valid_0's auc: 0.899625
[12]	valid_0's auc: 0.901346
[13]	valid_0's auc: 0.901701
[14]	valid_0's auc: 0.903043
[15]	valid_0's auc: 0.904699
[16]	valid_0's auc: 0.906349
[17]	valid_0's auc: 0.908513
[18]	valid_0's auc: 0.910094
[19]	valid_0's auc: 0.911336
[20]	valid_0's auc: 0.912955
[21]	valid_0's auc: 0.914478
[22]	valid_0's auc: 0.914948
[23]	valid_0's auc: 0.9155
[24]	valid_0's auc: 0.914689
[25]	valid_0's auc: 0.915309
[26]	valid_0's auc: 0.916332
[27]	valid_0's auc: 0.916842
[28]	valid_0's auc: 0.916761
[29]	valid_0's auc: 0.916921
[30]	valid_0's auc: 0.917289
[31]	valid_0's auc: 0.917766
[32]	valid_0's auc: 0.918299
[33]	valid_0's auc: 0.918525
[34]	valid_0's auc: 0.919412
[35]	valid_0's auc: 0.919837


In [78]:
thresholds = np.linspace(0.01, 1.0, 100)
f1_scores = []
for thres in thresholds:
    predicts = (bst.predict(X_val, num_iterarion=bst.best_iteration) > thres).astype(int)
    f1_scores.append(f1_score(y_val, predicts))

best_score_idx = np.argmax(f1_scores)
best_thres = thresholds[best_score_idx]
print(f"Best score: {f1_scores[best_score_idx]}, best thres: {best_thres}")

Best score: 0.7974463145676145, best thres: 0.34


In [79]:
# retraining on all train data
train_data = lgb.Dataset(np.vstack([X_train, X_val]), label=np.hstack([y_train, y_val]))

In [80]:
params = {
    'num_leaves': 31,
    'objective': 'binary',
    'metric': ['auc']
}

num_round = 100

bst = lgb.train(
    params, 
    train_data, 
    num_round
)

In [81]:
bst.save_model(str(PROJECT_DIR / "model.txt"), num_iteration=bst.best_iteration)

<lightgbm.basic.Booster at 0x7f94c4a685d0>

In [None]:
!cat $PROJECT_DIR/model.txt

# Получение предсказаний

In [None]:
del X_train, y_train, X_val, y_val, train_data, val_data

In [None]:
bst = lgb.Booster(model_file=str(PROJECT_DIR / "model.txt"))

In [76]:
X_test, _ = get_dataset(test_df, doc_to_text, has_target=False)

100%|██████████| 180/180 [17:00<00:00,  5.67s/it]

(16627, 2821) (16627,)





In [83]:
best_thres = 0.35
y_pred = (bst.predict(X_test, num_iterarion=bst.best_iteration) > best_thres).astype(int)

In [84]:
pairs = test_df['pair_id']

with open(PROJECT_DIR / "predict.csv", 'wb') as f:
    f.write(bytes(str("pair_id,target\n"), "utf-8"))
    for i, pair in enumerate(pairs):
        f.write(bytes(str(pair) + "," + str(y_pred[i]) + str('\n'), "utf-8"))

In [None]:
!cat $PROJECT_DIR/predict.csv