In [None]:
from preprocess_utils import *

# Hindex thresholds: later cells compare the "hindex" column against these
# to separate small (< large_hindex), large and huge (>= huge_hindex) rows.
large_hindex = 6
huge_hindex = 36

# get_processed_data / select_columns come from the star import of preprocess_utils.
# NOTE(review): select_columns presumably restricts the frame to the modelling columns — confirm in preprocess_utils.
train, test = get_processed_data()
train = select_columns(train)
print(train.shape)

In [None]:
# Release the reference to the processed test frame, then preview the first training rows.
del test
train.head()

In [None]:
# Relative frequency (normalize=True) of each distinct hindex value in the training frame.
frequency = train["hindex"].value_counts(normalize=True)

# Small / Large Index Classification

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out split of the training frame; no random_state is passed to train_test_split.
train_split, test_split = train_test_split(train)
print(train_split.shape, test_split.shape)

# Recode the target in place as a binary label:
#   0 -> hindex <  large_hindex
#   1 -> hindex >= large_hindex
# The second pair of assignments runs on the already recoded column, so it relies on
# the code 0 itself being below large_hindex.
train_split.loc[train_split["hindex"] < large_hindex, "hindex"] = 0
test_split.loc[test_split["hindex"] < large_hindex, "hindex"] = 0
train_split.loc[train_split["hindex"] >= large_hindex, "hindex"] = 1
test_split.loc[test_split["hindex"] >= large_hindex, "hindex"] = 1

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
# Binary small/large classifier; the "text" column is declared as a CatBoost text feature.
model_cat = CatBoostClassifier(verbose=False, text_features=["text"])
# "author" and the target "hindex" are removed from the feature frame before fitting / predicting.
model_cat.fit(train_split.drop(["author", "hindex"], axis=1), train_split["hindex"])
mod_preds = model_cat.predict(test_split.drop(["author", "hindex"], axis=1))
# Per-class precision / recall / F1 on the hold-out split.
print(classification_report(test_split["hindex"], mod_preds))

# Small / Large / Huge Index Classification

In [None]:
from sklearn.model_selection import train_test_split

# New hold-out split of the training frame; no random_state is passed to train_test_split.
train_split, test_split = train_test_split(train)
print(train_split.shape, test_split.shape)

# Recode the target in place into three classes:
#   0 -> hindex <  large_hindex
#   1 -> large_hindex <= hindex < huge_hindex
#   2 -> hindex >= huge_hindex
# The assignments run one after another on the already recoded column, so the result
# relies on the class codes staying below the thresholds they are compared with afterwards.
train_split.loc[train_split["hindex"] < large_hindex, "hindex"] = 0
test_split.loc[test_split["hindex"] < large_hindex, "hindex"] = 0
train_split.loc[((train_split["hindex"] >= large_hindex) & (train_split["hindex"] < huge_hindex)), "hindex"] = 1
test_split.loc[((test_split["hindex"] >= large_hindex)  & (test_split["hindex"] < huge_hindex)), "hindex"] = 1
train_split.loc[train_split["hindex"] >= huge_hindex, "hindex"] = 2
test_split.loc[test_split["hindex"] >= huge_hindex, "hindex"] = 2

print(train_split.shape)

# Feature matrices (every column except "author" and "hindex") and class labels as numpy arrays.
X_train = train_split.drop(
    ["author", "hindex"], axis=1
).to_numpy()
y_train = train_split["hindex"].to_numpy()
X_test = test_split.drop(
    ["author", "hindex"], axis=1
).to_numpy()
y_test = test_split["hindex"].to_numpy()

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
# Small / large / huge classifier trained on the numpy arrays built in the previous cell.
model_cat = CatBoostClassifier(verbose=False)
model_cat.fit(X_train, y_train)
mod_preds = model_cat.predict(X_test)
# Per-class precision / recall / F1 on the hold-out split.
print(classification_report(y_test, mod_preds))

# Original Prediction MSE on large hindex

In [None]:
from catboost import CatBoostClassifier
# train_test_split lives in sklearn.model_selection; sklearn.metrics does not export it.
from sklearn.model_selection import train_test_split

# Hold-out split of the training frame with the original (un-binned) hindex values;
# no random_state is passed to train_test_split.
train_split, test_split = train_test_split(train)
print(train_split.shape, test_split.shape)

# Feature matrices (every column except "author" and "hindex") and regression targets as numpy arrays.
X_train = train_split.drop(
    ["author", "hindex"], axis=1
).to_numpy()
y_train = train_split["hindex"].to_numpy()
X_test = test_split.drop(
    ["author", "hindex"], axis=1
).to_numpy()
y_test = test_split["hindex"].to_numpy()

In [None]:
from catboost import CatBoostRegressor
# Imported explicitly so the cell does not depend on the name leaking in from
# the star import or from a cell further down the notebook.
from sklearn.metrics import mean_squared_error

# Regressor trained on every hindex value.
model_cat = CatBoostRegressor(verbose=False)
model_cat.fit(X_train, y_train)
mod_preds = model_cat.predict(X_test)
print("Overall MSE:", mean_squared_error(y_test, mod_preds))

# Error restricted to the hold-out rows whose true hindex is >= large_hindex.
# y_test and mod_preds are numpy arrays, so one boolean mask selects both consistently.
big_mask = y_test >= large_hindex
y_big_test = y_test[big_mask]
mod_big_preds = mod_preds[big_mask]
print("Big Hindex MSE:", mean_squared_error(y_big_test, mod_big_preds))

# Specific Prediction MSE on large hindex

In [None]:
# Rows whose hindex reaches the "large" threshold.
big_train = train[train["hindex"] >= large_hindex]

# Hold-out split computed inside that subset only.
big_train_split, big_test_split = train_test_split(big_train)
print(big_train_split.shape, big_test_split.shape)

# Features are all columns but "author" and "hindex"; the target is "hindex".
X_big_train = big_train_split.drop(columns=["author", "hindex"]).to_numpy()
y_big_train = big_train_split["hindex"].to_numpy()
X_big_test = big_test_split.drop(columns=["author", "hindex"]).to_numpy()
y_big_test = big_test_split["hindex"].to_numpy()



In [None]:
from catboost import CatBoostRegressor
# Imported explicitly so the cell does not depend on the name leaking in from
# the star import or from a cell further down the notebook.
from sklearn.metrics import mean_squared_error

# Regressor trained and evaluated only on rows with hindex >= large_hindex.
model_cat = CatBoostRegressor(verbose=False)
model_cat.fit(X_big_train, y_big_train)
mod_preds = model_cat.predict(X_big_test)
print("Big Hindex MSE:", mean_squared_error(y_big_test, mod_preds))

# Original Prediction MSE on small hindex

In [None]:
# Fresh hold-out split of the full training frame, target values left untouched.
train_split, test_split = train_test_split(train)
print(train_split.shape, test_split.shape)

# Features are all columns but "author" and "hindex"; the target is "hindex".
X_train = train_split.drop(columns=["author", "hindex"]).to_numpy()
y_train = train_split["hindex"].to_numpy()
X_test = test_split.drop(columns=["author", "hindex"]).to_numpy()
y_test = test_split["hindex"].to_numpy()

In [None]:
from catboost import CatBoostRegressor
# Imported explicitly so the cell does not depend on the name leaking in from
# the star import or from a cell further down the notebook.
from sklearn.metrics import mean_squared_error

# Regressor trained on every hindex value.
model_cat = CatBoostRegressor(verbose=False)
model_cat.fit(X_train, y_train)
mod_preds = model_cat.predict(X_test)
print("Overall MSE:", mean_squared_error(y_test, mod_preds))

# Error restricted to the hold-out rows whose true hindex is < large_hindex.
# y_test and mod_preds are numpy arrays, so one boolean mask selects both consistently.
small_mask = y_test < large_hindex
y_small_test = y_test[small_mask]
mod_small_preds = mod_preds[small_mask]
print("Small Hindex MSE:", mean_squared_error(y_small_test, mod_small_preds))

# Specific Prediction MSE on small hindex

In [None]:
# Rows whose hindex stays below the "large" threshold.
small_train = train[train["hindex"] < large_hindex]

# Hold-out split computed inside that subset only.
small_train_split, small_test_split = train_test_split(small_train)
print(small_train_split.shape, small_test_split.shape)

# Features are all columns but "author" and "hindex"; the target is "hindex".
X_small_train = small_train_split.drop(columns=["author", "hindex"]).to_numpy()
y_small_train = small_train_split["hindex"].to_numpy()
X_small_test = small_test_split.drop(columns=["author", "hindex"]).to_numpy()
y_small_test = small_test_split["hindex"].to_numpy()

In [None]:
from catboost import CatBoostRegressor
# Imported explicitly so the cell does not depend on the name leaking in from
# the star import or from a cell further down the notebook.
from sklearn.metrics import mean_squared_error

# Regressor trained and evaluated only on rows with hindex < large_hindex.
model_cat = CatBoostRegressor(verbose=False)
model_cat.fit(X_small_train, y_small_train)
mod_preds = model_cat.predict(X_small_test)
print("Small Hindex MSE:", mean_squared_error(y_small_test, mod_preds))

# Combine Specific prediction and Large Index Classification

In [1]:
from preprocess_utils import get_processed_data, select_columns
from sklearn.model_selection import train_test_split

# Threshold separating "small" (< large_hindex) from "large" (>= large_hindex) rows.
large_hindex = 6

# Reload the data for this section; the processed test frame is not needed here.
train, test = get_processed_data()
train = select_columns(train)
del test

# Hold-out split; no random_state is passed to train_test_split.
train_split, test_split = train_test_split(train)
print(train_split.shape, test_split.shape)

# Row subsets of each side of the split, selected on the true hindex.
large_train_split = train_split.loc[train_split["hindex"] >= large_hindex]
large_test_split = test_split.loc[test_split["hindex"] >= large_hindex]
small_train_split = train_split.loc[train_split["hindex"] < large_hindex]
small_test_split = test_split.loc[test_split["hindex"] < large_hindex]

# Copy of the true hold-out targets, taken before a later cell recodes "hindex" to class labels.
y_test = test_split["hindex"].copy()

# Disabled experiment, kept for reference:
# Balance small and large data
# large_train_split_temp = large_train_split.append(small_train_split.sample(n=int(len(small_train_split) / 2.5), ignore_index=True))
# small_train_split = small_train_split.append(large_train_split_temp.sample(n=int(len(large_train_split) / 1.5), ignore_index=True))
# large_train_split = large_train_split_temp

In [None]:
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, mean_squared_error

def _features(frame):
    """Return `frame` without the "author" and "hindex" columns."""
    return frame.drop(columns=["author", "hindex"])


# --- Regressor trained on the whole training split -------------------------
model = CatBoostRegressor(verbose=False)
model.fit(_features(train_split), train_split["hindex"])
preds = model.predict(_features(test_split))
print("Classical Hindex MSE:", mean_squared_error(test_split["hindex"], preds))
print(
    "Classical Hindex MSE on small:",
    mean_squared_error(small_test_split["hindex"], model.predict(_features(small_test_split))),
)
print(
    "Classical Hindex MSE on large:",
    mean_squared_error(large_test_split["hindex"], model.predict(_features(large_test_split))),
)


# --- Regressor trained on the small-hindex rows only -----------------------
model_small = CatBoostRegressor(verbose=False)
model_small.fit(_features(small_train_split), small_train_split["hindex"])
small_preds = model_small.predict(_features(test_split))
print("Small Hindex MSE:", mean_squared_error(test_split["hindex"], small_preds))
print(
    "Small Hindex MSE on small:",
    mean_squared_error(small_test_split["hindex"], model_small.predict(_features(small_test_split))),
)
print(
    "Small Hindex MSE on large:",
    mean_squared_error(large_test_split["hindex"], model_small.predict(_features(large_test_split))),
)


# --- Regressor trained on the large-hindex rows only -----------------------
model_large = CatBoostRegressor(verbose=False)
model_large.fit(_features(large_train_split), large_train_split["hindex"])
large_preds = model_large.predict(_features(test_split))
print("Large Hindex MSE:", mean_squared_error(test_split["hindex"], large_preds))
print(
    "Large Hindex MSE on small:",
    mean_squared_error(small_test_split["hindex"], model_large.predict(_features(small_test_split))),
)
print(
    "Large Hindex MSE on large:",
    mean_squared_error(large_test_split["hindex"], model_large.predict(_features(large_test_split))),
)

In [None]:
# Work on copies: plain assignment would only alias train_split / test_split, and the
# relabelling below would then overwrite their regression targets with 0/1 class codes.
train_split_classifier = train_split.copy()
test_split_classifier = test_split.copy()

# Binary label: 0 when hindex < large_hindex, 1 when hindex >= large_hindex.
# Both masks are computed from the original values before anything is overwritten.
for frame in (train_split_classifier, test_split_classifier):
    is_small = frame["hindex"] < large_hindex
    is_large = frame["hindex"] >= large_hindex
    frame.loc[is_small, "hindex"] = 0
    frame.loc[is_large, "hindex"] = 1

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
# Small/large classifier (2000 trees) trained on the relabelled frames;
# "author" and "hindex" are removed from the feature frames.
hindex_classifier = CatBoostClassifier(verbose=False, num_trees=2000)
hindex_classifier.fit(train_split_classifier.drop(["author", "hindex"], axis=1), train_split_classifier["hindex"])
hindex_classifier_preds = hindex_classifier.predict(test_split_classifier.drop(["author", "hindex"], axis=1))
print(classification_report(test_split_classifier["hindex"], hindex_classifier_preds))

# Class probabilities per hold-out row; the blending cell reads column 0 as the weight of
# the small-hindex regressor and column 1 as the weight of the large-hindex regressor.
hindex_classifier_probas = hindex_classifier.predict_proba(test_split_classifier.drop(["author", "hindex"], axis=1))

In [None]:
from tqdm import tqdm

# Blend the three regressors row by row on the hold-out split.
combine_preds = []
# Counters are only referenced by the disabled threshold strategy kept in the comments below.
count_small, count_large, count_classic = 0, 0, 0
count_small_correct, count_large_correct = 0, 0

# Only referenced by the disabled threshold strategy below.
threshold = 0.95
for i in tqdm(range(len(y_test))):
    # if hindex_classifier_preds[i] == 0:
    #     combine_preds.append(small_preds[i])
    # else:
    #     combine_preds.append(large_preds[i])
    # if hindex_classifier_probas[i][0] > threshold:
    #     combine_preds.append(small_preds[i])
    #     count_small += 1
    #     if hindex_classifier_preds[i] == 0:
    #         count_small_correct += 1
    # elif hindex_classifier_probas[i][1] > threshold:
    #     combine_preds.append(large_preds[i])
    #     count_large += 1
    #     if hindex_classifier_preds[i] == 1:
    #         count_large_correct += 1
    # else:
    #     combine_preds.append(preds[i])
    #     count_classic += 1
    # Active strategy: probability-weighted mix of the small and large regressors,
    # averaged with the prediction of the regressor trained on all rows.
    combine_preds.append((hindex_classifier_probas[i][0] * small_preds[i] + hindex_classifier_probas[i][1] * large_preds[i] + preds[i]) / 2)
    # combine_preds.append(preds[i])

# MSE of the blended predictions against the true hold-out hindex values.
print("Combine Hindex MSE:", mean_squared_error(y_test, combine_preds))
# print("count :", count_small, count_large, count_classic)
# print("count correct:", count_small_correct, count_large_correct)
# print("ratio correct:", count_small_correct/count_small, count_large_correct/count_large)

In [None]:
# Preview only the first blended predictions instead of dumping the whole list into the output.
combine_preds[:10]

# Submit

In [None]:
from preprocess_utils import get_processed_data, select_columns
from sklearn.model_selection import train_test_split

# Threshold separating "small" (< large_hindex) from "large" (>= large_hindex) rows.
large_hindex = 6

# In this section train_split / test_split hold the two frames returned by
# get_processed_data (no train_test_split is performed here).
train_split, test_split = get_processed_data()
train_split = select_columns(train_split)
test_split = select_columns(test_split)
print(train_split.shape, test_split.shape)

# Row subsets selected on the "hindex" column of each frame.
large_train_split = train_split.loc[train_split["hindex"] >= large_hindex]
large_test_split = test_split.loc[test_split["hindex"] >= large_hindex]
small_train_split = train_split.loc[train_split["hindex"] < large_hindex]
small_test_split = test_split.loc[test_split["hindex"] < large_hindex]

# The target column is "hindex" everywhere else in this notebook; "header" is not
# referenced by any other cell.
# NOTE(review): confirm the processed test frame really carries a "hindex" column.
y_test = test_split["hindex"].to_numpy()

In [None]:
# NOTE(review): get_test_data is never imported by name in this notebook; presumably it is
# provided by the star import from preprocess_utils — confirm.
test, _ = get_test_data()
# NOTE(review): combine_preds is built by the blending loop with one entry per row of the
# hold-out split of the "Combine" section; confirm it has been recomputed for, and is
# aligned with, the rows of `test` before submitting.
test["hindex"] = combine_preds
submission = test[["author", "hindex"]]
# index=False is the documented way to leave the row index out of the CSV.
submission.to_csv("../tmp/submission.csv", index=False)

# Function

In [None]:
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import classification_report, mean_squared_error
import math


def get_small_data(train, test, large_hindex):
    """Build regression arrays restricted to rows with ``hindex < large_hindex``.

    Args:
        train: DataFrame with at least the columns "author" and "hindex".
        test: DataFrame with the same columns as ``train``.
        large_hindex: exclusive upper bound on "hindex" for a row to be kept.

    Returns:
        Tuple ``(X_small_train, y_small_train, X_small_test, y_small_test)`` of
        numpy arrays; the X arrays hold every column except "author" and
        "hindex", the y arrays hold the "hindex" values.
    """
    # Filter on the frames that were passed in, not on notebook-level globals.
    small_train = train.loc[train["hindex"] < large_hindex]
    small_test = test.loc[test["hindex"] < large_hindex]

    # Data for small hindex regressor
    X_small_train = small_train.drop(["author", "hindex"], axis=1).to_numpy()
    y_small_train = small_train["hindex"].to_numpy()
    X_small_test = small_test.drop(["author", "hindex"], axis=1).to_numpy()
    y_small_test = small_test["hindex"].to_numpy()

    return X_small_train, y_small_train, X_small_test, y_small_test


def get_large_data(train, test, large_hindex, huge_hindex):
    """Build regression arrays for rows with ``large_hindex <= hindex < huge_hindex``.

    Args:
        train: DataFrame with at least the columns "author" and "hindex".
        test: DataFrame with the same columns as ``train``.
        large_hindex: inclusive lower bound on "hindex".
        huge_hindex: exclusive upper bound on "hindex".

    Returns:
        Tuple ``(X_large_train, y_large_train, X_large_test, y_large_test)`` of
        numpy arrays; the X arrays hold every column except "author" and
        "hindex", the y arrays hold the "hindex" values.
    """
    # Filter on the frames that were passed in, not on notebook-level globals.
    large_train = train.loc[
        (train["hindex"] >= large_hindex) & (train["hindex"] < huge_hindex)
    ]
    large_test = test.loc[
        (test["hindex"] >= large_hindex) & (test["hindex"] < huge_hindex)
    ]

    # Data for large hindex regressor
    X_large_train = large_train.drop(["author", "hindex"], axis=1).to_numpy()
    y_large_train = large_train["hindex"].to_numpy()
    X_large_test = large_test.drop(["author", "hindex"], axis=1).to_numpy()
    y_large_test = large_test["hindex"].to_numpy()

    return X_large_train, y_large_train, X_large_test, y_large_test


def get_huge_data(train, test, huge_hindex):
    """Build regression arrays restricted to rows with ``hindex >= huge_hindex``.

    Args:
        train: DataFrame with at least the columns "author" and "hindex".
        test: DataFrame with the same columns as ``train``.
        huge_hindex: inclusive lower bound on "hindex" for a row to be kept.

    Returns:
        Tuple ``(X_huge_train, y_huge_train, X_huge_test, y_huge_test)`` of
        numpy arrays; the X arrays hold every column except "author" and
        "hindex", the y arrays hold the "hindex" values.
    """
    # Filter on the frames that were passed in, not on notebook-level globals.
    huge_train = train.loc[train["hindex"] >= huge_hindex]
    huge_test = test.loc[test["hindex"] >= huge_hindex]

    # Data for huge hindex regressor
    X_huge_train = huge_train.drop(["author", "hindex"], axis=1).to_numpy()
    y_huge_train = huge_train["hindex"].to_numpy()
    X_huge_test = huge_test.drop(["author", "hindex"], axis=1).to_numpy()
    y_huge_test = huge_test["hindex"].to_numpy()

    return X_huge_train, y_huge_train, X_huge_test, y_huge_test


def get_classifier_data(train, test, large_hindex=None, huge_hindex=None):
    """Build classifier arrays with "hindex" recoded to the classes 0 / 1 / 2.

    Class codes:
        0 -> hindex <  large_hindex
        1 -> large_hindex <= hindex < huge_hindex
        2 -> hindex >= huge_hindex

    Args:
        train: DataFrame with at least the columns "author" and "hindex".
        test: DataFrame with the same columns as ``train``.
        large_hindex: optional small/large threshold. When omitted, the
            notebook-level ``large_hindex`` is used, as before.
        huge_hindex: optional large/huge threshold. When omitted, the
            notebook-level ``huge_hindex`` is used, as before.

    Returns:
        Tuple ``(X_train_classifier, y_train_classifier, X_test_classifier,
        y_test_classifier)`` of numpy arrays. The input frames are not modified.

    Raises:
        KeyError: if a threshold is omitted and no notebook-level variable of
            that name exists.
    """
    # Fall back to the notebook-level thresholds so existing two-argument calls keep working.
    if large_hindex is None:
        large_hindex = globals()["large_hindex"]
    if huge_hindex is None:
        huge_hindex = globals()["huge_hindex"]

    def _to_classes(frame):
        """Return a copy of `frame` whose "hindex" column holds the class code."""
        hindex = frame["hindex"]
        # All masks are computed from the original values before anything is
        # overwritten, so the result does not depend on assignment order.
        is_small = hindex < large_hindex
        is_large = (hindex >= large_hindex) & (hindex < huge_hindex)
        is_huge = hindex >= huge_hindex

        labelled = frame.copy()
        labelled.loc[is_small, "hindex"] = 0
        labelled.loc[is_large, "hindex"] = 1
        labelled.loc[is_huge, "hindex"] = 2
        return labelled

    train = _to_classes(train)
    test = _to_classes(test)

    # Data for the classifier, taken from the relabelled copies of the arguments.
    X_train_classifier = train.drop(["author", "hindex"], axis=1).to_numpy()
    y_train_classifier = train["hindex"].to_numpy()
    X_test_classifier = test.drop(["author", "hindex"], axis=1).to_numpy()
    y_test_classifier = test["hindex"].to_numpy()

    return X_train_classifier, y_train_classifier, X_test_classifier, y_test_classifier

def get_original_data(train, test):
    """Build regression arrays from the unfiltered, un-recoded frames.

    Args:
        train: DataFrame with at least the columns "author" and "hindex".
        test: DataFrame with the same columns as ``train``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of numpy arrays; the X
        arrays hold every column except "author" and "hindex", the y arrays
        hold the "hindex" values.
    """
    # Original Data, taken from the arguments rather than from notebook-level globals.
    X_train = train.drop(["author", "hindex"], axis=1).to_numpy()
    y_train = train["hindex"].to_numpy()
    X_test = test.drop(["author", "hindex"], axis=1).to_numpy()
    y_test = test["hindex"].to_numpy()

    return X_train, y_train, X_test, y_test

# Duplicate very large hindex

In [None]:
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, mean_squared_error

# Rows at or above this hindex are duplicated once in the training split.
very_large_hindex = 36

# Hold-out split; no random_state is passed to train_test_split.
train_split, test_split = train_test_split(train)

very_large_train_split = train_split.loc[train_split["hindex"] >= very_large_hindex]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# and keeps the original index labels, as append did by default.
train_split = pd.concat([train_split, very_large_train_split])
print(train_split.shape)

# Feature matrices (every column except "author" and "hindex") and regression targets as numpy arrays.
X_train = train_split.drop(
    ["author", "hindex"], axis=1
).to_numpy()
y_train = train_split["hindex"].to_numpy()
X_test = test_split.drop(
    ["author", "hindex"], axis=1
).to_numpy()
y_test = test_split["hindex"].to_numpy()


# Regressor trained on the augmented split, evaluated on the untouched hold-out rows.
model = CatBoostRegressor(verbose=False)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print("Classical Hindex MSE:", mean_squared_error(y_test, preds))


# Fasttext Classification

In [None]:
import os

import fasttext
import numpy as np
from preprocess_utils import df_to_txt

# os and fasttext are imported explicitly: the cell uses both and should not
# depend on them leaking in through a star import.

large_hindex = 6
train_fasttext, test_fasttext = get_processed_data()
# fastText label per row: "__label__1" when hindex >= large_hindex, else "__label__0".
train_fasttext["hindex_lab"] = np.where(train_fasttext["hindex"] >= large_hindex, "__label__1", "__label__0")

train_fasttext_split, test_fasttext_split = train_test_split(train_fasttext)

path_fasttext_text = "../tmp/fasttext_text.txt"
# Train on the training part of the split only: the next cell evaluates on
# test_fasttext_split, which would be leaked if the full frame were written here.
df_to_txt(train_fasttext_split, path_fasttext_text)
try:
    model_fasttext = fasttext.train_supervised(
        path_fasttext_text, lr=0.15815, dim=2, epoch=33, wordNgrams=3
    )
finally:
    # Remove the temporary training file even when training fails.
    os.remove(path_fasttext_text)

In [None]:
import os

# Write the hold-out rows to a temporary file, print what model_fasttext.test
# returns for it, then delete the file.
path_fasttext_text = "../tmp/fasttext_text.txt"
df_to_txt(test_fasttext_split, path_fasttext_text)
try:
    print(model_fasttext.test(path_fasttext_text))
finally:
    # Remove the temporary file even when the evaluation fails.
    os.remove(path_fasttext_text)

## Add Fasttext prediction to data

In [None]:
# fastText's predict accepts a string or a list of strings, not a pandas Series.
# Missing texts are replaced by "" as in the per-row loop further down.
preds = model_fasttext.predict(train_fasttext["text"].fillna("").to_list())

In [None]:
# Preview the first rows of the training frame before the fastText feature is added.
train.head()

In [None]:
from tqdm import tqdm
import pandas as pd

# One value per row of train_fasttext: the probability attached to "__label__1".
fasttext_pred = []
for text in tqdm(train_fasttext["text"].to_list()):
    # Missing texts are scored as the empty string.
    label, proba = model_fasttext.predict("" if pd.isna(text) else text)
    # predict returns the top label with its probability; when that label is not
    # "__label__1" the complement 1 - p is stored instead.
    is_positive = label[0] == "__label__1"
    fasttext_pred.append(proba[0] if is_positive else 1-proba[0])

In [None]:
# Attach the fastText probability as a new feature column.
# NOTE(review): fasttext_pred has one entry per row of train_fasttext; this assumes `train`
# has the same rows in the same order — confirm select_columns does not drop or reorder rows.
train["fasttext_pred"] = fasttext_pred