# Tf-idf + Linear SVC (Baseline)

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC

In [32]:
# Read the pre-split train and test sets from the shared data directory.
train_df = pd.read_csv(filepath_or_buffer="../data/train.csv")
test_df = pd.read_csv(filepath_or_buffer="../data/test.csv")

# Bare last expression so the notebook renders the training frame.
train_df

Unnamed: 0,Title,Abstract,Primary Category
0,A Parallel algorithm for $\mathcal{X}$-Armed b...,The target of $\mathcal{X}$-armed bandit probl...,stat.ML
1,Explaining the behavior of joint and marginal ...,In latent variable models the parameter estima...,stat.CO
2,Multiscale Partial Information Decomposition o...,Heart rate variability results from the combin...,stat.ME
3,A Sequentially Fair Mechanism for Multiple Sen...,In the standard use case of Algorithmic Fairne...,stat.ML
4,Higher Order Generalization Error for First Or...,We propose a novel approach to analyze general...,stat.ML
...,...,...,...
42448,Mortality Rates of US Counties: Are they Relia...,We examine US County-level observational data ...,stat.AP
42449,Optimal Rates for Estimation of Two-Dimensiona...,We study minimax estimation of two-dimensional...,stat.TH
42450,Optimal adaptive estimation of a quadratic fun...,Adaptive estimation of a quadratic functional ...,stat.TH
42451,PAC-Bayesian bounds for Principal Component An...,Based on some new robust estimators of the cov...,stat.TH


In [43]:
def combine_text(X):
    """Merge each row's title and abstract into a single text document.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing "Title" and "Abstract" columns. Any other columns
        are ignored, and the frame itself is left unmodified.

    Returns
    -------
    pandas.Series
        Series named "Text", sharing ``X``'s index, where each entry is the
        title and the abstract joined by a newline character. A missing
        title or abstract contributes an empty string.

    Raises
    ------
    KeyError
        If "Title" or "Abstract" is not a column of ``X``.
    """
    # String concatenation with NaN yields NaN for the whole row, and the
    # downstream TfidfVectorizer rejects NaN documents. Treat a missing field
    # as empty text so the row keeps whatever text it does have.
    title = X["Title"].fillna("")
    abstract = X["Abstract"].fillna("")

    # X is only read from, so no defensive copy of the frame is needed.
    return (title + "\n" + abstract).rename("Text")

# Split each frame into features (everything but the label) and the label column.
X_train = train_df.drop(columns=["Primary Category"])
y_train = train_df["Primary Category"]
X_test = test_df.drop(columns=["Primary Category"])
y_test = test_df["Primary Category"]

# Baseline pipeline: merge title + abstract, vectorize with TF-IDF, then fit a
# linear SVM. random_state is fixed so repeated runs give the same model.
pipe = Pipeline(
    steps=[
        ("combine_text", FunctionTransformer(func=combine_text)),
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC(max_iter=5000, random_state=42, dual=True)),
    ]
)

In [44]:
# Candidate settings: with/without English stop-word removal, crossed with
# four regularization strengths for the SVM (2 x 4 = 8 combinations).
param_grid = dict(
    tfidf__stop_words=["english", None],
    clf__C=[0.075, 0.1, 0.25, 0.5],
)

# Exhaustive search over the grid, scored by macro-averaged F1 with 5-fold CV.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=5,
    verbose=3,
)
grid.fit(X_train, y_train)
print(grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END clf__C=0.075, tfidf__stop_words=english;, score=0.649 total time=   7.4s
[CV 2/5] END clf__C=0.075, tfidf__stop_words=english;, score=0.649 total time=   7.5s
[CV 3/5] END clf__C=0.075, tfidf__stop_words=english;, score=0.648 total time=   7.5s
[CV 4/5] END clf__C=0.075, tfidf__stop_words=english;, score=0.648 total time=   7.7s
[CV 5/5] END clf__C=0.075, tfidf__stop_words=english;, score=0.651 total time=   7.5s
[CV 1/5] END clf__C=0.075, tfidf__stop_words=None;, score=0.652 total time=   8.0s
[CV 2/5] END clf__C=0.075, tfidf__stop_words=None;, score=0.651 total time=   8.0s
[CV 3/5] END clf__C=0.075, tfidf__stop_words=None;, score=0.646 total time=  20.6s
[CV 4/5] END clf__C=0.075, tfidf__stop_words=None;, score=0.648 total time=   8.1s
[CV 5/5] END clf__C=0.075, tfidf__stop_words=None;, score=0.648 total time=   8.0s
[CV 1/5] END clf__C=0.1, tfidf__stop_words=english;, score=0.656 total time=   7.7s
[CV 2/5] EN

In [45]:
# Predict labels for the test features with the fitted grid search.
y_pred = grid.predict(X_test)

# Per-class precision/recall/F1 table, followed by the macro-F1 summary value
# (the same metric the grid search used for scoring).
print(classification_report(y_true=y_test, y_pred=y_pred))
print(f"Macro-Averaged F1-Score: {f1_score(y_true=y_test, y_pred=y_pred, average='macro')}")

              precision    recall  f1-score   support

     stat.AP       0.68      0.58      0.63      2576
     stat.CO       0.61      0.43      0.50      1007
     stat.ME       0.64      0.69      0.67      5540
     stat.ML       0.78      0.80      0.79      4824
     stat.TH       0.71      0.74      0.72      4248

    accuracy                           0.70     18195
   macro avg       0.69      0.65      0.66     18195
weighted avg       0.70      0.70      0.70     18195

Macro-Averaged F1-Score: 0.6621458490828881
