In [None]:
import sklearn
import numpy

# Show the installed scikit-learn and NumPy versions as the cell output.
sklearn.__version__, numpy.__version__

In [None]:
import pandas as pd

# Read the dataset from a CSV path relative to the working directory.
# Later cells read its `text`, `sample_type`, `theme` and `executor` columns.
dataset_df = pd.read_csv('preproc_dataset.csv')
# Bare last expression: renders the frame as the cell output.
dataset_df

In [None]:
# Cap every document at MAX_TEXT_CHARS characters before vectorizing.
# `.str[:n]` is the vectorized equivalent of mapping `x[:n]` over the column,
# but it leaves missing values (NaN) as NaN instead of raising TypeError.
# Bracket assignment is used because attribute-style assignment only updates a
# column that already exists; otherwise it silently sets a plain attribute.
MAX_TEXT_CHARS = 768
dataset_df['text'] = dataset_df['text'].str[:MAX_TEXT_CHARS]

### Theme baseline

In [None]:
# Theme task: inputs are the `text` column, targets are the `theme` column.
# Rows are assigned to train/test by the `sample_type` column of the dataset.
is_train = dataset_df['sample_type'] == 'train'
is_test = dataset_df['sample_type'] == 'test'

X_train, y_train = dataset_df.loc[is_train, 'text'], dataset_df.loc[is_train, 'theme']
X_test, y_test = dataset_df.loc[is_test, 'text'], dataset_df.loc[is_test, 'theme']

In [None]:
# Seed passed as `random_state` to the SGDClassifier in both pipelines below.
RANDOM_STATE = 42

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# Theme baseline: TF-IDF features (default settings) feeding a linear model
# trained with SGD on the modified Huber loss. `verbose=True` makes the
# pipeline report each step while fitting.
theme_pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        (
            'classifier',
            SGDClassifier(loss='modified_huber', random_state=RANDOM_STATE),
        ),
    ],
    verbose=True,
)

In [None]:
# Fit the TF-IDF vectorizer and the classifier on the training texts and theme labels.
theme_pipeline.fit(X_train, y_train)

In [None]:
# Predict a theme label for every test text.
y_pred = theme_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import f1_score

# F1 on the test split; `average='weighted'` averages per-class F1 weighted by class support.
f1_score(y_test, y_pred, average='weighted')

In [None]:
import joblib

# Serialize the theme pipeline to disk with joblib.
# NOTE(review): the output is a joblib pickle, although the '.pt' suffix is more
# commonly associated with PyTorch checkpoints — confirm consumers expect this name.
joblib.dump(theme_pipeline, 'theme_tfidf_baseline.pt')

### Executor baseline

In [None]:
# Executor task: same `text` inputs as the theme task, targets are the `executor` column.
# Rows are assigned to train/test by the `sample_type` column of the dataset.
is_train = dataset_df['sample_type'] == 'train'
is_test = dataset_df['sample_type'] == 'test'

X_train, y_train = dataset_df.loc[is_train, 'text'], dataset_df.loc[is_train, 'executor']
X_test, y_test = dataset_df.loc[is_test, 'text'], dataset_df.loc[is_test, 'executor']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# Executor baseline: TF-IDF features (default settings) feeding a linear model
# trained with SGD on the modified Huber loss. `verbose=True` makes the
# pipeline report each step while fitting.
executor_pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        (
            'classifier',
            SGDClassifier(loss='modified_huber', random_state=RANDOM_STATE),
        ),
    ],
    verbose=True,
)

In [None]:
# Fit the TF-IDF vectorizer and the classifier on the training texts and executor labels.
executor_pipeline.fit(X_train, y_train)

In [None]:
# Predict an executor label for every test text (overwrites the theme `y_pred`).
y_pred = executor_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import f1_score

# F1 on the test split; `average='weighted'` averages per-class F1 weighted by class support.
f1_score(y_test, y_pred, average='weighted')

In [None]:
import joblib

# Serialize the executor pipeline to disk with joblib.
# NOTE(review): the output is a joblib pickle, although the '.pt' suffix is more
# commonly associated with PyTorch checkpoints — confirm consumers expect this name.
joblib.dump(executor_pipeline, 'executor_tfidf_baseline.pt')