In [2]:
import numpy as np
import pandas as pd

In [31]:
df = pd.read_csv("./data/tweets_train.csv").drop(columns=['id','keyword', 'location'])
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [32]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

features = 'text'
y_feat = 'remainder__target'
preprocessing_pipeline = make_column_transformer(
    (CountVectorizer(lowercase=True, stop_words="english", max_features=1000), features),
    remainder="passthrough",
)
lr = LogisticRegression(max_iter=500)

In [100]:
from sklearn.model_selection import KFold
from tqdm.auto import tqdm

from neural_fca import NeuralFCA

kf = KFold(n_splits=5)
cv_f1_scores = []
lr_cv_f1_scores = []
fcas = []

for train, test in tqdm(kf.split(df)):
    df_train = df.iloc[train, :]
    df_test= df.iloc[test, :]

    preprocessing_pipeline.fit(df_train)

    df_train2 = pd.DataFrame(
        preprocessing_pipeline.transform(df_train).toarray(),
        columns=preprocessing_pipeline.get_feature_names_out()
    )
    df_test2 = pd.DataFrame(
        preprocessing_pipeline.transform(df_test).toarray(),
        columns=preprocessing_pipeline.get_feature_names_out()
    )

    X_train = df_train2.drop(columns=y_feat)
    y_train = df_train2[y_feat]
    X_test = df_test2.drop(columns=y_feat)
    y_test = df_test2[y_feat]

    topk_words = np.argsort(-lr.coef_[0])[:100]
    X_train_topk = X_train.iloc[:, topk_words].astype("bool")
    X_test_topk = X_test.iloc[:, topk_words].astype("bool")
    X_test_topk['dummy'] = True
    X_train_topk['dummy'] = True
    X_train_topk.index = X_train.index.map(str)
    X_test_topk.index = X_test.index.map(str)
    fca = NeuralFCA(best_concepts_fraction=0.5).fit(X_train_topk, y_train)
    cv_f1_scores.append(fca.score(X_test_topk, y_test))
    fcas.append(fca)

    lr.fit(X_train_topk, y_train)
    lr_cv_f1_scores.append(
        f1_score(lr.predict(X_test_topk), y_test)
    )

0it [00:00, ?it/s]

In [101]:
cv_f1_scores

[0.0, 0.19452054794520546, 0.0, 0.0, 0.0]

In [102]:
lr_cv_f1_scores

[0.2728512960436562,
 0.21151271753681392,
 0.15587529976019185,
 0.13783403656821377,
 0.1625]

In [103]:
np.mean(cv_f1_scores)

0.03890410958904109