In [1]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

from preprocessing import preprocess
from utils import read_train, build_feature_path, read_train_no_validation, read_test, read_validation
import autosklearn


import pickle

import autosklearn.classification

from sklearn.metrics import classification_report


  self.re = re.compile(self.reString)


In [2]:
# Load the training frame and attach every precomputed feature table to it.
print('read training data')
train = read_train()

# Names of the precomputed feature files. Each name doubles as the column
# prefix for the columns that file contributes (see the renaming in the loop).
features = [
    'bert_avg_all_but_first_binary_scaled',
    'boosters_selected',
    'char_prediction',
    'hashtags_selected',
    'hedges_selected',
    'mentions_total',
    'female_words_selected',
    'male_words_selected',
    'most_similar_scale_selected',
    'perspective_selected',
    'perspective_difference_selected',
    'senpai_selected',
    'senpai_unclustered_selected',
    'sif',
    'vader_selected',
]
for feature_name in features:
    print('read precomputed feature', feature_name)
    feature_frame = pd.read_csv(build_feature_path('TRAINING_REL', feature_name), index_col='id')
    # Prefix every column with its feature name so the prefix-based column
    # selection further down can pick it up.
    feature_frame.columns = [feature_name + "_" + name for name in feature_frame.columns]
    # Left merge on the index keeps every training row.
    train = pd.merge(train, feature_frame, how='left', left_index=True, right_index=True)

print('encode language')
language_le = LabelEncoder()
train['language'] = language_le.fit_transform(train.language)

print('preprocess text')
clean_text = partial(preprocess, fix_encoding=True)
train['text'] = train.text.apply(clean_text)

# The raw 'language' and 'text' columns are model inputs as well.
features.extend(['language', 'text'])

# Restrict to the rows indexed by read_train_no_validation()
# (presumably the training split without the validation rows -- confirm in utils).
train_ = train.loc[read_train_no_validation().index]

# Keep every column whose name starts with one of the feature names.
feature_prefixes = tuple(features)
X = train_[[name for name in train.columns if name.startswith(feature_prefixes)]]

# Encode the task1 target as integers; y_le is reused later to decode predictions.
y_le = LabelEncoder()
y = y_le.fit_transform(train_.task1.values)
labels = y_le.classes_

print('X.shape', X.shape, 'y.shape', y.shape, 'unique y', np.unique(y))


read training data
read precomputed feature bert_avg_all_but_first_binary_scaled
read precomputed feature boosters_selected
read precomputed feature char_prediction
read precomputed feature hashtags_selected
read precomputed feature hedges_selected
read precomputed feature mentions_total
read precomputed feature female_words_selected
read precomputed feature male_words_selected
read precomputed feature most_similar_scale_selected
read precomputed feature perspective_selected
read precomputed feature perspective_difference_selected
read precomputed feature senpai_selected
read precomputed feature senpai_unclustered_selected
read precomputed feature sif
read precomputed feature vader_selected
encode language
preprocess text
X.shape (5581, 2976) y.shape (5581,) unique y [0 1]


In [3]:
# Text branch: count character 3- and 4-grams of the 'text' column, then let
# SelectFromModel (with a MultinomialNB estimator) decide which n-grams to keep.
char_ngram_steps = [
    ('cv', CountVectorizer(analyzer='char', ngram_range=(3, 4))),
    ('fs', SelectFromModel(estimator=MultinomialNB())),  # use multitask in case of task2
]

# Only 'text' goes through the n-gram pipeline; every other column of X is
# passed through unchanged.
char = ColumnTransformer(
    transformers=[('cv', Pipeline(steps=char_ngram_steps), 'text')],
    remainder='passthrough',
)
X_char = char.fit_transform(X, y)




In [4]:
# Resource budget for the auto-sklearn search.
search_budget_seconds = 60 * 60 * 8   # total search time: 8 hours
memory_limit_mb = 2 * 3072            # 6144, i.e. 6 GB; the original note says this is per job

# NOTE: the earlier comment here spoke of "4 jobs", but n_jobs is 10.
# Options that were tried and are currently disabled:
#     per_run_time_limit=30, ml_memory_limit=600*1024, ensemble_size=1,
#     ensemble_memory_limit=7*1024, initial_configurations_via_metalearning=0,
#     include_preprocessors=["no_preprocessing"]
automl = autosklearn.classification.AutoSklearnClassifier(
    seed=5,
    n_jobs=10,
    time_left_for_this_task=search_budget_seconds,
    memory_limit=memory_limit_mb,
    tmp_folder='/bigdata/sexism/utils/temp/autosklearn_tmp3',
    output_folder='/bigdata/sexism/utils/temp/autosklearn_out3',
    # Both working folders are removed once the search terminates.
    delete_tmp_folder_after_terminate=True,
    delete_output_folder_after_terminate=True,
)
automl.fit(X_char, y)


AutoSklearnClassifier(memory_limit=6144, n_jobs=10,
                      output_folder='/bigdata/sexism/utils/temp/autosklearn_out3',
                      per_run_time_limit=28800, seed=5,
                      time_left_for_this_task=28800,
                      tmp_folder='/bigdata/sexism/utils/temp/autosklearn_tmp3')

In [5]:
# Persist the fitted AutoSklearnClassifier so it can be reloaded later
# without repeating the search.
model_pickle_path = '/bigdata/sexism/utils/temp/autosklearn_out/file.pickle'
with open(model_pickle_path, 'wb+') as pickle_file:
    pickle.dump(automl, pickle_file)


In [6]:
# Score the rows indexed by read_validation() with the fitted model.
# NOTE: this cell rebinds X, y and X_char, which held the training data until now.
validation = train.loc[read_validation().index]

# Same prefix-based column selection as for the training matrix.
feature_prefixes = tuple(features)
X = validation[[name for name in train.columns if name.startswith(feature_prefixes)]]

# Encode the targets with the encoder fitted on the training labels.
y = y_le.transform(validation.task1.values)
labels = y_le.classes_

print('X.shape', X.shape, 'y.shape', y.shape, 'unique y', np.unique(y))

# transform (not fit_transform): reuse the fitted n-gram vocabulary and selection.
X_char = char.transform(X)
y_pred = automl.predict(X_char)

X.shape (1396, 2976) y.shape (1396,) unique y [0 1]




In [7]:
# Per-class precision / recall / F1 for the validation predictions,
# labelled with the original class names.
validation_report = classification_report(y_true=y, y_pred=y_pred, target_names=labels)
print(validation_report)


              precision    recall  f1-score   support

  non-sexist       0.75      0.72      0.73       720
      sexist       0.71      0.74      0.73       676

    accuracy                           0.73      1396
   macro avg       0.73      0.73      0.73      1396
weighted avg       0.73      0.73      0.73      1396



In [8]:
# Build the test feature matrix with the same columns, in the same order,
# as the training matrix.
print('read test data')
test = read_test()
for feature in features:
    # 'language' and 'text' were appended to `features` earlier; they are raw
    # columns of the frame, not precomputed feature files.
    if feature in ['language', 'text']:
        continue
    print('read precomputed feature', feature)
    feature_path = build_feature_path('TEST_REL', feature)
    feature_df = pd.read_csv(feature_path, index_col='id')
    # Same prefixing scheme as for the training features.
    feature_df.columns = [feature + "_" + column for column in feature_df.columns]
    test = pd.merge(test, feature_df, how='left', left_index=True, right_index=True)

print('encode language')
# Reuse the encoder fitted on the training data. The previous fit_transform
# re-learned the mapping from the test set, which can assign different integer
# codes than the model was trained on (e.g. if the test set lacks a language)
# and also overwrote the fitted encoder. transform() raises ValueError on a
# language that was not present in training instead of silently mis-encoding it.
test['language'] = language_le.transform(test.language)

print('preprocess text')
test['text'] = test.text.apply(partial(preprocess, fix_encoding=True))

# Select columns by iterating train.columns so the order matches the training matrix.
X_test = test[[column for column in train.columns if any(column.startswith(feature) for feature in features)]]


read test data
read precomputed feature bert_avg_all_but_first_binary_scaled
read precomputed feature boosters_selected
read precomputed feature char_prediction
read precomputed feature hashtags_selected
read precomputed feature hedges_selected
read precomputed feature mentions_total
read precomputed feature female_words_selected
read precomputed feature male_words_selected
read precomputed feature most_similar_scale_selected
read precomputed feature perspective_selected
read precomputed feature perspective_difference_selected
read precomputed feature senpai_selected
read precomputed feature senpai_unclustered_selected
read precomputed feature sif
read precomputed feature vader_selected
encode language
preprocess text




In [9]:

# Apply the already-fitted column transformer to the test matrix
# (transform only; nothing is re-fitted here).
X_test_char = char.transform(X_test)




In [10]:
# Predict the encoded task1 class for every test row.
y_test_pred = automl.predict(X_test_char)

In [11]:
# Map the encoded predictions back to the original label strings.
# NOTE: not idempotent -- y_test_pred is rebound from encoded values to labels,
# so re-run the predict cell before running this cell a second time.
y_test_pred= y_le.inverse_transform(y_test_pred)

In [12]:
# Sanity check: show the distinct labels among the test predictions.
np.unique(y_test_pred)

array(['non-sexist', 'sexist'], dtype=object)

In [13]:
# Write the test predictions as a CSV with columns 'id' and 'task1'.
predictions = pd.Series(y_test_pred, index=test.index, name='task1')
submission = pd.DataFrame(predictions).reset_index()

column_formatters = {
    'id': lambda x: "{:06d}".format(x),  # zero-pad ids to six digits
    'task1': lambda x: x,                # labels are written unchanged
}
submission = submission.transform(column_formatters)
submission.to_csv('/bigdata/sexism/utils/temp/autosklearn_out/task1.csv', index=False)

In [14]:
# Print what automl.show_models() reports for the fitted search.
print(automl.show_models())

[(0.240000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'extra_trees', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'median', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'normalize', 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_classification', 'classifier:extra_trees:bootstrap': 'False', 'classifier:extra_trees:criterion': 'gini', 'classifier:extra_trees:max_depth': 'None', 'classifier:extra_trees:max_features': 0.9292309396985746, 'classifier:extra_trees:max_leaf_nodes': 'None', 'classifier:extra_trees:min_impurity_decrease': 0.0, 'classifier:extra_trees:min_samples_leaf': 1, 'classifier:extra_trees:min_samples_split': 10, 'classifier:extra_trees:min_weight_fraction_leaf': 0.0, 'data_preprocessi

In [17]:
# Write the test predictions as a headerless TSV with the columns
# test_case, id, task1.
predictions = pd.Series(y_test_pred, index=test.index, name='task1')
column_formatters = {
    'id': lambda x: "{:06d}".format(x),  # zero-pad ids to six digits
    'task1': lambda x: x,                # labels are written unchanged
}
results_df = pd.DataFrame(predictions).reset_index().transform(column_formatters)

# Every row carries the same test-case tag.
results_df['test_case'] = 'EXIST2021'

tsv_columns = ['test_case', 'id', 'task1']
results_df[tsv_columns].to_csv(
    '/bigdata/sexism/utils/temp/autosklearn_out/task1.tsv',
    sep='\t',
    header=None,
    index=False,
)