In [None]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

from preprocessing import preprocess
from utils import read_train, build_feature_path, read_train_no_validation, read_test, read_validation
import autosklearn


import pickle

import autosklearn.classification

from sklearn.metrics import classification_report


In [125]:
print('read training data')
train = read_train()
features = [
    'boosters_selected', 
    'char_prediction',
    'hashtags_selected',
    'hedges_selected',
    'mentions_total',
    'female_words',
    'male_words',
    'most_similar_scale',
    'perspective',
    'perspective_difference',
    'sif',
    'vader_selected'
 ]
for feature in features:
    print('read precomputed feature', feature)
    feature_path = build_feature_path('TRAINING_REL', feature)
    feature_df = pd.read_csv(feature_path, index_col='id')
    feature_df.columns = [feature + "_" + column for column in feature_df.columns]
    train = pd.merge(train, feature_df, how='left', left_index=True, right_index=True)
print('encode language')
language_le = LabelEncoder()
train['language'] = language_le.fit_transform(train.language)
print('preprocess text')
train['text'] = train.text.apply(partial(preprocess, fix_encoding=True))
features += ['language', 'text']
features += ['language']

train_ = train.loc[read_train_no_validation().index]
X = train_[[column for column in train.columns if any(column.startswith(feature) for feature in features)]]

y = train_.task2.values
y_le = LabelEncoder()
y = y_le.fit_transform(y)

print('X.shape', X.shape, 'y.shape', y.shape, 'unique y', np.unique(y))
labels = y_le.classes_


read training data
read precomputed feature boosters_selected
read precomputed feature char_prediction
read precomputed feature hashtags_selected
read precomputed feature hedges_selected
read precomputed feature mentions_total
read precomputed feature female_words
read precomputed feature male_words
read precomputed feature most_similar_scale
read precomputed feature perspective
read precomputed feature perspective_difference
read precomputed feature sif
read precomputed feature vader_selected
encode language
preprocess text
X.shape (5581, 882) y.shape (5581,) unique y [0 1 2 3 4 5]


In [None]:
char = ColumnTransformer(transformers=[
    ('cv', Pipeline(steps=[('cv', CountVectorizer(analyzer='char',
                                                                  ngram_range=(3, 4))),
                                                                  ('fs',SelectFromModel(estimator=MultinomialNB())  # use multitask in case of task2
                                        )]), 'text'),
                                       
                                       ('scale',  CountVectorizer(), 'most_similar_scale_scale'),
                         ('content',  CountVectorizer(), 'most_similar_scale_sexist_content'),
                                      ],
                             remainder='passthrough'
                             )
X_char = char.fit_transform(X.fillna('dontknow'), y)

In [147]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [50, 100, 200]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [3, 5, 7, 10,]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'criterion':["gini", "entropy"],
               'class_weight':['balanced']
                           }

In [148]:
automl = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, refit=True, n_jobs = 20)

In [149]:
automl.fit(X_char.astype(float), y)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=20,
                   param_distributions={'bootstrap': [True, False],
                                        'class_weight': ['balanced'],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 5, 7, 10],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 100, 200]},
                   random_state=42, verbose=2)

In [150]:
with open('task2_gridsearch.pickle','wb+') as f:
    pickle.dump(automl, f)


In [None]:
validation = train.loc[read_validation().index]
X = validation[[column for column in train.columns if any(column.startswith(feature) for feature in features)]]

y = validation.task2.values
y = y_le.transform(y)

print('X.shape', X.shape, 'y.shape', y.shape, 'unique y', np.unique(y))
labels = y_le.classes_
X_char = X
X_char = char.transform(X.fillna('dontknow'))
y_pred = automl.predict(X_char.astype(float))

In [152]:
print(classification_report(y_true=y, y_pred=y_pred, target_names=labels))


                              precision    recall  f1-score   support

      ideological-inequality       0.66      0.94      0.77       173
misogyny-non-sexual-violence       0.67      0.92      0.77       137
                  non-sexist       0.96      0.61      0.74       720
             objectification       0.93      0.95      0.94       100
             sexual-violence       0.59      0.99      0.74       104
      stereotyping-dominance       0.63      0.88      0.73       162

                    accuracy                           0.76      1396
                   macro avg       0.74      0.88      0.78      1396
                weighted avg       0.82      0.76      0.76      1396



In [160]:
print(classification_report(y_true=(y!=2), y_pred=(y_pred!=2), target_names=['non-sexist', 'sexist']))


              precision    recall  f1-score   support

  non-sexist       0.96      0.61      0.74       720
      sexist       0.70      0.97      0.81       676

    accuracy                           0.78      1396
   macro avg       0.83      0.79      0.78      1396
weighted avg       0.83      0.78      0.78      1396



In [153]:
print('read test data')
test = read_test()
for feature in features:
    if feature in ['language', 'text']:continue
    print('read precomputed feature', feature)
    feature_path = build_feature_path('TEST_REL', feature)
    feature_df = pd.read_csv(feature_path, index_col='id')
    feature_df.columns = [feature + "_" + column for column in feature_df.columns]
    test = pd.merge(test, feature_df, how='left', left_index=True, right_index=True)
print('encode language')
test['language'] = language_le.fit_transform(test.language)
# print('preprocess text')
# test['text'] = test.text.apply(partial(preprocess, fix_encoding=True))

X_test = test[[column for column in train.columns if any(column.startswith(feature) for feature in features)]]


read test data
read precomputed feature boosters_selected
read precomputed feature char_prediction
read precomputed feature hashtags_selected
read precomputed feature hedges_selected
read precomputed feature mentions_total
read precomputed feature female_words
read precomputed feature male_words
read precomputed feature most_similar_scale
read precomputed feature perspective
read precomputed feature perspective_difference
read precomputed feature sif
read precomputed feature vader_selected
encode language


In [None]:

X_test_char = char.transform(X_test.fillna('dontknow'))
# X_test_char = X_test

In [155]:
y_test_pred = automl.predict(X_test_char)

In [156]:
y_test_pred= y_le.inverse_transform(y_test_pred)

In [157]:
np.unique(y_test_pred)

array(['ideological-inequality', 'misogyny-non-sexual-violence',
       'non-sexist', 'objectification', 'sexual-violence',
       'stereotyping-dominance'], dtype=object)

In [158]:
pd.DataFrame(pd.Series(y_test_pred, index=test.index, name='task2')).reset_index().transform({'id':lambda x: "{:06d}".format(x),
                                                                                             'task2':lambda x:x}).to_csv('task2_gridsearch.csv', index=False)

In [159]:
results_df = pd.DataFrame(pd.Series(y_test_pred, index=test.index, name='task2')).reset_index().transform({'id':lambda x: "{:06d}".format(x),
                                                                                             'task2':lambda x:x})
results_df['test_case'] = 'EXIST2021'
results_df[['test_case', 'id', 'task2']].to_csv('task2_gridsearch.tsv', sep='\t', header = None, index=False)

In [161]:
automl.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'criterion': 'entropy',
 'class_weight': 'balanced',
 'bootstrap': False}