In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
# These module-level settings are read by the setup sections further down.
# width/height feed matplotlib's figure.figsize
fig_width = 7
fig_height = 5
# passed to set_matplotlib_formats; the value 'pdf' also switches on the pandas LaTeX repr
fig_format = 'retina'
# used for both figure.dpi and savefig.dpi
fig_dpi = 96
# when non-empty, assigned to InteractiveShell.ast_node_interactivity
interactivity = ''
# not read again in this cell (the shiny section tests a literal False instead)
is_shiny = False
# gates the itables/altair dashboard defaults
is_dashboard = False
# True selects the "notebook_connected" plotly renderer, False selects "notebook"
plotly_connected = True

# matplotlib defaults / format
# Best-effort: any failure in this section (for example matplotlib not being
# importable) is swallowed so the setup cell keeps going.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  # NOTE(review): recent IPython releases deprecate this import location in
  # favour of matplotlib_inline.backend_inline.set_matplotlib_formats — confirm
  # against the installed IPython before changing.
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
# Best-effort: if plotly cannot be imported or configured, nothing is changed.
try:
  import plotly.io as pio
  # Choose the renderer from the plotly_connected flag.
  pio.renderers.default = "notebook_connected" if plotly_connected else "notebook"
  # Give every registered template the same tight margins (only a 30px top margin).
  for template_name in pio.templates.keys():
    pio.templates[template_name].layout.margin = dict(t=30,r=0,b=0,l=0)
except Exception:
  pass

# disable itables paging for dashboards
if is_dashboard:
  try:
    from itables import options
    # Dashboard table defaults, applied one attribute at a time in this order.
    itables_defaults = (
      ('dom', 'fiBrtlp'),
      ('maxBytes', 1024 * 1024),
      ('language', dict(info = "Showing _TOTAL_ entries")),
      ('classes', "display nowrap compact"),
      ('paging', False),
      ('searching', True),
      ('ordering', True),
      ('info', True),
      ('lengthChange', False),
      ('autoWidth', False),
      ('responsive', True),
      ('keys', True),
      ('buttons', []),
    )
    for option_name, option_value in itables_defaults:
      setattr(options, option_name, option_value)
  except Exception:
    pass

  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'
    def make_theme(name):
        """Wrap the theme registered under `name` so missing defaults are filled in."""
        # Capture the currently registered theme callable before it is replaced.
        nonTheme = alt.themes._plugins[name]
        def patch_theme(*args, **kwargs):
            """Return the wrapped theme's dict with container sizing and font sizes defaulted."""
            existingTheme = nonTheme()
            # Only fill in values the wrapped theme did not set itself.
            existingTheme.setdefault('height', 'container')
            existingTheme.setdefault('width', 'container')
            config = existingTheme.setdefault('config', dict())

            # Configure the default font sizes
            title_font_size = 15
            header_font_size = 13
            axis_font_size = 12
            legend_font_size = 12
            mark_font_size = 12
            tooltip = False

            # Axis, legend and header all take the same pair of font-size keys.
            for section_name, font_size in (
                ('axis', axis_font_size),
                ('legend', legend_font_size),
                ('header', header_font_size),
            ):
                section = config.setdefault(section_name, dict())
                section.setdefault('labelFontSize', font_size)
                section.setdefault('titleFontSize', font_size)

            # Title
            config.setdefault('title', dict()).setdefault('fontSize', title_font_size)

            # Marks
            mark = config.setdefault('mark', dict())
            mark.setdefault('fontSize', mark_font_size)

            # Mark tooltips (disabled: `tooltip` is False above)
            if tooltip and 'tooltip' not in mark:
                mark['tooltip'] = dict(content="encoding")

            return existingTheme

        return patch_theme

    # We can only do this once per session
    if theme_sentinel not in alt.themes.names():
      for registered_name in alt.themes.names():
        alt.themes.register(registered_name, make_theme(registered_name))

      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
# pandas is imported even when fig_format is not 'pdf'. That import happens
# before the kernel-dependency scan over sys.modules further down, so the
# statement order here is deliberate-looking and left untouched.
# Best-effort: any failure (e.g. pandas not installed) is ignored.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
# Skipped when `interactivity` is the empty string (as set at the top of this cell).
# Otherwise the value is written to the InteractiveShell class attribute.
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies
# Maps the source path of every loaded module to that file's modification time
# and prints the mapping as a single JSON line.
kernel_deps = {}
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  # Compiled-file paths are turned into the matching source path by
  # dropping the trailing character (".pyc"/".pyo" -> ".py").
  if path and path.endswith((".pyc", ".pyo")):
    path = path[:-1]
  # Modules without a file, or whose file is not on disk, are left out.
  if path and os.path.exists(path):
    kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The condition is a non-empty string literal, so it is always true and the
# chdir always runs in this copy of the cell.
# NOTE(review): this is an absolute, machine-specific path (presumably injected
# by the rendering tool); the cell fails on any machine where it does not exist.
if r'/home/chainsawriot/dev/methodshub-bertclassification':
  os.chdir(r'/home/chainsawriot/dev/methodshub-bertclassification')

# reset state
# IPython magic: clears the interactive namespace, so the names bound above
# (imports included) are not available to the code that follows.
%reset

# shiny
# The shiny check is a literal False (not a variable) because this code runs after
# the %reset above, and we don't want to leave a variable behind in global scope.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions
    def _display_if_has_repr_html(x):
      """Display `x` as HTML when it provides `_repr_html_`, then return it unchanged."""
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if hasattr(x, '_repr_html_'):
        display(HTML(x._repr_html_()))
      return x

    # ideally we would undo the call to ast_transformers.append
    # at the end of this block whenever an error occurs, we do
    # this for now as it will only be a problem if the user
    # switches from shiny to not-shiny mode (and even then likely
    # won't matter)
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    class _FunctionDefReprHtml(_ast.NodeTransformer):
      """Prepends the display decorator to every visited function definition."""

      def _prepend_display_decorator(self, node):
        display_decorator = _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        node.decorator_list.insert(0, display_decorator)
        return node

      # Sync and async definitions are handled identically.
      visit_FunctionDef = _prepend_display_decorator
      visit_AsyncFunctionDef = _prepend_display_decorator

    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    pass

def ojs_define(**kwargs):
  """Publish keyword arguments as an ``ojs-define`` script element.

  Every keyword becomes a ``{"name": ..., "value": ...}`` entry in a
  ``contents`` list. The list is serialized with ``json.dumps`` and displayed
  as an HTML ``<script type="ojs-define">`` element carrying the display
  metadata ``ojs_define=True``.

  pandas Series and DataFrames (including subclasses) are converted to a dict
  that maps each column name to the list of that column's values; all other
  values are passed to ``json.dumps`` unchanged.
  """
  # Imported locally so the function does not rely on module-level names.
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(value):
    try:
      import pandas as pd
    except ImportError: # don't do the magic when pandas is not available
      return value
    if isinstance(value, pd.Series):
      value = pd.DataFrame(value)
    if isinstance(value, pd.DataFrame):
      # After transposing, "index" holds the column names and "data" holds
      # one list of values per column.
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define




In [2]:
import pandas as pd
import numpy as np

# For deep learning
import torch

In [3]:
# Whether PyTorch reports an available CUDA device. Later cells pass this flag
# as `use_cuda` and use it to decide on multiprocessing settings.
gpu_avail = torch.cuda.is_available()
gpu_avail

True

In [4]:
# Full dataset (left commented out; the smaller sample below is loaded instead):
## sexism_data = pd.read_csv('sexism_data/sexism_data.csv')

sexism_data = pd.read_csv('sexism_data/sexism_sample.csv')

# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# sample was saved together with its index; index_col=0 would drop it — confirm.
sexism_data.head()

Unnamed: 0,id,dataset,text,toxicity,sexist,of_id
0,10330,other,MENTION3839 MENTION2475 would have to see what...,0.20388,False,-1
1,5703,callme,I hate it when women beat up men in movies!,0.661064,True,-1
2,12331,scales,Men are better at science and math than women are,0.531301,True,-1
3,12855,callme,"""MENTION149: MENTION2551 ps I'd never listen t...",0.27866,False,-1
4,2240,other,MENTION279 Vivek W is the Carrot Top of academ...,0.084303,False,-1


In [5]:
# Drop rows whose 'sexist' value is missing; that column is used below both as
# the stratification key and as the classification target.
sexism_data = sexism_data.dropna(subset = 'sexist')

In [6]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

In [7]:
#| eval: false
# Root logger at INFO, while the "transformers" logger is raised to WARNING
# so that it only reports warnings and errors.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [8]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the 'sexist' label. The fixed random_state makes the
# split (and therefore every metric computed below) reproducible across runs.
train_df, test_df = train_test_split(
    sexism_data,
    stratify=sexism_data['sexist'],
    test_size=0.2,
    random_state=42,
)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only, then add an integer 'labels'
# column to both splits. `.assign` returns new frames instead of writing into
# the frames produced by the split, which avoids chained-assignment warnings
# and keeps the cell safe to re-run.
le = LabelEncoder()
le.fit(train_df['sexist'])
train_df = train_df.assign(labels=le.transform(train_df['sexist']))
test_df = test_df.assign(labels=le.transform(test_df['sexist']))

In [10]:
# to see which number was mapped to which class:
# the i-th element of the result is the original class encoded as integer i.
list(le.inverse_transform([0,1]))

[False, True]

In [11]:
# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=3, overwrite_output_dir=True)

# we set some additional parameters when using a GPU
# (set before the model is created, so the settings do not depend on the model
# keeping a live reference to this args object)
if gpu_avail:
    model_args.use_multiprocessing = False
    model_args.use_multiprocessing_for_evaluation = False

# Create a ClassificationModel
model = ClassificationModel(
    "distilbert", "distilbert-base-uncased", args=model_args, use_cuda=gpu_avail,
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Train the model
# train_df carries the 'text' column and the integer 'labels' column added above.
model.train_model(train_df)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/20 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/20 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/20 [00:00<?, ?it/s]

(60, 0.45164629618326824)

In [13]:
# Example input for a single prediction. The apostrophe in "doesn’t" is the
# typographic one (U+2019); it had been mis-decoded into a three-character
# mojibake sequence, which changed the text the model was given.
sexist_tweet = "A woman will never be truly fulfilled in life if she doesn’t have a committed long-term relationship with a man"
predictions, raw_outputs = model.predict([sexist_tweet])
# Map the predicted integer back to the original label value.
le.inverse_transform(predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

array([ True])

In [14]:
# Counter-example: a neutral, factual sentence.
# NOTE: in the recorded output this sentence is also labelled True, i.e. the
# model trained on the small sample produces a false positive here.
nonsexist_tweet = "International Women's Day (IWD) is a holiday celebrated annually on March 8 as a focal point in the women's rights movement."
predictions, raw_outputs = model.predict([nonsexist_tweet])
le.inverse_transform(predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

array([ True])

In [15]:
# Evaluate the model
# eval_model returns three values; `result` is the metrics dict displayed below
# (mcc, accuracy, f1_score, confusion-matrix counts, auroc, auprc, eval_loss).
result, model_outputs, wrong_predictions = model.eval_model(test_df)
result

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

{'mcc': 0.6713171133426189,
 'accuracy': 0.825,
 'f1_score': 0.8444444444444444,
 'tp': 19,
 'tn': 14,
 'fp': 6,
 'fn': 1,
 'auroc': 0.9524999999999999,
 'auprc': 0.9589709722121558,
 'eval_loss': 0.3443008363246918}

In [16]:
# sklearn's classification report gives per-class precision, recall and F1
from sklearn.metrics import classification_report

# Predict on the raw test texts; the second return value is not needed here.
preds, _ = model.predict(test_df['text'].tolist())
# preds stay as encoded integers so they compare directly with the encoded
# 'labels' column (le.inverse_transform(preds) would give the original values).

print(classification_report(y_true=test_df['labels'], y_pred=preds))

  0%|          | 0/1 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.93      0.70      0.80        20
           1       0.76      0.95      0.84        20

    accuracy                           0.82        40
   macro avg       0.85      0.82      0.82        40
weighted avg       0.85      0.82      0.82        40



In [17]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [18]:
#| eval: false
# Pretrained checkpoint to fine-tune, and the device matching GPU availability.
model_name = 'distilbert-base-uncased'
device_name = 'cuda' if gpu_avail else 'cpu'

# Upper bound on tokens per document; anything beyond it is truncated.
max_length = 512

# Directory the fine-tuned model is saved to (any name works).
cached_model_directory_name = 'output_hf'

In [19]:
#| eval: false
# Plain arrays of the texts and encoded labels for each split.
# NOTE(review): none of these four names is read by a later cell in this
# notebook — possibly leftover from an earlier version; confirm before removing.
train_texts = train_df['text'].values
train_labels = train_df['labels'].values

test_texts = test_df['text'].values
test_labels = test_df['labels'].values

In [20]:
#| eval: false
# Load the tokenizer that belongs to the checkpoint named in model_name.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [21]:
#| eval: false
from datasets import Dataset

# Build the Hugging Face datasets under their own names. Rebinding train_df /
# test_df to Dataset objects would make this cell (and the cell that reads
# train_df['text'].values) fail when re-run.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch.

    Every sequence is padded to, and truncated at, `max_length` tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


tokenized_train_df = train_dataset.map(tokenize_function, batched=True)
tokenized_test_df = test_dataset.map(tokenize_function, batched=True)

In [22]:
#| eval: false
# One output label per class known to the label encoder; the model is moved to
# the device selected in device_name (previously `.to()` was called without a
# target, so device_name was never used).
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_)).to(device_name)

In [23]:
#| eval: false
import accelerate

In [24]:
#| eval: false
# Only these settings are overridden; everything else keeps the library default.
training_args = TrainingArguments(
    output_dir='./results',   # output directory
    num_train_epochs=3,       # total number of training epochs
    report_to='none',
)

In [25]:
#| eval: false
from sklearn.metrics import accuracy_score
def compute_metrics(pred):
    """Return ``{'accuracy': ...}`` for a prediction object.

    Reads ``pred.label_ids`` as the reference labels and takes the argmax over
    the last axis of ``pred.predictions`` as the predicted class.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [26]:
#| eval: false
# No evaluation dataset is passed here; the test set is only used later through
# trainer.predict.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_train_df,         # training dataset
    compute_metrics=compute_metrics      # our custom evaluation function
)

In [27]:
#| eval: false
# Fine-tune the model on tokenized_train_df with the training arguments bound to the trainer.
trainer.train()

In [28]:
#| eval: false
# Save the fine-tuned model to the directory named in cached_model_directory_name ('output_hf').
trainer.save_model(cached_model_directory_name)

In [29]:
#| eval: false
# Optional: reload the saved weights instead of retraining.
# NOTE(review): as written, the line below would bind a bare model (not a Trainer)
# to `trainer`, so the later trainer.predict(...) call would not work with it —
# the loaded model would need to be wrapped in a Trainer first.
# trainer = DistilBertForSequenceClassification.from_pretrained(cached_model_directory_name)

In [30]:
#| eval: false
# Run the trainer on the tokenized test set; the `.predictions` attribute of the
# result is turned into class labels in the next cell.
predicted_results = trainer.predict(tokenized_test_df)

In [31]:
#| eval: false
# Index of the highest-scoring class for each example, flattened into a plain 1D Python list.
predicted_labels = predicted_results.predictions.argmax(-1).flatten().tolist()
predicted_labels[:5]

In [32]:
#| eval: false
# Per-class metrics for the Trainer-based model: the dataset's 'labels' column
# is the reference, predicted_labels the prediction. classification_report is
# the function imported in the earlier evaluation cell.
print(classification_report(tokenized_test_df['labels'],
                            predicted_labels))

In [33]:
#| eval: false
# Load the annotation file. Despite the .csv extension it is read as
# tab-separated (sep='\t').
sexism_data_annotations = pd.read_csv('sexism_data/all_data_annotations.csv', sep = '\t')
sexism_data_annotations.head()

In [34]:
#| eval: false
# Distinct values of the '_id' column; the next cell computes one majority
# label per entry of this array.
tweets = sexism_data_annotations['_id'].unique()

In [35]:
#| eval: false
from collections import Counter

def majority_label(values):
    """Return the most frequent value; ties go to the value encountered first."""
    return Counter(values).most_common(1)[0][0]

# Group the annotations once instead of re-filtering the whole frame for every
# tweet (which is quadratic in the number of annotation rows).
majority_by_tweet = {
    tweet_id: (
        majority_label(tweet_annotations['content'].values),   # majority label for content
        majority_label(tweet_annotations['phrasing'].values),  # majority label for phrasing
    )
    for tweet_id, tweet_annotations in sexism_data_annotations.groupby('_id', sort=False)
}

# Look the labels up in the order of `tweets` so both lists stay aligned with it.
content_labels = [majority_by_tweet[tweet][0] for tweet in tweets]
phrasing_labels = [majority_by_tweet[tweet][1] for tweet in tweets]

In [36]:
#| eval: false
# One row per tweet with its majority content and phrasing label. Building the
# frame from a column dict keeps each column's own dtype; constructing it from
# a list of rows and transposing would turn every column into `object`.
finegrained_sexism_data = pd.DataFrame({
    '_id': tweets,
    'content_label': content_labels,
    'phrasing_label': phrasing_labels,
})
finegrained_sexism_data

In [37]:
#| eval: false
# Number of tweets per majority content label.
finegrained_sexism_data.groupby('content_label').size()

In [38]:
#| eval: false
# Number of tweets per majority phrasing label.
finegrained_sexism_data.groupby('phrasing_label').size()

In [39]:
#| eval: false

# Attach each tweet's text and binary label to its fine-grained labels.
# The frame loaded above names its identifier column 'id', whereas the
# annotations use '_id', so selecting '_id' directly raises a KeyError. The
# identifier column is looked up under either name and aligned to '_id'.
# NOTE(review): assumes 'id' and '_id' hold the same tweet identifiers — confirm.
id_column = '_id' if '_id' in sexism_data.columns else 'id'
tweet_texts = sexism_data[[id_column, 'text', 'sexist']].rename(columns={id_column: '_id'})
finegrained_sexism_data = pd.merge(finegrained_sexism_data, tweet_texts)

In [40]:
#| eval: false
# Content-label counts again, now restricted to the tweets that survived the merge.
finegrained_sexism_data.groupby(['content_label']).size()

In [41]:
#| eval: false
# we also change the label range for simpletransformers, making them range from 0 to 2.
label_map = {1 : 0,
             2 : 1,
             6 : 2}

# Keep only the content categories that have a mapping (the keys of label_map).
# `.copy()` yields an independent frame, so the column assignment below does
# not write into a slice of the unfiltered data.
# NOTE: this cell is not idempotent — running it twice would filter and remap
# labels that have already been remapped.
finegrained_sexism_data = finegrained_sexism_data[
    finegrained_sexism_data['content_label'].isin(list(label_map))
].copy()
finegrained_sexism_data['content_label'] = [label_map[i] for i in finegrained_sexism_data['content_label']]
finegrained_sexism_data.groupby(['content_label']).size()

In [42]:
#| eval: false
# Name of the column used as the target in the multi-class cells below
# (stratification, number of labels, training and evaluation).
category = 'content_label'

In [43]:
#| eval: false
# Stratified 80/20 split on the chosen category; the fixed random_state keeps
# the split reproducible across runs.
multi_train_df, multi_test_df = train_test_split(finegrained_sexism_data,
                                                 stratify=finegrained_sexism_data[category],
                                                 test_size=0.2,
                                                 random_state=42)

In [44]:
#| eval: false
# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=5,
                                output_dir='output_st',
                                overwrite_output_dir=True)

# we set some additional parameters when using a GPU
# (set before the model is created, so the settings do not depend on the model
# keeping a live reference to this args object)
if gpu_avail:
    model_args.use_multiprocessing = False
    model_args.use_multiprocessing_for_evaluation = False

# Create a ClassificationModel with one output per distinct value of the category column
model = ClassificationModel(
    "distilbert", "distilbert-base-uncased", num_labels=len(finegrained_sexism_data[category].unique()),
    use_cuda=gpu_avail,
    args=model_args
)

In [45]:
#| eval: false
# Superseded: the labels are already remapped to 0..2 through label_map above,
# so this older "shift by one" adjustment stays disabled.
# multi_train_df['content_label'] = [i-1 for i in multi_train_df['content_label']]
# multi_test_df['content_label'] = [i-1 for i in multi_test_df['content_label']]

In [46]:
#| eval: false
# Reduce both splits to the text column followed by the target column.
multi_train_df = multi_train_df.loc[:, ['text', category]]
multi_test_df = multi_test_df.loc[:, ['text', category]]

In [47]:
#| eval: false
# Train the model.
# NOTE(review): multi_train_df's columns are 'text' and the category column, not
# 'text' and 'labels' as in the binary example — confirm simpletransformers
# accepts this layout (it may fall back to positional columns).
model.train_model(multi_train_df)

In [48]:
#| eval: false
# Classify the example sentence from the binary section with the multi-class
# model; the prediction is an integer in the remapped 0..2 label range.
predictions, raw_outputs = model.predict([sexist_tweet])
predictions

In [49]:
#| eval: false
# Predict the category of every test text; the second return value is discarded.
preds, _ = model.predict(list(multi_test_df['text'].values))

In [50]:
#| eval: false
# Per-class metrics for the multi-class model: the category column is the
# reference, preds the prediction.
print(classification_report(multi_test_df[category], preds))