## Model size analysis

This notebook looks at some of the considerations one needs to make about model size: how removing encoder layers from RoBERTa affects inference time and parameter count.

In [1]:
import time

import numpy as np
import torch
from transformers import RobertaModel, RobertaTokenizer, AutoModelForQuestionAnswering, AutoTokenizer

In [2]:
def test_model(model_version, iterations=100, device='cuda'):

  model = AutoModelForQuestionAnswering.from_pretrained(model_version, output_attentions=True)
  model = model.to(device)
  tokenizer = AutoTokenizer.from_pretrained(model_version)

  sentence_b = """The English name "Normans" comes from the French words Normans/Normanz, plural of Normant, modern French normand, which is itself borrowed from Old Low Franconian Nortmann "Northman" or directly from Old Norse Norðmaðr, Latinized variously as Nortmannus, Normannus, or Nordmannus (recorded in Medieval Latin, 9th century) to mean "Norseman, Viking"."""
  sentence_a = 'When was the Latin version of the word Norman first recorded?'
  inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
  input_ids = inputs['input_ids'].to(device)

  data = []
  for _ in range(iterations):
    start = time.time()
    outputs=model(input_ids.to(device))
    end = time.time()
    data.append(end-start)
  return data


def test_model2(model, tokenizer, iterations=100, device='cuda'):
  sentence_b = """The English name "Normans" comes from the French words Normans/Normanz, plural of Normant, modern French normand, which is itself borrowed from Old Low Franconian Nortmann "Northman" or directly from Old Norse Norðmaðr, Latinized variously as Nortmannus, Normannus, or Nordmannus (recorded in Medieval Latin, 9th century) to mean "Norseman, Viking"."""
  sentence_a = 'When was the Latin version of the word Norman first recorded?'
  inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
  input_ids = inputs['input_ids'].to(device)

  data = []
  for _ in range(iterations):
    start = time.time()
    outputs=model(input_ids.to(device))
    end = time.time()
    data.append(end-start)
  return data

In [None]:
# Benchmark roberta-base: 1000 timed forward passes (device defaults to 'cuda').
inf_time = test_model('roberta-base', 1000)

In [None]:
# Mean per-call duration, in seconds, of the most recent test_model run.
np.mean(inf_time)

0.01971937656402588

In [None]:
# Spread (standard deviation, in seconds) of the same per-call durations.
np.std(inf_time)

0.004353389449601591

In [None]:
# Same benchmark for roberta-large; this overwrites inf_time from the roberta-base run.
inf_time = test_model('roberta-large', 1000)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to us

In [None]:
# Mean per-call duration, in seconds, for the roberta-large run.
np.mean(inf_time)

0.030643402338027953

In [None]:
# Standard deviation, in seconds, for the roberta-large run.
np.std(inf_time)

0.001211133327135738

In [None]:
# Load roberta-base once (no .to(device) here); the layer-removal cells below copy and prune this model.
model = AutoModelForQuestionAnswering.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

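Remove a random combination of a single layer, of two layers, etc., and measure the effect. (The code below samples one random subset of encoder layers for each removal count.)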

In [None]:
import copy
def delete_encoding_layers(layer_set, model):
    """Return a copy of `model` with some RoBERTa encoder layers removed.

    The model is deep-copied first and the layers are then filtered on the
    copy, so the returned model shares no modules or parameters with `model`.
    Moving or modifying the result (e.g. `.to('cuda')`) therefore leaves the
    original untouched.

    Args:
        layer_set: Iterable of zero-based indices into
            `model.roberta.encoder.layer` to delete. Indices that do not
            correspond to an existing layer are ignored.
        model: Model exposing `roberta.encoder.layer` as a sequence of modules.

    Returns:
        A deep copy of `model` whose `roberta.encoder.layer` is a new
        `torch.nn.ModuleList` holding only the layers that were not deleted,
        in their original order.
    """
    del_set = set(layer_set)
    print('Deleting', layer_set)

    # Copy first, then prune the copy: the kept layers must belong to the
    # copy rather than being borrowed from the original model.
    pruned_model = copy.deepcopy(model)
    kept_layers = [
        layer
        for idx, layer in enumerate(pruned_model.roberta.encoder.layer)
        if idx not in del_set
    ]
    pruned_model.roberta.encoder.layer = torch.nn.ModuleList(kept_layers)
    return pruned_model

In [None]:
# Tokenizer matching the roberta-base model loaded above; test_model2 needs one
# and none is otherwise defined at notebook scope.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

data = []
layers=len(model.roberta.encoder.layer)
# For each removal count i, drop one random subset of i encoder layers and
# time 300 forward passes of the pruned model.
for i in range(0,layers):
  # Candidates are layers 1..layers-1 (layer 0 is never removed here), which is
  # why the count stops at layers-1. Indices are cast to plain ints so str(st)
  # reads the same regardless of the numpy version.
  st = set(int(idx) for idx in np.random.choice(np.arange(1,layers),i,replace=False))
  _mld = delete_encoding_layers(st,model)
  _mld=_mld.to('cuda')
  inf_time = test_model2(_mld, tokenizer, 300)
  data.append([i,str(st),np.mean(inf_time),np.std(inf_time)])


Deleting set()
Deleting {9}
Deleting {2, 4}
Deleting {8, 1, 6}
Deleting {9, 2, 3, 1}
Deleting {2, 3, 6, 8, 9}
Deleting {1, 3, 5, 6, 8, 11}
Deleting {1, 2, 3, 4, 8, 9, 11}
Deleting {1, 2, 3, 5, 6, 7, 8, 11}
Deleting {1, 2, 3, 4, 5, 6, 7, 10, 11}
Deleting {1, 2, 3, 4, 5, 6, 8, 9, 10, 11}
Deleting {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}


In [None]:
import pandas as pd

In [None]:
# One row per removal count: which layers were dropped and the mean/std of the timed forward passes.
df = pd.DataFrame(data,columns=["Removed layers count","removed layers","mean inf time", "std inf time"])


In [None]:
# Display the timing summary table.
df

Unnamed: 0,Removed layers count,removed layers,mean inf time,std inf time
0,0,set(),0.014925,0.008465
1,1,{9},0.012151,0.002384
2,2,"{2, 4}",0.014332,0.007571
3,3,"{8, 1, 6}",0.009472,0.001007
4,4,"{9, 2, 3, 1}",0.008527,0.000835
5,5,"{2, 3, 6, 8, 9}",0.0075,0.000758
6,6,"{1, 3, 5, 6, 8, 11}",0.006409,0.000607
7,7,"{1, 2, 3, 4, 8, 9, 11}",0.005554,0.00061
8,8,"{1, 2, 3, 5, 6, 7, 8, 11}",0.00461,0.000517
9,9,"{1, 2, 3, 4, 5, 6, 7, 10, 11}",0.003695,0.000485


In [None]:
import plotly.graph_objects as go
import numpy as np

# Plot mean inference time against the number of removed layers, with a
# shaded band of +/- one standard deviation around the mean.
df = df.sort_values('Removed layers count')
x = list(df['Removed layers count'])
x_rev = list(reversed(x))

# Mean line and the two edges of the band. The lower edge is reversed so that
# upper + lower traces a closed outline when paired with x + x_rev.
y1 = df['mean inf time']
y1_std = df['std inf time']
y1_upper = (y1 + y1_std).to_list()
y1_lower = (y1 - y1_std)[::-1].to_list()
y1 = y1.to_list()

band_trace = go.Scatter(
    x=x + x_rev,
    y=y1_upper + y1_lower,
    mode='lines',
    fill='toself',
    fillcolor='rgba(0,100,80,0.2)',
    line=dict(color='rgba(255,255,255,0)'),
    showlegend=False,
    name='Fair',
)
mean_trace = go.Scatter(
    x=x,
    y=y1,
    mode='lines',
    line=dict(color='rgb(0,100,80)'),
    name='RoBERTa-base',
)

fig = go.Figure(data=[band_trace, mean_trace])
fig.update_layout(
    xaxis=dict(title="Roberta Layers Removed"),
    yaxis=dict(title="Inference time (seconds)"),
)

fig.show()

In [None]:
import numpy as np
import pandas as pd
# For every removal count from 0 up to and including all layers, prune one
# random subset of encoder layers and record the pruned model's parameter count.
data = []
layers = len(model.roberta.encoder.layer)
for i in range(layers + 1):
  st = set(np.random.choice(np.arange(layers), size=i, replace=False))
  _mld = delete_encoding_layers(st, model)
  data.append([i, str(st), _mld.num_parameters()])


Deleting set()
Deleting {6}
Deleting {10, 6}
Deleting {0, 8, 10}
Deleting {1, 5, 6, 7}
Deleting {0, 2, 4, 9, 11}
Deleting {1, 3, 5, 7, 9, 11}
Deleting {0, 1, 2, 3, 4, 8, 9}
Deleting {0, 1, 2, 3, 4, 5, 8, 10}
Deleting {1, 2, 3, 4, 5, 6, 7, 8, 9}
Deleting {0, 1, 2, 3, 4, 5, 6, 7, 9, 11}
Deleting {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Deleting {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}


In [None]:
# Rebinds df (previously the timing table) to the parameter-count table: one row per removal count.
df = pd.DataFrame(data,columns=["Removed layers count","removed layers","num params"])


In [None]:
# _mld is the last model built by the loop above (i == layers), i.e. the one with every encoder layer removed.
_mld.roberta

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList()
  )
)

In [None]:
import plotly.express as px

# Line chart of parameter count against the number of removed layers.
fig = px.line(
    data_frame=df,
    x="Removed layers count",
    y="num params",
)
fig.show()

In [None]:
import plotly.graph_objects as go
import numpy as np

# Plot the parameter count against the number of removed layers.
# The parameter-count frame has no 'std inf time' column (each count is a
# single deterministic measurement), so no standard-deviation band is drawn.
df = df.sort_values('Removed layers count')
x = list(df['Removed layers count'])
y1 = df['num params'].to_list()

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=x, y=y1,
    mode='lines',
    line_color='rgb(0,100,80)',
    name='RoBERTa-base',
))

fig.update_layout(
    xaxis_title="Roberta Layers Removed",
    yaxis_title="Number of parameters",
)

fig.show()