In [2]:
%cd drive/MyDrive/collab_sandbox/cerf_text_classification/
!ls

/content/drive/MyDrive/collab_sandbox/cerf_text_classification
blend_models
cerf_classification_distilroberta-base
cerf_classification_machine_learning.ipynb
cerf_classification_roberta-base
cerf_parsing.ipynb
CERF_texts_parsing.zip
cert_classification.ipynb
datasets
learnamericanenglishonline_links.csv
learnamericanenglishonline_links.gsheet
manual_dataset_texts.gsheet
models
wandb


In [14]:
!pip install -q transformers==4.6.1 datasets sentencepiece textstat

[K     |████████████████████████████████| 102kB 3.0MB/s 
[K     |████████████████████████████████| 1.9MB 17.4MB/s 
[?25h

In [15]:
import pandas as pd
import os
from transformers import TrainingArguments, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch import nn
import torch 
from datasets import load_dataset, load_metric
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import textstat

def seed_all():
  SEED = 1234
 
  # random.seed(SEED)
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  torch.cuda.manual_seed(SEED)
  torch.backends.cudnn.deterministic = True
seed_all()

In [7]:
texts_dataset = pd.read_csv("./datasets/level_texts_1959.csv", lineterminator='\n')

In [8]:
pretrained_model_name = "distilroberta-base"
class CERFDataset(Dataset):
  def __init__(self, dataset_csv=None ):
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    self.tokenizer = tokenizer
    self.dataset_csv = dataset_csv.reset_index().to_dict('records')
    self.max_len = 512
    self.dataset = None
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.targets = {
        'A1': 0,
        'A2': 1,
        'B1': 2,
        'B2': 3,
        'C1': 4
    }

  def __len__(self):
    return len(self.dataset_csv)

  def __getitem__(self, idx):
    example = self.dataset_csv[idx]

    source_encoding = self.tokenizer(
      example['source_text'],
      max_length=self.max_len,
      padding='max_length',
      pad_to_max_length=True,
      truncation=True,
    )
    
    target_label = self.targets[example['level']]

    return {'input_ids': torch.tensor(source_encoding['input_ids'], ),#device=self.device),
            'label_ids': torch.tensor([target_label] ),#device=self.device), 
            'attention_mask': torch.tensor(source_encoding['attention_mask'], )}#device=self.device)}

In [None]:
train_data, val_data = train_test_split(texts_dataset, test_size=0.15, random_state=1234)
train_dataset = CERFDataset(train_data)
val_dataset = CERFDataset(val_data)
len(train_dataset), len(val_dataset)

In [40]:
pretrain_model = AutoModelForSequenceClassification.from_pretrained('./cerf_classification_roberta-base/', num_labels=5)

In [41]:
lang_model_probs = []
pretrain_model.eval()

for item in val_dataset:
  input_ids = item['input_ids'].unsqueeze(0)
  attention_mask = item['attention_mask'].unsqueeze(0)
  with torch.no_grad():
    text_prob = pretrain_model(input_ids, attention_mask)['logits'].cpu().numpy()
    lang_model_probs.append(text_prob)

In [None]:
# lang_model_probs

In [16]:
text_stat_functions = [
      textstat.flesch_reading_ease,
      textstat.smog_index,
      textstat.flesch_kincaid_grade,
      textstat.coleman_liau_index,
      textstat.automated_readability_index,
      textstat.dale_chall_readability_score,
      textstat.difficult_words,
      textstat.linsear_write_formula,
      textstat.gunning_fog,
      textstat.text_standard,
      textstat.fernandez_huerta,
      textstat.szigriszt_pazos,
      textstat.gutierrez_polini,
      textstat.crawford,
]

In [17]:
import nltk
import itertools


def process_example(dataset_item):
  level = dataset_item['level']
  source_text_len = dataset_item['source_text_len']
  text = dataset_item['source_text']
  
  dataset_item = {}
  dataset_item['source_text'] = text
  dataset_item['level'] = level
  dataset_item['source_text_len'] = source_text_len
  
  for stat_func in text_stat_functions:
    feature_name = str(stat_func.__name__)
    result = 0
    if 'float_output' in str(signature(stat_func)):
      result = stat_func(text, float_output=True)
    else:
      result = stat_func(text)

    dataset_item[feature_name] = result
  
  cerf_levels = {'A1': 0, 'A2': 0, 'B1': 0, 'B2': 0, 'C1': 0}

  level_word_occur = {}
  for item in nlp(text):
    lemma = str(item.lemma_)
    if word_list.get(lemma, False) and level_word_occur.get(lemma, True):
      level = word_list[lemma].upper()
      level_word_occur[lemma] = False
      cerf_levels[level] += 1
  
  for key in cerf_levels.keys():
    feature_name = 'word_count_' + key 
    dataset_item[feature_name] = cerf_levels[key]

  return dataset_item

In [20]:
LABELS_ENCODER =  {
  'A1': 0,
  'A2': 1,
  'B1': 2,
  'B2': 3,
  'C1': 4
}
tenses_entities = [
  'a1_be_have_do_in_the_past',
  'a1_can',
  'a1_comparative_exept',
  'a1_comparative_long',
  'a1_comparative_short',
  'a1_future_simple',
  'a1_have_has_got',
  'a1_past_simple_irreg',
  'a1_past_simple_reg',
  'a1_possesive_s_sing',
  'a1_possessive_s_plurar',
  'a1_present_continuous_act_rn',
  'a1_present_simple_3d_pers',
  'a1_present_simple_reg_act',
  'a1_special_questions',
  'a1_superlative_exept',
  'a1_superlative_long',
  'a1_superlative_short',
  'a1_there_is_am_are',
  'a1_there_was_were',
  'a1_there_will_be',
  'a1_to_be_future_will_be',
  'a1_to_be_past_was_were',
  'a1_to_be_present_is_am_are',
  'a1_want_would_like_to',
]

texts_dataset_stats = pd.read_csv("./datasets/texts_dataset_stats.csv" , lineterminator='\n')
texts_dataset_stats['level_num'] = texts_dataset_stats['level'].apply(lambda x: LABELS_ENCODER[x])
texts_dataset_stats = texts_dataset_stats.drop(columns=['source_text', 'level'])
texts_dataset_stats = texts_dataset_stats.drop(columns=tenses_entities)

In [21]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score


X = texts_dataset_stats.drop(columns=[ 'level_num']).values
y = texts_dataset_stats[[ 'level_num']].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 1234)
 
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LightGBM Model accuracy score: 0.8639


In [None]:
y_pred_prob = clf.predict_proba(X_test)

In [42]:
lang_model_probs_new = np.array(lang_model_probs).squeeze(1)

In [60]:
for weight in [0.5, 0.53, 0.56, 0.6]:
  weight_1 = weight
  weight_2 = 1 - weight_1
  blend_predictions = lang_model_probs_new*weight_1 + y_pred_prob*weight_2
  blend_predictions = blend_predictions.argmax(-1)
  accuracy = accuracy_score(blend_predictions, y_test)
  print(f"Accuracy => {accuracy}, weight={weight}")

Accuracy => 0.9013605442176871, weight=0.5
Accuracy => 0.9013605442176871, weight=0.53
Accuracy => 0.9013605442176871, weight=0.56
Accuracy => 0.9013605442176871, weight=0.6
