# Model incidència 'No ha impartit classe a aquest grup'

This notebook creates and evaluates a model to detect the issue 'No ha impartit classe a aquest grup'.

A separate model is created for each language: Catalan, Spanish and English.

Only comments of type 'P : Professor' are processed.




In [1]:
import json
import os.path
import sys
import csv
import numpy as np
import pandas as pd
from colorama import Fore

In [2]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path

In [3]:
import spacy
from spacy.util import minibatch, compounding

# Pretrained spaCy models, one per language (Catalan, Spanish, English), kept here for reference
#nlp = spacy.load("ca_fasttext_wiki")
#nlp = spacy.load("es_core_news_sm")
#nlp = spacy.load("en_core_web_sm")


In [4]:
# Loads the preprocessed comments file and builds the train and test data

def _unzip_examples(examples):
    """Split a list of (text, cats) pairs into a tuple of texts and a tuple of cats.

    Returns two empty tuples for an empty list, where a bare ``zip(*examples)``
    cannot be unpacked into two names.
    """
    if not examples:
        return (), ()
    texts, cats = zip(*examples)
    return texts, cats


def load_data(limit = 0, split = 0.8, language = "ca"):
    """Build the train and test sets for one language from its preprocessed CSV.

    Reads ``pathdest + "comentaris_<language>.csv"``, keeps the rows whose
    ``TipusPregunta`` is "P" and labels a comment as positive when its
    ``TipusIncidencia`` equals 'No ha impartit classe a aquest grup'.
    Positive and negative examples are shuffled (with ``random.shuffle``) and
    split separately, so both classes appear in the train and in the test set.

    Args:
        limit: when > 0 and ``(n_true + n_false) * split`` exceeds it, the
            number of negative training examples is reduced to
            ``int(limit - n_true * split)`` (never below zero). 0 disables it.
        split: fraction of each class used for training.
        language: language code used to build the CSV file name.

    Returns:
        ``(train_texts, train_cats), (test_texts, test_cats)`` where the texts
        are tuples of comments and the cats are tuples of
        ``{"POSITIVE": bool, "NEGATIVE": bool}`` dicts.

    Reads the module-level ``debug`` (verbosity) and ``pathdest`` (CSV folder).
    """
    if debug >= 1:
        print ("LOAD_DATA")

    # Load data from file
    file = "comentaris_" + language + ".csv"
    data = pd.read_csv(pathdest + file)
    if debug >= 2:
        print ("Original data:")
        # sample() raises when asked for more rows than the frame has
        display (data.sample(min(5, len(data))))

    # Keep only the professor comments. The copy lets the new columns below be
    # added without writing into a slice of `data`.
    data_prof = data[data.TipusPregunta == "P"][["Comentari","TipusIncidencia"]].copy()
    if debug >= 1:
        print("Original data: ", data_prof.shape[0])

    if debug >= 2:
        print ("Filtered data:")
        display (data_prof.sample(min(5, len(data_prof))))

    # Build one (text, cats) tuple per row:
    #   label=True  -> {"POSITIVE": True,  "NEGATIVE": False}
    #   label=False -> {"POSITIVE": False, "NEGATIVE": True}
    data_prof["label"] = data_prof["TipusIncidencia"] == "No ha impartit classe a aquest grup"
    data_prof["tuples"] = pd.Series(
        [
            (text, {"POSITIVE": bool(flag), "NEGATIVE": not bool(flag)})
            for text, flag in zip(data_prof["Comentari"], data_prof["label"])
        ],
        index=data_prof.index,
        dtype=object,
    )
    if debug >= 2:
        print ("Tuples dataframe:")
        display (data_prof.sample(min(5, len(data_prof))))

    # Select positive cases
    data_true = data_prof[data_prof["label"]]
    if debug >= 2:
        print ("Data True cases:", data_true.shape[0])

    train_data_true = data_true["tuples"].tolist()
    random.shuffle(train_data_true)
    if debug >= 2:
        print ("Tuples list true:")
        print (train_data_true[:5])

    # Split text and label of true cases into two sequences
    texts_true, cats_true = _unzip_examples(train_data_true)
    if debug >= 2:
        print ("Texts True cases:")
        print (texts_true[0:5])
        print ("Cats True cases:")
        print (cats_true[0:5])

    # Size of the positive train part; the rest of the positives go to test
    split_true = int(len(train_data_true) * split)
    if debug >= 1:
        print ("Train data True:", split_true, ", Test data True: ", len(train_data_true)-split_true)

    # Select negative cases
    data_false = data_prof[~data_prof["label"]]
    if debug >= 2:
        print ("Data False cases:", data_false.shape[0])

    train_data_false = data_false["tuples"].tolist()
    random.shuffle(train_data_false)
    if debug >= 2:
        print ("Tuples list false:")
        print (train_data_false[:5])

    # Split text and label of false cases into two sequences
    texts_false, cats_false = _unzip_examples(train_data_false)
    if debug >= 2:
        print ("Texts False cases:")
        print (texts_false[0:5])
        print ("Cats False cases:")
        print (cats_false[0:5])

    # Size of the negative train and test parts
    n_true = len(train_data_true)
    n_false = len(train_data_false)
    if limit > 0 and (n_false + n_true) * split > limit:
        # A negative size would be read by the slice below as "all but the
        # last k", so it is clamped to zero.
        train_split_false = max(0, int(limit - n_true * split))
    else:
        train_split_false = int(n_false * split)
    test_split_false = max(0, int(n_false * (1 - split)))

    if debug >= 1:
        print ("Train data False:", train_split_false, ", Test data False: ", test_split_false)

    # Mix true and false cases and split in train and test.
    # All the true cases are included. The negative test part is taken from the
    # end of the shuffled list with an explicit start index: a slice written as
    # [-0:] would return the whole list when test_split_false is 0.
    test_start_false = n_false - test_split_false
    train_texts = texts_true[:split_true] + texts_false[:train_split_false]
    train_cats = cats_true[:split_true] + cats_false[:train_split_false]
    test_texts = texts_true[split_true:] + texts_false[test_start_false:]
    test_cats = cats_true[split_true:] + cats_false[test_start_false:]
    if debug >= 1:
        print ("Train texts: ", len(train_texts), ", Test texts: ", len(test_texts))

    # Return train data and test data
    return (train_texts, train_cats), (test_texts, test_cats)

In [5]:
def evaluate(tokenizer, textcat, texts, cats):
    """Score a text classifier's predictions against the gold labels.

    Args:
        tokenizer: callable that turns one text into a doc.
        textcat: object whose ``pipe(docs)`` yields docs carrying a ``cats``
            mapping of label -> score.
        texts: texts to classify.
        cats: gold label mappings, aligned by position with ``texts``.

    Returns:
        dict with precision ("textcat_p"), recall ("textcat_r"), F-score
        ("textcat_f") and the misclassified docs ("false_positive",
        "false_negative").

    Reads the module-level ``debug`` flag: at level 2 or above every
    misclassified doc is printed as soon as it is found.
    """
    wrong_positive = []
    wrong_negative = []

    # The two error counters start slightly above zero so the precision and
    # recall divisions below never see a zero denominator.
    hits = 0.0
    false_alarms = 1e-8
    misses = 1e-8
    rejections = 0.0

    predicted_docs = textcat.pipe(tokenizer(text) for text in texts)
    for position, doc in enumerate(predicted_docs):
        expected = cats[position]
        for label, score in doc.cats.items():
            # Skip labels without a gold value, and the NEGATIVE label
            if label not in expected or label == "NEGATIVE":
                continue

            says_yes = score >= 0.5
            says_no = score < 0.5
            gold_yes = expected[label] >= 0.5
            gold_no = expected[label] < 0.5

            if says_yes and gold_yes:
                hits += 1.0
            elif says_yes and gold_no:
                false_alarms += 1.0
                wrong_positive.append(doc)
                if debug >= 2:
                    print ("fp: ", doc)
            elif says_no and gold_no:
                rejections += 1
            elif says_no and gold_yes:
                misses += 1
                wrong_negative.append(doc)
                if debug >= 2:
                    print ("fn: ", doc)

    precision = hits / (hits + false_alarms)
    recall = hits / (hits + misses)
    total = precision + recall
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total

    return {
        "textcat_p": precision,
        "textcat_r": recall,
        "textcat_f": f_score,
        "false_positive": wrong_positive,
        "false_negative": wrong_negative,
    }


In [22]:
def train_model(model=None, language="ca", target_loss=0.001, n_texts=2000, max_iter=None):
    """Train, evaluate, save and reload a spaCy text classifier for one language.

    The pipeline gets a "textcat" component with the labels POSITIVE and
    NEGATIVE, is trained on the data returned by ``load_data`` and evaluated
    after every epoch with ``evaluate``. The trained pipeline is written to
    ``pathmodel + language`` and loaded back to check the saved copy.

    Args:
        model: name of the spaCy model to start from. When None a blank
            pipeline for ``language`` is created instead.
        language: language code; selects the data file, the sample sentences
            and the output folder.
        target_loss: training continues while the epoch loss is above this
            value or is still improving by more than this value.
        n_texts: passed to ``load_data`` as its ``limit`` argument.
        max_iter: optional upper bound on the number of epochs. None (the
            default) keeps the loss-based stopping rule as the only one.

    Reads the module-level ``debug`` and ``pathmodel``.
    """
    # Load the base pipeline from spaCy; without a model name start from a
    # blank pipeline of the requested language (spacy.load(None) would fail).
    if model is not None:
        nlp = spacy.load(model)
    else:
        nlp = spacy.blank(language)

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add labels to the text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load training and test data
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(n_texts, 0.8, language)

    if debug:
        print(
            "Using {} examples ({} training, {} evaluation)".format(
                n_texts, len(train_texts), len(dev_texts)
            )
        )

    # converts the data to the format:
    # (text, {'cats': {'POSITIVE': True, 'NEGATIVE': False}}))
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
    if debug:
        print ("Train_data: ")
        print (train_data[:5])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)

        # Start values chosen so that the loop condition holds on entry
        loss = 100
        prev_loss = 1000
        scores = None
        epochs_done = 0
        # Keep training while the loss is above the target or still improving
        # by more than the target from one epoch to the next.
        while (loss > target_loss) or (prev_loss - loss > target_loss):
            if max_iter is not None and epochs_done >= max_iter:
                print("Stopped after {} epochs (max_iter reached)".format(epochs_done))
                break
            epochs_done += 1
            prev_loss = loss
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                if debug >= 2:
                    print ("Batch: ")
                    print (batch)
                texts, annotations = zip(*batch)

                # drop=0.2 is the dropout rate applied during this update
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
            loss = losses["textcat"]

        # Misclassified dev examples of the last epoch (none if no epoch ran)
        if scores is not None:
            print ()
            print ("False positives:")
            print (scores["false_positive"])
            print ()
            print ("False negatives")
            print (scores["false_negative"])

    # test the trained model
    print ("")
    print ("Some examples: ")
    test_texts ={"ca": ["Costa que pengi els materials acordats al campus",
                    "No he tingut aquesta professora",
                    "No ha impartit classe a aquest grup",
                    "No ha corregit examens fets a octubre, i estem al mes de gener."],
            "es": ["No he tenido contacto",
                   "Muy  buenas clases",
                   "No nos dio clases",
                   # the comma after the next sentence keeps it separate from
                   # the last one (adjacent string literals are concatenated)
                   "No vimos a en clase a este docente",
                   "No se ha adaptado a la actual situación"
                  ],
            "en": ["I haven't had this professor",
                   "We don't know this person",
                   "He isn't a good professor",
                   "We had only a few lessons"
                  ]
        }
    for test_text in test_texts[language]:
        doc = nlp(test_text)
        print(test_text, doc.cats)
    print ("")

    # Save the model using the averaged parameters
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(pathmodel + language)
    print("Saved model to", pathmodel + language)

    # test the saved model
    print("Loading from", pathmodel+ language)
    nlp2 = spacy.load(pathmodel +  language)
    for test_text in test_texts[language]:
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
    

In [23]:
# Verbosity of the helper functions: 0 = silent, 1 = progress summaries,
# 2 = additionally print data samples, batches and misclassified docs
debug = 1

# Data folders (relative paths)
pathmodel = "../data/models/"        # trained models are saved to / loaded from here
pathdest = "../data/preprocessed/"   # preprocessed "comentaris_<language>.csv" files
pathori = "../data/original"

In [24]:
# Training settings
n_iter = 5  # NOTE(review): not passed to train_model in the loop below
target_loss = 0.001
n_texts = 2000

# Language code -> name of the spaCy model to start from.
# To train a single language, reduce the mapping to that entry.
languages = {"ca":"ca_fasttext_wiki", "es":"es_core_news_sm", "en":"en_core_web_sm" }

# Train one classifier per language
for language, model in languages.items():
    print()
    print ("TRAINING LANGUAGE: " + language)
    print ("Model: " + model)
    train_model(model=model, language=language, target_loss=target_loss, n_texts=n_texts)



TRAINING LANGUAGE: ca
Model: ca_fasttext_wiki




LOAD_DATA
Original data:  3955
Train data True: 58 , Test data True:  15
Train data False: 1941 , Test data False:  776
Train texts:  1999 , Test texts:  791
Using 2000 examples (1999 training, 791 evaluation)
Train_data: 
[("Aquest professor no m'ha impartit l'assignatura. Només m'ha penjat un PDF i unes instruccions per fer un treball amb el qual puntuar-nos. Això no és docència.", {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ('No he tingut al Juan!', {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ('Per la situació actual, no hem tingut classes presencials amb aquest professor. Els recursos online que ens ha donat estaven poc explicats.', {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ("No l'hem tingut cap dia.", {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ('Poques sessions per avaluar correctament.', {'cats': {'POSITIVE': True, 'NEGATIVE': False}})]
Training the model...
LOSS 	  P  	  R  	  F  
0.790	0.909	0.667	0.769
0.123	0.909	0.667	0.769
0.025	0.833	0.667	0.741




Costa que pengi els materials acordats al campus {'POSITIVE': 8.81040523381671e-06, 'NEGATIVE': 0.9999911785125732}
No he tingut aquesta professora {'POSITIVE': 0.9999407529830933, 'NEGATIVE': 5.9213289205217734e-05}
No ha impartit classe a aquest grup {'POSITIVE': 0.838776171207428, 'NEGATIVE': 0.16122381389141083}
No ha corregit examens fets a octubre, i estem al mes de gener. {'POSITIVE': 1.1811242075054906e-05, 'NEGATIVE': 0.9999881982803345}

TRAINING LANGUAGE: es
Model: es_core_news_sm
LOAD_DATA
Original data:  942
Train data True: 30 , Test data True:  8
Train data False: 723 , Test data False:  180
Train texts:  753 , Test texts:  188
Using 2000 examples (753 training, 188 evaluation)
Train_data: 
[('No sé quién es', {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ('No la he tenido', {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ('No lo he tenido ', {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ('No he tenido contacto', {'cats': {'POSITIVE': True, 'NEGATIVE': False}})