# Model incidència 'No ha impartit classe a aquest grup'

This notebook creates a evaluates a model to detect the issue 'No ha impartit classe a aquest grup'.

It will be created a model to each language: catalan, spanish and english.

Only are treated comments of type 'P : Professor'.




In [1]:
import json
import os.path
import sys
import csv
import numpy as np
import pandas as pd
from colorama import Fore

In [2]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path

In [3]:
import spacy
from spacy.util import minibatch, compounding

# Load English tokenizer, tagger, parser, NER and word vectors
#nlp = spacy.load("ca_fasttext_wiki")
#nlp = spacy.load("es_core_news_sm")
#nlp = spacy.load("en_core_web_sm")


In [4]:
# Loads information from preprocessed file to create de train and test data

def load_data(limit = 0, split = 0.8, language = "ca"):

    if (debug >= 1):
        print ("LOAD_DATA")
        
    # Load data from file
    file = "comentaris_" + language + ".csv"
    data = pd.read_csv(pathdest + file)
    if (debug >= 2):
        print ("Original data:")
        display (data.sample(5))

    # Calculates label and filter rows to be used    
    data["label"] = data["TipusIncidencia"] == "No ha impartit classe a aquest grup"

    data_prof = data[data.TipusPregunta == "P"][["Comentari","label"]]
    if (debug >= 2):
        print ("Filtered data:")
        display (data_prof.sample(5))

    if (debug >= 2):
        print ("Comment of this issue type:")
        display(data_prof[data_prof["label"]].sample(5))

    # Converts dataframe into list
    data_prof["tuples"] = data_prof.apply(lambda row: (row["Comentari"], row["label"]), axis=1)
    train_data = data_prof["tuples"].tolist()   
    if (debug >= 2):
        print ("Tuples:")
        print (train_data[:5])    

    # Takes an aleatori set of tuples
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    if (debug >= 2):
        print ("Shuffled tuples:")
        print (train_data[:5])    

    # Split text and label into two lists
    texts, labels = zip(*train_data)
    if (debug >= 1):
        print ("Texts:")
        print (texts[0:5])
        print ("Labels:")
        print (labels[0:5])


    # Converts: label=True -> {"POSITIVE": True, "NEGATIVE": False}
    # label=False -> {"POSITIVE": False, "NEGATIVE": True}
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    if (debug >= 1):
        print ("Cats:")
        print (cats[0:5])

    # Size of train_data and test_data
    split = int(len(train_data) * split)
    if (debug >= 1):
        print ("Train data:", split, "Test data: ", len(train_data)-split)   
        print ("")
    
    # Return train data and test data
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [32]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
                if (debug >= 2):
                    print ("fp: ", doc)
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
                if (debug >= 2):
                    print ("fn: ", doc)                
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


In [20]:
def main (model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    
    # Load the model form spacy
    nlp = spacy.load(model)

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load training and test data
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(0, 0.8, "ca")
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]

    if debug:
        print(
            "Using {} examples ({} training, {} evaluation)".format(
                n_texts, len(train_texts), len(dev_texts)
            )
        )

    # converts the data to the format:
    # (text, {'cats': {'POSITIVE': True, 'NEGATIVE': False}}))
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
    if debug:
        print ("Train_data: ")
        print (train_data[:5])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                if (debug >= 2):
                    print ("Batch: ")
                    print (batch)
                texts, annotations = zip(*batch)

                # Eliminación del 20% de los casos para evitar generalizaciones
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )  
           
    # test the trained model
    print ("")
    print ("Some examples: ")
    test_texts = ["Costa que pengi els materials acordats al campus",
        "No he tingut aquesta professora",
        "No ha impartit classe a aquest grup",
        "No ha corregit examens fets a octubre, i estem al mes de gener."]
    for test_text in test_texts:
        doc = nlp(test_text)
        print(test_text, doc.cats)
    print ("")
    
    # Save de model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for test_text in test_texts:
            doc2 = nlp2(test_text)
            print(test_text, doc2.cats)
    

In [34]:
pathori = "../data/original"
pathdest = "../data/preprocessed/"
pathmodel = "../data/processed/"
debug = 1

model = "ca_fasttext_wiki"
output_dir = "../data/processed/"
n_iter = 5
n_texts = 2000
init_tok2vec=None

In [35]:
main (model, output_dir, n_iter, n_texts, init_tok2vec)


LOAD_DATA
Texts:
("Molt bon professor, ho explica tot d'una manera molt clara, entenedora i professional", "És un bon professor, tot i que moltes vegades no ens acaba d'aclarar els dubtes que tenim ni penjar el material que necessitem.", "Crec que no ha set conscient que la seva assignatura no ere l'única de la carrera i no ens ha escolatat ni tingut en compte la nostra opinió en cap moment. ", "Alguns treballs s'haurien de replantejar.\r\nL'assignatura la vam cursar durant horari d'avaluacions, és a dir, fora del plaç i abans de l'incici de la resta d'assignatures.", 'Molt bona professora, i molt pràctica amb la seva experiència a les classes.')
Labels:
(False, False, False, False, False)
Cats:
[{'POSITIVE': False, 'NEGATIVE': True}, {'POSITIVE': False, 'NEGATIVE': True}, {'POSITIVE': False, 'NEGATIVE': True}, {'POSITIVE': False, 'NEGATIVE': True}, {'POSITIVE': False, 'NEGATIVE': True}]
Train data: 2690 Test data:  673

Using 2000 examples (2000 training, 673 evaluation)
Train_data: 
