# Model incidència 'No ha impartit classe a aquest grup'

This notebook creates and evaluates a model to detect the issue 'No ha impartit classe a aquest grup'.

A separate model is created for each language: Catalan, Spanish and English.

Only comments of type 'P : Professor' are processed.




In [2]:
import json
import os.path
import sys
import csv
import numpy as np
import pandas as pd
from colorama import Fore

In [3]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path

In [4]:
import spacy
from spacy.util import minibatch, compounding

# spaCy models that can be used for each language (Catalan, Spanish, English); the selected one is loaded inside train_model
#nlp = spacy.load("ca_fasttext_wiki")
#nlp = spacy.load("es_core_news_sm")
#nlp = spacy.load("en_core_web_sm")


In [41]:
# Loads information from the preprocessed file to create the train and test data

def load_data(limit = 0, split = 0.8, language = "ca"):
    """Load the preprocessed comments of one language and split them into train/test data.

    Reads ``comentaris_<language>.csv`` from the module-level ``pathdest`` folder,
    keeps the rows whose ``TipusPregunta`` is ``"P"`` and labels every comment as
    POSITIVE when its ``TipusIncidencia`` is ``"No ha impartit classe a aquest grup"``
    (NEGATIVE otherwise). Rows without comment text are discarded.

    The amount of output depends on the module-level ``debug`` level
    (>= 1 prints a summary, >= 2 also shows intermediate samples).

    Args:
        limit: When non-zero, only the last ``limit`` examples of the shuffled
            list are kept; 0 keeps every example.
        split: Fraction of the kept examples that goes to the training set.
        language: Language code used to build the CSV file name.

    Returns:
        ``((train_texts, train_cats), (test_texts, test_cats))``. The texts are
        tuples of comments and the cats are tuples of
        ``{"POSITIVE": bool, "NEGATIVE": bool}`` dictionaries. The four tuples
        are empty when no usable comment is found.
    """

    if (debug >= 1):
        print ("LOAD_DATA")

    def _preview(frame):
        # DataFrame.sample raises when asked for more rows than the frame has
        return frame.sample(min(5, len(frame)))

    # Load data from file
    file = "comentaris_" + language + ".csv"
    data = pd.read_csv(os.path.join(pathdest, file))
    if (debug >= 2):
        print ("Original data:")
        display (_preview(data))

    # Filter the rows to be used. The explicit copy makes data_prof independent
    # of `data`, so adding columns below does not raise SettingWithCopyWarning.
    data_prof = data[data.TipusPregunta == "P"][["Comentari","TipusIncidencia"]].copy()

    # An empty comment is read back as NaN (a float) and cannot be tokenized
    missing = int(data_prof["Comentari"].isna().sum())
    if missing:
        data_prof = data_prof.dropna(subset=["Comentari"])
        if (debug >= 1):
            print ("Discarded comments without text:", missing)
    if (debug >= 2):
        print ("Filtered data:")
        display (_preview(data_prof))

    # Calculates the label and the tuples of each row
    # Converts: label=True -> {"POSITIVE": True, "NEGATIVE": False}
    # label=False -> {"POSITIVE": False, "NEGATIVE": True}
    data_prof["label"] = data_prof["TipusIncidencia"] == "No ha impartit classe a aquest grup"
    tuples = [
        (text, {"POSITIVE": bool(flag), "NEGATIVE": not bool(flag)})
        for text, flag in zip(data_prof["Comentari"], data_prof["label"])
    ]
    # Kept as a column only so the debug output can show it next to the source data
    data_prof["tuples"] = pd.Series(tuples, index=data_prof.index, dtype=object)
    if (debug >= 2):
        print ("Tuples dataframe:")
        display (_preview(data_prof))

    # Converts the tuples into an independent list
    train_data = list(tuples)
    if (debug >= 2):
        print ("Tuples list:")
        print (train_data[:5])

    # Takes a random set of tuples (limit == 0 keeps all of them)
    random.shuffle(train_data)
    if limit:
        train_data = train_data[-limit:]
    if (debug >= 2):
        print ("Shuffled tuples:")
        print (train_data[:5])

    # Nothing to split: zip(*[]) cannot be unpacked into texts and cats
    if not train_data:
        if (debug >= 1):
            print ("Train data:", 0, "Test data: ", 0)
            print ("")
        return ((), ()), ((), ())

    # Split text and label into two tuples
    texts, cats = zip(*train_data)
    if (debug >= 1):
        print ("Texts:")
        print (texts[0:5])
        print ("Cats:")
        print (cats[0:5])

    # Size of train_data and test_data (the `split` argument is left untouched)
    n_train = int(len(train_data) * split)
    if (debug >= 1):
        print ("Train data:", n_train, "Test data: ", len(train_data)-n_train)
        print ("")

    # Return train data and test data
    return (texts[:n_train], cats[:n_train]), (texts[n_train:], cats[n_train:])

In [42]:
def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score of the POSITIVE label.

    Every text is tokenized and sent through ``textcat.pipe``; the score of
    each label is compared with the gold annotation found at the same position
    in ``cats``, using 0.5 as threshold on both sides. The NEGATIVE label and
    labels that are missing from the gold annotation are skipped.

    When the module-level ``debug`` level is >= 2, the documents counted as
    false positives or false negatives are printed.

    Returns:
        A dict with the keys ``textcat_p``, ``textcat_r`` and ``textcat_f``.
    """
    # False positives/negatives start at a tiny epsilon so that the divisions
    # below can never be by zero.
    true_pos, false_pos, false_neg, true_neg = 0.0, 1e-8, 1e-8, 0.0

    tokenized = (tokenizer(text) for text in texts)
    for position, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[position]
        for label, score in doc.cats.items():
            if label == "NEGATIVE" or label not in expected:
                continue
            truth = expected[label]
            if score >= 0.5:
                # predicted positive
                if truth >= 0.5:
                    true_pos += 1.0
                elif truth < 0.5:
                    false_pos += 1.0
                    if (debug >= 2):
                        print ("fp: ", doc)
            elif score < 0.5:
                # predicted negative
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1
                    if (debug >= 2):
                        print ("fn: ", doc)

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    total = precision + recall
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


In [43]:
def train_model(model=None, language="ca", n_iter=20, n_texts=2000):
    """Train, evaluate and save a spaCy text classifier for the issue
    'No ha impartit classe a aquest grup'.

    The data comes from ``load_data(0, 0.8, language)``. Only the ``textcat``
    component is trained; after every iteration the loss and the precision,
    recall and F-score on the evaluation data are printed. Finally some example
    sentences are classified, the model is saved to ``pathmodel + language``
    and it is loaded again to classify the same sentences.

    Uses the module-level ``debug`` level and ``pathmodel`` folder.

    Args:
        model: Name or path of the spaCy model to start from. When None, a
            blank pipeline for ``language`` is used.
        language: Language code; selects the data file and the name of the
            saved model.
        n_iter: Number of training iterations.
        n_texts: Maximum number of training examples.
    """

    # Load the model from spaCy. spacy.load(None) fails, so without a model
    # name the training starts from a blank pipeline of the language.
    if model is not None:
        nlp = spacy.load(model)
    else:
        nlp = spacy.blank(language)

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load training and test data
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(0, 0.8, language)
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]

    if debug:
        print(
            "Using {} examples ({} training, {} evaluation)".format(
                n_texts, len(train_texts), len(dev_texts)
            )
        )

    # converts the data to the format:
    # (text, {'cats': {'POSITIVE': True, 'NEGATIVE': False}}))
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
    if debug:
        print ("Train_data: ")
        print (train_data[:5])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                if (debug >= 2):
                    print ("Batch: ")
                    print (batch)
                texts, annotations = zip(*batch)

                # drop=0.2 is the dropout rate applied during the update,
                # a regularization that makes memorizing the data harder
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    print ("")
    print ("Some examples: ")
    test_texts = ["Costa que pengi els materials acordats al campus",
        "No he tingut aquesta professora",
        "No ha impartit classe a aquest grup",
        "No ha corregit examens fets a octubre, i estem al mes de gener."]
    for test_text in test_texts:
        doc = nlp(test_text)
        print(test_text, doc.cats)
    print ("")

    # Save the model using the averaged parameters
    model_path = pathmodel + language
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(model_path)
        print("Saved model to", model_path)

        # test the saved model
        print("Loading from", model_path)
        nlp2 = spacy.load(model_path)
        for test_text in test_texts:
            doc2 = nlp2(test_text)
            print(test_text, doc2.cats)
    

In [44]:
# Debug level read by the functions above:
# >= 1 prints progress information, >= 2 also prints intermediate data
debug = 1

# Data folders
pathmodel = "../data/processed/"    # train_model saves the trained model here
pathdest = "../data/preprocessed/"  # load_data reads comentaris_<language>.csv from here
pathori = "../data/original"        # not used by the cells shown in this notebook section

# Training setup passed to train_model
model = "ca_fasttext_wiki"
language = "ca"
n_texts = 2000
n_iter = 5

In [46]:
# Train, evaluate and save the model with the configuration defined above
train_model(model=model, language=language, n_iter=n_iter, n_texts=n_texts)


LOAD_DATA
Texts:
("Les correccions dels treballs no són constructivistes, al contrari, les rebem com una crítica sense opció a millora quan no s'ha donat una correcta comunicació entre docent i alumne/a.\r\nPel que fa a l'examen, no trobo coherent la manera de preguntar de l'examen amb els continguts dels Power Points que sovint es queden incomplets pel que ella demana.", 'Explica la matèria una mica desorganitzada, però és bona professora. ', 'Posa moltes facilitats per poder compaginar la seva assignatura amb la feina. És molt propera i molt pacient.', "No hem pogut fer gaires classes per a fer un comentari realista, però amb comparació amb l'altre professor, aquest explica molt millor i fa molts exemples per a que ho entenguem. També ens respon tots els dubtes.", "No sap explicar l'assignatura i no sap resoldre els exercicis que ha elaborat ell mateix.\r\nDiu merèixer respecte per la seva trajectòria professional.\r\nQuan se li comenta alguna opció relativa als exàmens per mirar d'a