# Model incidència 'No ha impartit classe a aquest grup'

This notebook creates and evaluates a model to detect the issue 'No ha impartit classe a aquest grup' (the teacher did not teach this group).

A separate model is created for each language: Catalan, Spanish and English.

Only comments of type 'P : Professor' are processed.




In [3]:
import json
import os.path
import sys
import csv
import numpy as np
import pandas as pd
from colorama import Fore

In [4]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path

In [5]:
import spacy
from spacy.util import minibatch, compounding

# Candidate spaCy models for each language (kept for reference; the model is actually loaded inside train_model)
#nlp = spacy.load("ca_fasttext_wiki")
#nlp = spacy.load("es_core_news_sm")
#nlp = spacy.load("en_core_web_sm")


In [6]:
# Loads the preprocessed comments file and builds the train and test data

def load_data(limit = 0, split = 0.8, language = "ca"):
    """Build shuffled train/test splits for the target-issue classifier.

    Reads ``pathdest + "comentaris_<language>.csv"``, keeps the rows whose
    ``TipusPregunta`` is "P" and labels every comment as POSITIVE when its
    ``TipusIncidencia`` equals 'No ha impartit classe a aquest grup', and as
    NEGATIVE otherwise.

    Relies on the notebook-level globals ``pathdest`` (input folder) and
    ``debug`` (verbosity: >= 1 prints a summary, >= 2 also shows the
    intermediate data).

    Args:
        limit: Number of shuffled examples to keep (taken from the end of the
            shuffled list). 0 keeps every example.
        split: Fraction of the kept examples assigned to the training set.
        language: Language code used to build the input file name.

    Returns:
        ``((train_texts, train_cats), (test_texts, test_cats))`` where the
        texts are tuples of comments and the cats are tuples of
        ``{"POSITIVE": bool, "NEGATIVE": bool}`` dictionaries.

    Raises:
        ValueError: If the file contains no rows of type "P".
    """
    if (debug >= 1):
        print ("LOAD_DATA")

    # Load data from file
    file = "comentaris_" + language + ".csv"
    data = pd.read_csv(pathdest + file)
    if (debug >= 2):
        print ("Original data:")
        # Cap the sample size so small files do not make the preview crash
        display (data.sample(min(5, len(data))))

    # Keep only the professor comments. The explicit copy lets the new columns
    # below be added without pandas' SettingWithCopyWarning.
    data_prof = data.loc[data.TipusPregunta == "P", ["Comentari", "TipusIncidencia"]].copy()
    if data_prof.empty:
        # Without this guard the zip(*...) unpacking below fails with an
        # unhelpful "not enough values to unpack" ValueError.
        raise ValueError("No rows with TipusPregunta == 'P' found in " + pathdest + file)
    if (debug >= 2):
        print ("Filtered data:")
        display (data_prof.sample(min(5, len(data_prof))))

    # Build one (text, categories) tuple per row:
    #   label=True  -> {"POSITIVE": True,  "NEGATIVE": False}
    #   label=False -> {"POSITIVE": False, "NEGATIVE": True}
    data_prof["label"] = data_prof["TipusIncidencia"] == "No ha impartit classe a aquest grup"
    data_prof["tuples"] = data_prof.apply(
        lambda row: (
            row["Comentari"],
            {"POSITIVE": bool(row["label"]), "NEGATIVE": not bool(row["label"])},
        ),
        axis=1,
    )
    if (debug >= 2):
        print ("Tuples dataframe:")
        display (data_prof.sample(min(5, len(data_prof))))

    # Convert the dataframe column into a plain list
    train_data = data_prof["tuples"].tolist()
    if (debug >= 2):
        print ("Tuples list:")
        print (train_data[:5])

    # Shuffle and optionally keep only the last `limit` examples.
    # limit == 0 means "keep everything" (previously this only worked because
    # train_data[-0:] happens to be the whole list).
    random.shuffle(train_data)
    if limit:
        train_data = train_data[-limit:]
    if (debug >= 2):
        print ("Shuffled tuples:")
        print (train_data[:5])

    # Split text and label into two parallel tuples
    texts, cats = zip(*train_data)
    if (debug >= 1):
        print ("Texts:")
        print (texts[0:5])
        print ("Cats:")
        print (cats[0:5])

    # Number of examples that go to the training set; the rest is test data
    n_train = int(len(train_data) * split)
    if (debug >= 1):
        print ("Train data:", n_train, "Test data: ", len(train_data) - n_train)
        print ("")

    # Return train data and test data
    return (texts[:n_train], cats[:n_train]), (texts[n_train:], cats[n_train:])

In [7]:
def evaluate(tokenizer, textcat, texts, cats):
    """Score the text classifier on a labelled set using the POSITIVE label.

    Each text is tokenized, piped through ``textcat`` and its POSITIVE score is
    compared with the gold value using a 0.5 threshold. The NEGATIVE label and
    labels missing from the gold annotation are ignored.

    Args:
        tokenizer: Callable turning a text into a document.
        textcat: Object whose ``pipe`` yields documents exposing ``cats``.
        texts: Sequence of texts to classify.
        cats: Sequence of gold dictionaries, aligned with ``texts``.

    Returns:
        Dictionary with precision ("textcat_p"), recall ("textcat_r") and
        F-score ("textcat_f").
    """
    # The tiny non-zero starting values keep the divisions below defined
    true_pos = 0.0
    false_pos = 1e-8
    false_neg = 1e-8
    true_neg = 0.0

    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[index]
        for label, score in doc.cats.items():
            if label == "NEGATIVE" or label not in expected:
                continue
            truth = expected[label]
            if score >= 0.5:
                if truth >= 0.5:
                    true_pos += 1.0
                elif truth < 0.5:
                    false_pos += 1.0
                    if (debug >= 2):
                        print ("fp: ", doc)
            elif score < 0.5:
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1
                    if (debug >= 2):
                        print ("fn: ", doc)

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    total = precision + recall
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


In [8]:
def train_model(model=None, language="ca", n_iter=20, n_texts=2000):
    """Train, evaluate, demo and save a binary spaCy text classifier.

    The classifier has the exclusive labels POSITIVE / NEGATIVE. Training data
    comes from ``load_data``; after every iteration the model is scored with
    ``evaluate`` on the held-out split. A few sample sentences are classified
    with the trained pipeline, the pipeline is written to
    ``pathmodel + language`` and finally reloaded to classify the same samples.

    Relies on the notebook-level globals ``debug`` (verbosity) and
    ``pathmodel`` (output folder prefix).

    Args:
        model: Name of the spaCy model to start from. When None, a blank
            pipeline for ``language`` is created instead.
        language: Language code ("ca", "es" or "en"); selects the data file,
            the sample sentences and the output folder name.
        n_iter: Number of training iterations over the training data.
        n_texts: Maximum number of training examples to use.
    """
    # Load the requested spaCy model, or start from a blank pipeline for the
    # language when no model name is given (spacy.load(None) would fail).
    if model is None:
        nlp = spacy.blank(language)
    else:
        nlp = spacy.load(model)

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load training and test data
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(0, 0.8, language)
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]

    if debug:
        print(
            "Using {} examples ({} training, {} evaluation)".format(
                n_texts, len(train_texts), len(dev_texts)
            )
        )

    # converts the data to the format:
    # (text, {'cats': {'POSITIVE': True, 'NEGATIVE': False}}))
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
    if debug:
        print ("Train_data: ")
        print (train_data[:5])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                if (debug >= 2):
                    print ("Batch: ")
                    print (batch)
                texts, annotations = zip(*batch)

                # drop=0.2 applies a 20% dropout rate during the update to
                # reduce overfitting
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    print ("")
    print ("Some examples: ")
    test_texts ={"ca": ["Costa que pengi els materials acordats al campus",
                    "No he tingut aquesta professora",
                    "No ha impartit classe a aquest grup",
                    "No ha corregit examens fets a octubre, i estem al mes de gener."],
            "es": ["No he tenido contacto",
                   "Muy  buenas clases",
                   "No nos dio clases",
                   # The comma after the next sentence was missing, which
                   # silently merged the last two samples into one string.
                   "No vimos a en clase a este docente",
                   "No se ha adaptado a la actual situación"
                  ],
            "en": ["I haven't had this professor",
                   "We don't know this person",
                   "He isn't a good professor",
                   "We had only a few lessons"
                  ]
        }
    for test_text in test_texts[language]:
        doc = nlp(test_text)
        print(test_text, doc.cats)
    print ("")

    # Save the model using the averaged parameters
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(pathmodel + language)
        print("Saved model to", pathmodel + language)

        # test the saved model
        print("Loading from", pathmodel+ language)
        nlp2 = spacy.load(pathmodel +  language)
        for test_text in test_texts[language]:
            doc2 = nlp2(test_text)
            print(test_text, doc2.cats)
    

In [9]:
# Notebook configuration: data locations, verbosity and training defaults.

# Folders (relative to the notebook). pathdest and pathmodel are concatenated
# with file names as plain strings, so they must keep their trailing slash.
pathori = "../data/original"
pathdest = "../data/preprocessed/"
pathmodel = "../data/models/"

# Verbosity: 0 = silent, 1 = summaries, 2 = intermediate data and batches
debug = 1

# Fix the random seeds so the shuffling done in load_data and train_model is
# repeatable between runs.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Default language/model pair and training size
language = "ca"
model = "ca_fasttext_wiki"
n_iter = 5
n_texts = 2000

In [12]:
# spaCy model used as the starting point for each language code
languages = {
    "ca": "ca_fasttext_wiki",
    "es": "es_core_news_sm",
    "en": "en_core_web_sm",
}

# Train one classifier per language, reusing the shared n_iter / n_texts settings
for language, model in languages.items():
    print()
    print("TRAINING LANGUAGE: " + language)
    train_model(model=model, language=language, n_iter=n_iter, n_texts=n_texts)



TRAINING LANGUAGE: ca
LOAD_DATA
Texts:
("S'ha adaptat molt bé a la situació generada pel COVID-19.", "Encara no ens ha corregit un treball que va ser enviat el passat mes d'abril.", "Ha sigut una assignatura que m'ha fet llàstima no poder-la dur a terme presencialment perquè la M.Paz tenia pensades moltes activitats.. però tot i això, ha dut a terme l'assignatura des d'una altra vessant hi ha estat molt bé!", "aspectes negatius:te favoritismes no cumpleix els plaços de tancament d'activitats establerts per ella mateixa permetent nomes a alguns alumnes, aquells que no havien entregat encara, entregar fora de plaç i utilitzant metodes no oficials, tambe en les seves classes es posa a parlar de coses que no tenen res a veure amb l'assignatura i que no interessen a tota la classe cosa que fa perdre temps que es podria dedicar a aprofondir en la materia.\r\naspectes positius: corregeix rapid", "El professor és molt competent i es preocupa per l'alumnat. ")
Cats:
({'POSITIVE': False, 'NEGAT

0.505	1.000	1.000	1.000
0.174	0.800	1.000	0.889
0.095	1.000	0.750	0.857
0.105	1.000	1.000	1.000
0.072	1.000	0.750	0.857

Some examples: 
I haven't had this professor {'POSITIVE': 3.91346839023754e-05, 'NEGATIVE': 0.9999608993530273}
We don't know this person {'POSITIVE': 5.026111102779396e-05, 'NEGATIVE': 0.9999496936798096}
He isn't a good professor {'POSITIVE': 6.938586011528969e-05, 'NEGATIVE': 0.9999306201934814}
We had only a few lessons {'POSITIVE': 9.361172851640731e-05, 'NEGATIVE': 0.9999064207077026}

Saved model to ../data/models/en
Loading from ../data/models/en
I haven't had this professor {'POSITIVE': 3.3694846933940426e-05, 'NEGATIVE': 0.9999662637710571}
We don't know this person {'POSITIVE': 5.306455568643287e-05, 'NEGATIVE': 0.9999469518661499}
He isn't a good professor {'POSITIVE': 6.059076258679852e-05, 'NEGATIVE': 0.9999394416809082}
We had only a few lessons {'POSITIVE': 7.958371134009212e-05, 'NEGATIVE': 0.9999203681945801}
