In [355]:
import pandas as pd
from spacy.lang.fr import French
import spacy
from line_profiler import LineProfiler
from time import time
from spacy.matcher import Matcher
from spacy.cli.download import download
download('fr_core_news_md')
import cProfile
import io
import pstats
import re

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_md')


In [442]:
# Load the long-format sentence table produced by the previous processing step.
# NOTE: despite its name, `output_filename` is the *input* path in this cell.
output_filename = "../../data/processed/DPEFs/dpef_paragraphs_sentences_long_format.csv"

# Discard any stale 'tokens' column from an earlier run; errors="ignore" makes
# the drop a no-op when the column is absent.
df = pd.read_csv(output_filename, sep=";").drop(columns="tokens", errors="ignore")

In [437]:
def profile(fnc):
    """
    Decorator that profiles a single call of ``fnc`` with cProfile.

    After the wrapped function returns, the collected statistics (sorted by
    cumulative time) are printed to stdout.
    Args:
        fnc: callable to profile
    Returns:
        wrapper that forwards all arguments to ``fnc`` and returns its result
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import functools

    @functools.wraps(fnc)  # keep the wrapped function's __name__ / __doc__
    def inner(*args, **kwargs):
        pr = cProfile.Profile()
        pr.enable()
        try:
            rtval = fnc(*args, **kwargs)
        finally:
            # Always stop profiling, even if fnc raises, so the profiler is
            # not left enabled for the rest of the session.
            pr.disable()
        s = io.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())
        return rtval
    return inner
        
#@profile  
        
"""
Grammar module
"""


class Grammar(object):
    """
    Linguistic processing rules applied to sentences.

    Wraps the spaCy ``fr_core_news_md`` pipeline and exposes helpers to parse
    text, label non-informative sentences and grade commitment statements.
    """

    # isDate treats a four-digit number strictly greater than this as a year.
    MIN_YEAR = 2013

    def __init__(self):
        # spaCy NLP
        self.nlp = spacy.load("fr_core_news_md")

    def create_match(self, matcher_name):
        """
        Creates an empty spaCy Matcher and stores it as an attribute of this instance.
        Args:
            matcher_name: attribute name under which the matcher is stored
        """
        matcher = Matcher(self.nlp.vocab)
        setattr(self, matcher_name, matcher)

    def add_match(self, matcher_name, pattern_name, pattern):
        """
        Registers a token pattern on a matcher previously built with create_match.
        Args:
            matcher_name: attribute name of the matcher
            pattern_name: key under which the pattern is registered
            pattern: list of token-attribute dicts describing the pattern
        """
        matcher = getattr(self, matcher_name)
        # Matcher.add changed signature in spaCy v3: (key, patterns) instead of
        # (key, on_match, *patterns). Support both.
        if int(spacy.__version__.split(".")[0]) >= 3:
            matcher.add(pattern_name, [pattern])
        else:
            matcher.add(pattern_name, None, pattern)

    def parse(self, text):
        """
        Parses text to NLP tokens.
        Args:
            text: input text
        Returns:
            tokens, or None when text is empty or not a string
        """
        tokens = None
        # Missing CSV cells are read by pandas as float NaN, which is truthy;
        # only hand real, non-empty strings to the pipeline.
        if isinstance(text, str) and text:
            # Run text through linguistic rules
            tokens = self.nlp(text)
        return tokens

    def label(self, tokens):
        """
        Linguistic rules processing logic. Identifies non-informative sentences and labels them accordingly.
        Labels:
            - QUESTION: Any text ending in a question mark
            - FRAGMENT: Poorly structured sentences with limited information
        Args:
            tokens: parsed tokens
        Returns:
            label if detected else None
        """
        label = None

        if tokens:
            # Label non-informative sentences
            if self.isQuestion(tokens.text):
                label = "QUESTION"
            elif self.isFragment(tokens):
                label = "FRAGMENT"
        return label

    def isQuestion(self, text):
        """
        Determines if the text is a question
        Args:
            text: input text
        Returns:
            true if text is a question, false otherwise
        """
        # Questions have a ? mark at end
        return text.strip().endswith("?")

    def isFragment(self, tokens):
        """
        Run text against linguistic rules to determine if sentence is a fragment. Fragments are non descriptive.
        Args:
            tokens: nlp document tokens
        Returns:
            true if text is a sentence fragment, false otherwise
        """
        subject_deps = ("nsubj", "nsubjpass")

        # Nominal subject Nouns/Proper Nouns
        nouns = any(t.pos_ in ("NOUN", "PROPN") and t.dep_ in subject_deps for t in tokens)

        # Actions (adverb, auxiliary, verb)
        action = any(t.pos_ in ("ADV", "AUX", "VERB") for t in tokens)

        # Consider root words with a nominal subject as an action word
        action = action or any(
            t.dep_ in ("appos", "ROOT") and any(x.dep_ in subject_deps for x in t.children)
            for t in tokens
        )

        # Non-punctuation tokens and multi-character words (don't count single letters which are often variables used in equations)
        words = [t.text for t in tokens if t.pos_ not in ("PUNCT", "SPACE", "SYM") and len(t.text) > 1]

        # Valid sentences take the following form:
        #  - At least one nominal subject noun/proper noun AND
        #  - At least one action/verb AND
        #  - At least 5 words
        valid = nouns and action and len(words) >= 5

        return not valid

    def isPercent(self, tokens):
        """
        Determines if the tokens contain a percentage, i.e. a number-like token
        immediately followed by a '%' token.
        Args:
            tokens: nlp document tokens
        Returns:
            true if a number followed by '%' is found anywhere, false otherwise
        """
        # We should be able to work this out with entities (any([t.label_ == "PERCENT" for t in doc.ents])
        sequence = list(tokens)

        # Scan every adjacent pair; only give up once the whole sentence was checked
        for current, following in zip(sequence, sequence[1:]):
            if current.like_num and following.text == "%":
                return True
        return False

    def isDate(self, tokens):
        """
        Determines if the tokens mention a year, i.e. a four-digit number
        strictly greater than MIN_YEAR.
        Args:
            tokens: nlp document tokens
        Returns:
            true if such a year is found anywhere, false otherwise
        """
        # We should be able to work this out with entities (any([t.label_ == "DATE" for t in doc.ents])
        for token in tokens:
            # Keep scanning past numbers that are not years (e.g. "30" in "30 % en 2030")
            if (token.like_num and token.is_digit
                    and len(token.text) == 4 and int(token.text) > self.MIN_YEAR):
                return True
        return False

    def _ensure_commit_matchers(self):
        """Builds the two commitment matchers once and caches them on the instance."""
        if not hasattr(self, 'comm_level_LOW'):
            # Match for mentioning a commitment
            self.create_match('comm_level_LOW')
            pattern1 = [{"LEMMA": {"IN": ["engager", "viser", 'objectif', 'atteindre', 'prévoir']}}]
            self.add_match('comm_level_LOW', 'pat_LOW', pattern1)

        if not hasattr(self, 'comm_level_MED'):
            # Match for pattern [subj+commit_verb] such as 'Le Groupe s’engage...'
            self.create_match('comm_level_MED')
            pattern2 = [{"POS": {"IN": ["PROPN", "NOUN"]}}, {"POS": "PROPN", "OP": "?"}, {"POS": "AUX", "OP": "?"},
                        {"LEMMA": {"IN": ["engager", "viser"]}}]
            self.add_match('comm_level_MED', 'pat_MED', pattern2)

    def commitLevel(self, tokens):
        """
        Grades how strongly a sentence expresses a commitment.
        Args:
            tokens: nlp document tokens
        Returns:
            - '' if tokens are empty/None or no commitment lemma is matched
            - '1' + str(matches) if only the commitment-lemma pattern matches
            - '2' + str(matches) if the [subject + commitment verb] pattern also matches
            - '3' if, in addition, the sentence contains a percentage or a year
        """
        if not tokens:
            return ''

        # Matchers are built on first use only, not on every sentence
        self._ensure_commit_matchers()

        low_matches = self.comm_level_LOW(tokens)
        if not low_matches:
            return ''

        med_matches = self.comm_level_MED(tokens)
        if not med_matches:
            return '1' + str(low_matches)

        # Searching for a date or a percentage
        if self.isPercent(tokens) or self.isDate(tokens):
            return '3'
        return '2' + str(med_matches)
    

In [443]:
#This cell can take a few minutes to run due to multiple function calls
# This cell can take a few minutes to run: every sentence goes through the spaCy pipeline.
grammar = Grammar()
df['tokens'] = df['sentence'].apply(grammar.parse)
df['label'] = df['tokens'].apply(grammar.label)

# Drop non-informative sentences. `('QUESTION' and 'FRAGMENT')` evaluates to
# just 'FRAGMENT', so comparing against it kept every QUESTION row; isin()
# tests both labels. Rows whose label is None are kept.
# .copy() makes the result an independent frame so the column assignment
# below does not raise SettingWithCopyWarning.
df = df[~df['label'].isin(['QUESTION', 'FRAGMENT'])].copy()
df['commit'] = df['tokens'].apply(grammar.commitLevel)

In [466]:
# Persist the labelled sentences together with their commitment level
# (semicolon-separated, header row and index included).
output_filename = "../../data/processed/DPEFs/dpef_paragraphs_sentences_commit_level.csv"
df.to_csv(
    output_filename,
    sep=";",
    header=True,
)
