In [62]:
#Descomentar si se necesita instalar alguna de las librerias
import glob
import os
import pickle
import sys
#!{sys.executable} -m pip install numpy
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install scikit-learn

In [63]:
import numpy as np
import pandas as pd

In [108]:
import spacy

In [64]:
import requests

# Host and port of the Rasa server queried by this notebook.
RASA_HOST = '127.0.0.1'
RASA_PORT = 5005

# URL of that server's /model/parse route.
rasa_endpoint = f"http://{RASA_HOST}:{RASA_PORT}/model/parse"

In [176]:
def extract_entities(claim, timeout=10):
    """Return the distinct entity labels the Rasa server reports for ``claim``.

    The text is POSTed as JSON to ``rasa_endpoint`` and the ``entity`` field of
    every item in the response's ``entities`` list is collected.

    Args:
        claim: Text to analyse.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled server would block the
            caller indefinitely.

    Returns:
        A list of entity labels in order of first appearance, without
        duplicates, or ``None`` when the response has no ``"entities"`` key.
    """
    res = requests.post(rasa_endpoint, json={"text": claim}, timeout=timeout)
    data = res.json()
    if "entities" not in data:
        return None
    # dict.fromkeys keeps first-seen order while dropping repeats in one pass.
    return list(dict.fromkeys(ent['entity'] for ent in data['entities']))

In [177]:
extract_entities('Alberto Fernández es el Presidente de Argentina')

['PER', 'GPE', 'MISC']

In [6]:
from constants import POS_TAGGED_FOLDER, SPACY_FOLDER
sys.path.append(SPACY_FOLDER)  
from feature_extractors import automatic_feature_extractor

In [7]:
def get_tagged_sentences(folder):
    """Load and concatenate the tagged sentences stored in a folder.

    Every ``*.pickle`` file directly inside ``folder`` is unpickled (with
    ``encoding="bytes"``) and its items are appended to one list.

    Args:
        folder: Directory holding the ``.pickle`` files, with or without a
            trailing path separator.

    Returns:
        A single list with the items of every file. Files are read in sorted
        filename order so the result is the same on every run.
    """
    parsed_sentences = []
    # os.path.join copes with a missing trailing separator; sorting is needed
    # because glob returns matches in arbitrary order.
    for filename in sorted(glob.glob(os.path.join(folder, '*.pickle'))):
        with open(filename, 'rb') as tagged_file:
            # NOTE: unpickling can execute arbitrary code; only point this at
            # trusted files.
            parsed_sentences.extend(pickle.load(tagged_file, encoding="bytes"))
    return parsed_sentences

In [8]:
# Load every tagged sentence found under POS_TAGGED_FOLDER into one list.
tagged_sentences = get_tagged_sentences(POS_TAGGED_FOLDER)

In [9]:
tagged_sentences[0][b'classification'].decode('utf8')

'non-fact-checkable'

In [11]:
tagged_sentences[30][b'sentence'].decode('utf8')

'Durante al año 2014 numerosos dirigentes internacionales presidentes organismos multilaterales de crédito y organismos multilaterales en general auguraban que el 2014 iba a ser un año donde culminara terminara la crisis iniciada por la caída de Lehman Brothers en el 2008'

In [14]:
print(tagged_sentences[0][b'pos_tag'][3][b'lemma'])

presidenta


In [15]:
tagged_sentences[0][b'pos_tag'][3].keys()

dict_keys([b'dep', b'text', b'pos', b'lemma', b'tag', b'like_num', b'is_punct', b'tense'])

In [19]:
from spacy.lang.es.stop_words import STOP_WORDS
from spacy.lang.es import Spanish

# Alias for the Spanish stop-word collection imported from spaCy; used when filtering tokens.
stop_words = STOP_WORDS

In [110]:
# Load the 'es_core_news_lg' spaCy pipeline once; the tokenizing functions call it as `parser`.
parser = spacy.load('es_core_news_lg')

In [238]:
def pos_tag(sentence, pos_tag=False):
    """Turn a sentence into a flat list of string features.

    For every token produced by ``parser`` the list receives, in order:
    ``token.pos_`` (only when ``pos_tag`` is true), the lower-cased lemma, the
    value of the ``Tense`` feature when ``token.tag_`` carries one, and then
    the entity labels that ``extract_entities`` reports for the whole sentence.

    Args:
        sentence: Text to analyse.
        pos_tag: When true, include each token's part-of-speech tag.

    Returns:
        A list of strings; empty when the sentence yields no tokens.
    """
    mytokens = parser(sentence)
    if not len(mytokens):
        return []

    # The entity labels depend only on the sentence, so ask the service once
    # instead of once per token; a None answer is treated as "no entities".
    named_entities = extract_entities(sentence) or []

    tags = []
    for token in mytokens:
        if pos_tag and token.pos_:
            tags.append(token.pos_)

        lemma = token.lemma_.lower()
        if lemma:
            tags.append(lemma)

        if "Tense" in token.tag_:
            # Isolate "Tense=<value>" by cutting at the next '|' (if any).
            # Slicing up to find('|') would drop the value's last character
            # when Tense is the final feature, because find returns -1.
            feature = token.tag_[token.tag_.find("Tense"):].split('|', 1)[0]
            tense = feature.partition('=')[2]
            if tense:
                tags.append(tense)

        # The sentence-level entity labels are repeated after every token,
        # which keeps the feature sequence this function has always produced.
        tags.extend(named_entities)

    return tags

In [91]:
import string
punctuations = string.punctuation

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize a sentence into lower-cased lemmas, dropping stop words and punctuation.

    Args:
        sentence: Text to tokenize with ``parser``.

    Returns:
        A list of strings, one per kept token, in sentence order.
    """
    lemmas = []
    for token in parser(sentence):
        # Tokens whose lemma is the "-PRON-" marker keep their own lower-cased
        # text; every other token is reduced to its lower-cased, stripped lemma.
        if token.lemma_ != "-PRON-":
            lemma = token.lemma_.lower().strip()
        else:
            lemma = token.lower_

        # `punctuations` is the ASCII-only string.punctuation and `in` on a str
        # is a substring test, so on its own it misses marks such as '¿' / '¡'
        # and multi-character tokens like '...'. token.is_punct covers those;
        # the substring test is kept so everything dropped before still is.
        if token.is_punct or lemma in stop_words or lemma in punctuations:
            continue
        lemmas.append(lemma)

    return lemmas

In [121]:
spacy_tokenizer('Macri incrementó la pobreza')

['macri', 'incrementar', 'pobreza']

In [179]:
pos_tag('Macri incrementó la pobreza')

['PROPN',
 'macri',
 'PER',
 'TOPIC',
 'VERB',
 'incrementar',
 'Past',
 'PER',
 'TOPIC',
 'DET',
 'lo',
 'PER',
 'TOPIC',
 'NOUN',
 'pobreza',
 'PER',
 'TOPIC']

In [180]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [181]:
# Count vectorizer over 1- to 3-grams of the features returned by pos_tag.
bow_vector = CountVectorizer(tokenizer = pos_tag, ngram_range=(1,3))

In [182]:
# TF-IDF vectorizer that tokenizes each document with pos_tag.
tfidf_vector = TfidfVectorizer(tokenizer = pos_tag)

In [183]:
# Stand-alone TF-IDF transformer with default settings.
tfidf_transformer = TfidfTransformer()

In [184]:
# One record per tagged sentence: the lower-cased sentence text and its class
# label, both decoded from UTF-8 bytes.
data = [
    {
        'sentence': entry[b'sentence'].decode('utf8').lower(),
        'target': entry[b'classification'].decode('utf8'),
    }
    for entry in tagged_sentences
]

In [185]:
# Tabular view of the records: one row per sentence, with 'sentence' and 'target' columns.
df = pd.DataFrame(data)

In [186]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified split (20% test) on the 'target' column, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 yields exactly one (train, test) pair, so take it directly.
train_index, test_index = next(split.split(df, df['target']))
# split() returns positional indices, so rows are selected by position (iloc);
# label-based loc only agrees with that while df keeps its default RangeIndex.
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [187]:
strat_train_set.target.value_counts()

non-fact-checkable    1914
fact-checkable        1175
Name: target, dtype: int64

In [188]:
strat_test_set.target.value_counts()

non-fact-checkable    479
fact-checkable        294
Name: target, dtype: int64

In [189]:
# Features are the sentence strings; labels are the 'target' column.
# Selecting the 'sentence' column (rather than dropping 'target') yields a
# Series of documents. Iterating a one-column DataFrame yields its column
# labels, not its rows, so a text vectorizer would be fed the single string
# 'sentence' instead of the sentences themselves.
X_train = strat_train_set['sentence']
X_test = strat_test_set['sentence']
y_train = strat_train_set['target']
y_test = strat_test_set['target']

In [209]:
from sklearn.model_selection import train_test_split
# Plain lists of sentence texts and labels, in the order of `data`.
X = [record['sentence'] for record in data]
y = [record['target'] for record in data]

# Seeded split with 33% of the samples held out for testing; this rebinds
# X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [204]:
# Count vectorizer over 1- to 3-grams of pos_tag features (same configuration as bow_vector).
count_vect = CountVectorizer(tokenizer = pos_tag, ngram_range=(1,3))

In [234]:
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [244]:
from sklearn.linear_model import LogisticRegression


# Logistic regression on TF-IDF weighted word counts (default vectorizer settings).
classifier = LogisticRegression()

pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', classifier),
])

# Fit on the training split, then label the held-out sentences.
predicted = pipe.fit(X_train, y_train).predict(X_test)

# Test-split scores; precision and recall use 'fact-checkable' as the positive class.
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logistic Precision:", metrics.precision_score(y_test, predicted, pos_label='fact-checkable'))
print("Logistic Recall:", metrics.recall_score(y_test, predicted, pos_label='fact-checkable'))

Logistic Regression Accuracy: 0.8305882352941176
Logistic Precision: 0.8307692307692308
Logistic Recall: 0.6835443037974683


In [243]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on TF-IDF weighted word counts (default vectorizer settings).
classifier = MultinomialNB()

pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', classifier),
])

# Fit on the training split, then label the held-out sentences.
predicted = pipe.fit(X_train, y_train).predict(X_test)

# Test-split scores; precision and recall use 'fact-checkable' as the positive class.
print("MNB Accuracy:", metrics.accuracy_score(y_test, predicted))
print("MNB Precision:", metrics.precision_score(y_test, predicted, pos_label='fact-checkable'))
print("MNB Recall:", metrics.recall_score(y_test, predicted, pos_label='fact-checkable'))

MNB Accuracy: 0.8243137254901961
MNB Precision: 0.9111842105263158
MNB Recall: 0.5843881856540084


In [218]:
predicted[0]

'fact-checkable'

In [236]:
# Rebinds tfidf_vector to a fresh TF-IDF vectorizer that tokenizes with pos_tag.
tfidf_vector = TfidfVectorizer(tokenizer = pos_tag)

In [245]:
# Multinomial naive Bayes directly on raw word counts (no TF-IDF step).
classifier = MultinomialNB()

pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', classifier),
])

# Fit on the training split, then label the held-out sentences.
predicted = pipe.fit(X_train, y_train).predict(X_test)

# Test-split scores; precision and recall use 'fact-checkable' as the positive class.
print("MNB Accuracy:", metrics.accuracy_score(y_test, predicted))
print("MNB Precision:", metrics.precision_score(y_test, predicted, pos_label='fact-checkable'))
print("MNB Recall:", metrics.recall_score(y_test, predicted, pos_label='fact-checkable'))

MNB Accuracy: 0.8486274509803922
MNB Precision: 0.81859410430839
MNB Recall: 0.7616033755274262


In [233]:
from sklearn.linear_model import SGDClassifier

# SGD-trained linear classifier: hinge loss, L2 penalty, max_iter=5 with tol=None,
# seeded for repeatable results.
classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    tol=None,
    random_state=42,
)

pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', classifier),
])

# Fit on the training split, then label the held-out sentences.
predicted = pipe.fit(X_train, y_train).predict(X_test)

# Test-split scores; precision and recall use 'fact-checkable' as the positive class.
print("SGD Accuracy:", metrics.accuracy_score(y_test, predicted))
print("SGD Precision:", metrics.precision_score(y_test, predicted, pos_label='fact-checkable'))
print("SGD Recall:", metrics.recall_score(y_test, predicted, pos_label='fact-checkable'))

SGD Accuracy: 0.8250980392156863
SGD Precision: 0.8177215189873418
SGD Recall: 0.6814345991561181
