In [None]:
import os
import sys
import math
import logging
from pathlib import Path
from strictyaml import YAMLValidationError

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Global seaborn / matplotlib appearance for every figure in this notebook.
# NOTE(review): per the seaborn documentation, sns.set() (alias of set_theme)
# re-applies its own context and style arguments on every call, so the two
# sns.set(...) calls below presumably override the preceding
# set_context("poster") and set_style("whitegrid") — confirm which
# context/style combination is actually intended.
sns.set_context("poster")
sns.set(rc={'figure.figsize': (16, 9.)})  # default figure size: 16 x 9 inches
sns.set_style("whitegrid")
sns.set(style="ticks", color_codes=True)

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

import textacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
from ana_consult import _, __version__
from ana_consult.ac_conf import AnaConsultConf

# Application name, used as the prefix of this notebook's logger name
APP_NAME = "mtes_analyze"
logger = logging.getLogger(APP_NAME + ".cluster")
# YAML configuration file handed to AnaConsultConf
cfg = ".mtes.yaml"
logger.info(_("Getting configuration data from %s"), cfg)
try:
    config = AnaConsultConf(cfg)
except YAMLValidationError:
    logger.critical(_("Incorrect content in YAML configuration %s"), cfg)
    # An invalid configuration is a failure: exit with a non-zero status so
    # that whatever launched this code does not mistake it for a success
    # (the previous exit status 0 signalled a clean run).
    sys.exit(1)


In [None]:
# Prepare NLP processing
logger.info(_("Preparing NLP text processing"))
# Load the "fr_core_news_sm" spaCy model through textacy with the tagger,
# parser and NER pipeline components disabled.
fr_nlp = textacy.load_spacy_lang(
    "fr_core_news_sm", disable=("tagger", "parser", "ner")
)
logger.info(_("NLP pipeline: %s"), fr_nlp.pipe_names)
# Adjust stopwords for this specific topic
# Words added to the language's default stop-word set.
fr_nlp.Defaults.stop_words |= {"y", "france", "italie"}
# "contre" is removed from the stop-word set — presumably because it carries
# opinion signal ("against") for this consultation; confirm.
# NOTE(review): both edits touch the language defaults after the pipeline has
# been loaded; whether later stop-word filtering sees them may depend on the
# installed spaCy version — TODO confirm.
fr_nlp.Defaults.stop_words -= {"contre"}


In [None]:
# Locate the serialized corpus of the configured consultation under the
# user's home directory, then load it with the French pipeline.
corpus_file = Path.home().joinpath(
    "ana_consult/data/interim/" + config.consultation_name + "_doc.pkl"
)
logger.info(_("Loading corpus from %s"), corpus_file)
corpus = textacy.Corpus.load(fr_nlp, corpus_file)
logger.info(_("Document size: %s"), corpus)

In [None]:
# Reduce every document to a space-separated string of lemmas and pair it
# with the opinion stored in the document metadata.
logger.info(_("Simplifying corpus"))
# Term extraction options: unigram lemmas only, no entities, with stop words,
# punctuation and numbers filtered out.
lemma_term_kwargs = dict(
    ngrams=1,
    entities=False,
    normalize="lemma",
    as_strings=True,
    filter_stops=True,
    filter_punct=True,
    filter_nums=True,
)
doc_lemma = pd.DataFrame(
    [
        [
            " ".join(doc._.to_terms_list(**lemma_term_kwargs)),
            doc._.meta["opinion"],
        ]
        # The slice caps processing at the first 1,000,000 documents.
        for doc in corpus[:1000000]
    ],
    columns=["text", "opinion"],
)
print(doc_lemma.head(20))

In [None]:
# Keep only rows without missing values and summarise the opinion column.
doc_lemma_cls = doc_lemma.dropna(axis=0, how="any")
print(doc_lemma_cls["opinion"].describe())
# Binary encoding of the opinion: 0 for "Favorable", 1 for anything else.
true_labels = [
    int(opinion != "Favorable") for opinion in doc_lemma_cls["opinion"]
]

In [None]:
# TF-IDF over 1- to 3-grams of the lemmatised text. max_df / min_df are
# document-frequency bounds (0.9 and 0.05) applied when building the
# vocabulary; stop words were already filtered upstream, hence stop_words=None.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.9, min_df=0.05, stop_words=None, use_idf=True, ngram_range=(1, 3)
)
# Fit vectoriser to NLP processed column
logger.info(_("Fitting TF-IDF vectorizer to NLP data"))
tfidf_matrix = tfidf_vectorizer.fit_transform(doc_lemma_cls.text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the legacy method otherwise, so the cell runs on both.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
# Going through a list keeps the same string dtype the legacy call produced.
terms = np.array(list(feature_names))
logger.info(_("TF-IDF (n_samples, n_features): %s"), tfidf_matrix.shape)

In [None]:
# Display the raw `.data` array of the TF-IDF matrix (for a SciPy sparse matrix
# this holds the explicitly stored weights — assumes fit_transform returned
# a sparse matrix; confirm).
tfidf_matrix.data

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    Parameters
    ----------
    row : 1-D array of tf-idf scores, one per feature.
    features : sequence of feature names aligned with ``row``.
    top_n : maximum number of (feature, score) pairs to return.

    Returns
    -------
    pandas.DataFrame with columns ``feature`` and ``tfidf``, ordered by
    decreasing score. The frame is empty (but still has both columns) when
    ``row`` is empty or ``top_n`` is 0.
    '''
    # argsort is ascending, so reverse it to get the largest scores first.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: with no rows the frame has zero
    # columns, and assigning two names to df.columns afterwards would raise.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row).

    Parameters
    ----------
    Xtr : matrix of tf-idf scores exposing ``toarray()`` on its rows.
    features : sequence of feature names aligned with the columns of ``Xtr``.
    row_id : index of the document (row) to inspect.
    top_n : maximum number of features to return.

    Returns
    -------
    The DataFrame produced by ``top_tfidf_feats`` for that row.
    '''
    # ravel() always yields a 1-D vector. squeeze() would collapse the row of a
    # single-feature matrix to a 0-d array, which cannot be indexed per feature.
    row = Xtr[row_id].toarray().ravel()
    return top_tfidf_feats(row, features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Parameters
    ----------
    Xtr : matrix of tf-idf scores exposing ``toarray()``.
    features : sequence of feature names aligned with the columns of ``Xtr``.
    grp_ids : row selector passed to ``Xtr[...]`` (list, NumPy array, or the
        tuple returned by ``np.where``). Only ``None`` selects every row.
    min_tfidf : scores strictly below this value are zeroed before averaging.
    top_n : maximum number of features to return.

    Returns
    -------
    The DataFrame produced by ``top_tfidf_feats`` for the per-feature means.
    '''
    # Compare against None explicitly: truth-testing a NumPy index array with
    # more than one element raises, and an empty selection must not silently
    # fall back to "all rows".
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # Ignore weak scores so they do not contribute to the mean.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label.

    Parameters
    ----------
    Xtr : matrix of tf-idf scores, one row per document.
    y : class label of each document (list, NumPy array or pandas Series),
        in the same order as the rows of ``Xtr``.
    features : sequence of feature names aligned with the columns of ``Xtr``.
    min_tfidf : forwarded to ``top_mean_feats``.
    top_n : forwarded to ``top_mean_feats``.

    Returns
    -------
    list of DataFrames, one per distinct label in the order given by
    ``np.unique``; each carries its label in a ``label`` attribute.
    '''
    # Work on an array so that the comparison below is element-wise even for
    # a plain Python list (list == scalar is a single False, which would make
    # every class look empty). Positions, not index labels, are what matter.
    y_values = np.asarray(y)
    dfs = []
    for label in np.unique(y_values):
        # Row positions of the documents belonging to this class, kept in the
        # tuple form returned by np.where.
        ids = np.where(y_values == label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # Stored as a plain attribute (not a column); plot_tfidf_classfeats_h
        # reads it back as df.label for the subplot title.
        feats_df.label = label
        dfs.append(feats_df)
    return dfs


In [None]:
# Display the top mean TF-IDF features for each distinct opinion value
# (one DataFrame per class).
top_feats_by_class(tfidf_matrix, doc_lemma_cls.opinion, terms)

In [None]:
def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function top_feats_by_class().

    Draws one horizontal bar chart per data frame, side by side in a single
    figure, and shows it.

    Parameters
    ----------
    dfs : list of DataFrames with ``feature`` and ``tfidf`` columns and a
        ``label`` attribute used for the subplot title.
    '''
    if len(dfs) == 0:
        # Nothing to draw: skip figure creation instead of failing on dfs[0].
        return

    fig = plt.figure(figsize=(12, 9), facecolor="w")
    for i, df in enumerate(dfs):
        # Bar positions are computed per frame so that frames of different
        # lengths are each plotted against their own features.
        y_pos = np.arange(len(df))
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(y_pos, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(y_pos)
        # One unit of padding below the first and above the last bar.
        ax.set_ylim([-1, len(df)])
        ax.set_yticklabels(df.feature)
    # Layout is a property of the whole figure, so adjust it once after all
    # subplots exist rather than on every iteration.
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()

In [None]:
# Plot the per-class top TF-IDF features as side-by-side horizontal bar charts.
plot_tfidf_classfeats_h(top_feats_by_class(tfidf_matrix, doc_lemma_cls.opinion, terms))