In [1]:
%load_ext autoreload
%autoreload 2

import sys
# Add the parent directory to the module search path (presumably so the local
# `readers` and `analysis` modules imported below can be found — confirm layout).
sys.path.append('../')

In [2]:
import json

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import spacy
from spacy.lang.nl.stop_words import STOP_WORDS

from readers import JsonReader
from analysis import TopicDetector

In [3]:
# Load the large English spaCy pipeline; its vocabulary is used further down
# to look up words and compute word-to-word similarity.
nlp = spacy.load("en_core_web_lg")
# Read the corpus through the project's JsonReader. The `subjects` argument
# presumably selects which documents are returned — confirm in `readers`.
json_reader = JsonReader(source="death_penalty.json", subjects=["death penalty", "capital punishment"])
# `texts` is later passed to the vectorizer and indexed by integer position.
texts = json_reader.get_texts()
# Alternative input kept for reference (disabled):
#from data.biomass import *
#texts = [TEXT_1, TEXT_2, TEXT_3]

In [4]:
def get_word_frame(texts, ngram, stop_words=None):
    """Build a documents-by-terms TF-IDF DataFrame for n-grams of one fixed size.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize; the result has one row per document.
    ngram : int
        N-gram size, used as both the lower and upper bound of ``ngram_range``.
    stop_words : iterable of str or str, optional
        Stop words handed to the vectorizer. When omitted, the ``STOP_WORDS``
        collection imported at the top of this notebook is used.

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF weights with one column per n-gram. When ``ngram == 1``,
        columns whose name is not purely alphabetic are left out.
    """
    if stop_words is None:
        # NOTE(review): STOP_WORDS is imported from spacy.lang.nl (Dutch) while the
        # notebook loads the English "en_core_web_lg" model — confirm this is the
        # intended list, or pass an explicit `stop_words` argument.
        stop_words = STOP_WORDS
    if not isinstance(stop_words, str):
        # scikit-learn documents `stop_words` as a list (or the string 'english');
        # recent versions reject a set, so always hand over a list.
        stop_words = list(stop_words)

    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(ngram, ngram))
    tfidf_vectors = tfidf_vectorizer.fit_transform(texts)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # when it exists and fall back to the old method on older installations.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = list(tfidf_vectorizer.get_feature_names_out())
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    # toarray() densifies the matrix: memory grows with documents x vocabulary.
    frame = pd.DataFrame(tfidf_vectors.toarray(), columns=feature_names)
    if ngram == 1:
        # Discard unigrams containing anything but letters (digits, mixed tokens, ...).
        non_alpha_features = [feature for feature in feature_names if not feature.isalpha()]
        frame = frame.drop(labels=non_alpha_features, axis=1)
    return frame

In [None]:
# Unigram TF-IDF matrix (one row per text, one column per word).
tfidf_frame = get_word_frame(texts, 1)
# Sum each word's weight over all texts and rank words from highest to lowest total.
tfidf_words_sorted = tfidf_frame.sum(axis=0).sort_values(ascending=False)

# The same ranking for bi-, tri- and tetra-grams (disabled):
#bi_frame = get_word_frame(texts, 2)
#tfidf_words_sorted_bi = bi_frame.sum(axis=0).sort_values(ascending=False)
#tri_frame = get_word_frame(texts, 3)
#tfidf_words_sorted_tri = tri_frame.sum(axis=0).sort_values(ascending=False)
#tetra_frame = get_word_frame(texts, 4)
#tfidf_words_sorted_tetra = tetra_frame.sum(axis=0).sort_values(ascending=False)

In [None]:
# Print the complete word ranking: max_rows=None lifts pandas' row truncation
# for the duration of this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    print(tfidf_words_sorted)

# Rankings for the larger n-gram sizes (disabled, see the cell above):
#tfidf_words_sorted_bi
#tfidf_words_sorted_tri
#tfidf_words_sorted_tetra

In [None]:
# Look up every ranked word in the spaCy vocabulary so similarities can be computed.
all_words = tfidf_words_sorted.index.tolist()
all_tokens = [nlp.vocab[word] for word in all_words]
# The ranking is descending, so the first 50 entries are the heaviest words.
most_important_tokens = all_tokens[:50]


def _format_similarity_lines(pairs):
    """Render (word, similarity) pairs one per line, similarity with two decimals."""
    return "\n".join(
        "{0} {1:.2f}".format(word, similarity)
        for word, similarity in pairs
    )


for important_token in most_important_tokens:
    # Score the token against every other vocabulary entry (the identity check
    # skips the token itself) and order the pairs by ascending similarity.
    similarities = sorted(
        (
            (token.text, important_token.similarity(token),)
            for token in all_tokens
            if token is not important_token
        ),
        key=lambda item: item[1],
    )
    # Tail of the ascending list, flipped so the closest word comes first.
    most_similar = similarities[-5:][::-1]

    print(important_token.text)
    print("*" * 10 + "most similar" + "*" * 10)
    print(_format_similarity_lines(most_similar))
    print("*" * 10 + "most different" + "*" * 10)
    print(_format_similarity_lines(similarities[:5]))
    print()
    print()

In [None]:
# Words in ranking order; everything after position 500 is discarded below.
all_words = list(tfidf_words_sorted.index)
least_important_words = all_words[500:]
# Dropping (rather than selecting) keeps the remaining columns in the frame's own order.
most_important_frame = tfidf_frame.drop(labels=least_important_words, axis="columns")

# Word-by-word matrix: entry (a, b) is the sum over texts of weight(a) * weight(b).
most_important_cooccurence = most_important_frame.transpose().dot(most_important_frame)
# Optional post-processing, currently disabled:
#np.fill_diagonal(most_important_cooccurence.values, 0)
#most_important_cooccurence = most_important_cooccurence.applymap(lambda v: v if v >= 0.3 else 0.0)

most_important_cooccurence

In [None]:
# Total each column of the word-by-word matrix and list the words from the
# smallest to the largest total.
mic_sum = most_important_cooccurence.sum(axis=0).sort_values()
mic_sum

In [None]:
def _text_positions_by_weight_desc(word):
    """Return text positions ordered from highest to lowest TF-IDF weight of `word`."""
    ascending_positions = tfidf_frame[word].argsort()
    return ascending_positions[::-1]


# One ranking per word of interest; position 0 is the text where the word weighs most.
life_text_ixs = _text_positions_by_weight_desc("life")
justice_text_ixs = _text_positions_by_weight_desc("justice")
court_text_ixs = _text_positions_by_weight_desc("court")
law_text_ixs = _text_positions_by_weight_desc("law")
reward_text_ids = _text_positions_by_weight_desc("reward")

In [None]:
# NOTE(review): this cell reassigns the same five names as the preceding cell but
# without the [::-1] reversal, so after a top-to-bottom run the rankings are
# ascending (position 0 = text with the lowest TF-IDF weight for the word).
# Confirm which order the lookups below are meant to use.
life_text_ixs = tfidf_frame["life"].argsort()
justice_text_ixs = tfidf_frame["justice"].argsort()
court_text_ixs = tfidf_frame["court"].argsort()
law_text_ixs = tfidf_frame["law"].argsort()
reward_text_ids = tfidf_frame["reward"].argsort()

In [None]:
# Show the text at position 0 of the "life" ranking (which end of the ranking
# that is depends on which of the two ranking cells ran last).
texts[life_text_ixs.iloc[0]]

In [None]:
# Show the text at position 2 of the "justice" ranking (ordering depends on
# which of the two ranking cells ran last).
texts[justice_text_ixs.iloc[2]]

In [None]:
# Show the text at position 0 of the "court" ranking (ordering depends on
# which of the two ranking cells ran last).
texts[court_text_ixs.iloc[0]]

In [None]:
# Show the text at position 2 of the "law" ranking (ordering depends on
# which of the two ranking cells ran last).
texts[law_text_ixs.iloc[2]]

In [None]:
# Show the text at position 5 of the "reward" ranking (ordering depends on
# which of the two ranking cells ran last).
texts[reward_text_ids.iloc[5]]

In [None]:
# Smallest value found in each column of the word-by-word matrix.
mins = most_important_cooccurence.min()
# Words whose column never reaches zero, i.e. that share weight with every other word.
nzeros = mins.loc[mins > 0]
# Remove those words on both axes; what remains are words with at least one zero entry.
frame = (
    most_important_cooccurence
    .drop(labels=nzeros.index, axis=0)
    .drop(labels=nzeros.index, axis=1)
)
frame.shape

In [None]:
# Export the filtered word matrix as a node/link graph in JSON.
# Nodes: one per column of `frame`. Links: one per (column, row) pair whose
# matrix value is exactly 0.
nodes = [{"name": column, "group": 0} for column in frame.columns]
node_names = [node["name"] for node in nodes]

# Name -> node position, built once so each link endpoint is a dict lookup
# instead of a linear list.index() scan. setdefault keeps the first position
# for a repeated name, which is what list.index() would return.
node_positions = {}
for position, name in enumerate(node_names):
    node_positions.setdefault(name, position)

# .items() instead of .iteritems(): the latter was removed in pandas 2.0,
# while .items() is available on both DataFrame and Series.
links = [
    {"source": node_positions[column], "target": node_positions[key], "value": 1}
    for column, row in frame.items()
    for key, value in row.items()
    if value == 0
]

with open("../data/cooccurence-graph.json", "w") as graph_file:
    json.dump({
        "nodes": nodes,
        "links": links
    }, graph_file, indent=4)

In [5]:
# Create the project's TopicDetector and run it on the corpus. What the call
# computes is defined in `analysis`; later cells read `td.sorted_ngrams`.
td = TopicDetector()
td(texts)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


{'cdata cdata cdata', 'international covenant civil', 'penalty information center', 'death penalty information', 'civil political rights', 'covenant civil political'}
{'cruel unusual', 'unusual punishment', 'information center', 'death row', 'cdata cdata', 'use death', 'york times', 'international covenant', 'covenant civil', 'new york', 'rights defenders', 'row phenomenon', 'row inmates', 'international human', 'support death', 'penalty cases', 'abolished death', 'row prisoners', 'abolition death', 'civil political', 'rights watch', 'death penalty', 'political rights', 'mandatory death', 'row inmate', 'inhuman degrading', 'degrading treatment', 'people death', 'seek death', 'years death', 'treatment punishment', 'abolish death', 'human rights', 'cruel inhuman', 'penalty information'}
{'department', 'support', 'scene', 'inhuman', 'evidence', 'families', 'abolished', 'http', 'took', 'trafficking', 'eye', 'drug', 'justice', 'lethal', 'murder', 'officer', 'york', 'killer', 'times', 'polit

In [11]:
# Inspect one ranking from the detector (key 4 — presumably the n-gram size);
# the other keys are kept commented out for quick switching.
#td.sorted_ngrams[2]

#td.sorted_ngrams[3]
td.sorted_ngrams[4]
#td.sorted_ngrams[1]

death penalty information center          1.041669
cdata cdata cdata cdata                   0.999750
international covenant civil political    0.970888
covenant civil political rights           0.961743
dtype: float64

In [19]:
# Gather the first 10 entries of every ranking as (topic, word count, importance).
results = [
    (topic, len(topic.split(" ")), importance)
    for _, serie in td.sorted_ngrams.items()
    for topic, importance in serie[:10].items()
]
# Longest topics first; within the same word count, highest importance first.
results.sort(key=lambda result: (result[1], result[2],), reverse=True)
    

In [16]:
results

[('said', 1, 21.037530811737554),
 ('case', 1, 12.865137530695028),
 ('time', 1, 10.095929899088123),
 ('executions', 1, 10.077820107213585),
 ('mr', 1, 9.902615493435691),
 ('just', 1, 7.911555309169307),
 ('like', 1, 7.875731745235997),
 ('government', 1, 7.743277278129479),
 ('judge', 1, 7.546269919810048),
 ('texas', 1, 7.305590258000421),
 ('capital punishment', 2, 9.775320590656285),
 ('supreme court', 2, 7.065976598922405),
 ('sentenced death', 2, 5.354700885655385),
 ('death sentence', 2, 5.260773155454801),
 ('amnesty international', 2, 5.07166512553193),
 ('united states', 2, 4.853117253939576),
 ('death sentences', 2, 4.075661771929752),
 ('year old', 2, 3.854297472649489),
 ('criminal justice', 2, 3.3567607217501307),
 ('lethal injection', 2, 3.280613337761099),
 ('use death penalty', 3, 2.455883020586902),
 ('support death penalty', 3, 2.246474513464155),
 ('death row inmates', 3, 2.1472849609099067),
 ('death penalty cases', 3, 1.841431711311112),
 ('mandatory death penal

In [20]:
# Same entries in the same order, with the word-count field left out.
[(topic, importance,) for topic, _word_count, importance in results]

[('death penalty information center', 1.0416691278266346),
 ('cdata cdata cdata cdata', 0.9997500205914415),
 ('international covenant civil political', 0.97088836028874),
 ('covenant civil political rights', 0.9617425578637402),
 ('use death penalty', 2.455883020586902),
 ('support death penalty', 2.246474513464155),
 ('death row inmates', 2.1472849609099067),
 ('death penalty cases', 1.841431711311112),
 ('mandatory death penalty', 1.5805682327037942),
 ('death row inmate', 1.4605547783248343),
 ('abolished death penalty', 1.442974231780669),
 ('human rights watch', 1.441078915050974),
 ('abolish death penalty', 1.3944340772055757),
 ('new york times', 1.3505360450430923),
 ('capital punishment', 9.775320590656285),
 ('supreme court', 7.065976598922405),
 ('sentenced death', 5.354700885655385),
 ('death sentence', 5.260773155454801),
 ('amnesty international', 5.07166512553193),
 ('united states', 4.853117253939576),
 ('death sentences', 4.075661771929752),
 ('year old', 3.8542974726