In [None]:
!git clone https://github.com/dpasch01/polarlib

In [None]:
import sys
 
# Put the directory created by the `git clone` above at the front of the module search path,
# so the `polarlib` imports in the following cells are looked up there first.
sys.path.insert(0, './polarlib')

In [None]:
# Working directory shared by every pipeline stage in this notebook; each stage receives it
# as `output_dir` (or as its first positional argument).
OUTPUT_DIR="./polar-quickstart"

# **POLAR** Quickstart

## 1. Collect **News Corpus**

In [None]:
%load_ext autoreload
%autoreload 2
    
import spacy
   
from polarlib.polar.news_corpus_collector import *

# `date` is imported explicitly so this cell does not depend on the wildcard import above
# happening to re-export it.
from datetime import date

# Search terms passed to the collector as `keywords`.
keywords = ["openai", "altman", "chatgpt", "gpt"]

# Collector configured with the dates 16 November 2023 (from_date) and 23 November 2023 (to_date).
corpus_collector = NewsCorpusCollector(
    output_dir=OUTPUT_DIR,
    from_date=date(year=2023, month=11, day=16),
    to_date=date(year=2023, month=11, day=23),
    keywords=keywords,
)

# The three collection steps, in the order this quickstart runs them.
corpus_collector.collect_archives()
corpus_collector.collect_articles(n_articles=250)
corpus_collector.pre_process_articles()

## 2. Extract **Entities** and **Noun Phrases**

In [None]:
%load_ext autoreload
%autoreload 2
    
import spacy
   
from polarlib.polar.actor_extractor import *

entity_extractor = EntityExtractor(output_dir=OUTPUT_DIR)

If using `coref=True`, then set `n_processes=1` as the coreference resolution model operates sequentially.

In [None]:
# Re-create the extractor with `coref=False`; this rebinds `entity_extractor` and therefore
# replaces the instance constructed in the previous code cell.
entity_extractor = EntityExtractor(output_dir=OUTPUT_DIR, coref=False)

In [None]:
entity_extractor.extract_entities()

In [None]:
# URI -> URI substitutions applied by the "replace" transformation.
uri_replacements = {
    "http://dbpedia.org/resource/Robert_Altman": "http://dbpedia.org/resource/Sam_Altman",
    "http://dbpedia.org/resource/Open_Archives_Initiative": "http://dbpedia.org/resource/OpenAI",
}

# URIs listed for the "delete" transformation.
uris_to_delete = ["http://dbpedia.org/resource/Japanese_honorifics"]

# Ordered (operation, argument) pairs passed to `apply_transformations` in later cells.
transformation_list = [
    ("replace", uri_replacements),
    ("delete", uris_to_delete),
]

In [None]:
entity_extractor.apply_transformations(transformation_list)

In [None]:
# `os` is imported explicitly: the notebook never imports it and previously relied on it
# leaking in through one of the `from polarlib... import *` statements.
import itertools
import json
import os

from tqdm import tqdm

# Flat list holding the `title` of every entity found in the files under OUTPUT_DIR/entities.
article_entities_list = []

for root, folders, files in tqdm(list(os.walk(os.path.join(OUTPUT_DIR, 'entities')))):

    for file_name in files:

        path = os.path.join(root, file_name)

        with open(path, 'r') as f:

            entities = json.load(f)

        # A payload that decodes to a string is JSON that was encoded twice; decode it once more.
        if isinstance(entities, str):

            entities = json.loads(entities)

        # `entities['entities']` is a list of records that each carry their own 'entities' list;
        # flatten one level and keep only the titles.
        sentence_entities = itertools.chain.from_iterable(s['entities'] for s in entities['entities'])

        article_entities_list.extend(e['title'] for e in sentence_entities)

In [None]:
from collections import Counter

# Print the 50 most frequent entity titles together with their occurrence counts.
entity_counts = Counter(article_entities_list)

for title, count in entity_counts.most_common(50):
    print('- {0:100} {1}'.format(title, count))

In [None]:
%load_ext autoreload
%autoreload 2

from polarlib.polar.actor_extractor import *

noun_phrase_extractor = NounPhraseExtractor(output_dir=OUTPUT_DIR)

In [None]:
noun_phrase_extractor.extract_noun_phrases()

In [None]:
noun_phrase_extractor.apply_transformations(transformation_list)

## 3. Identify **Discussion Topics**

Set `llama_wv=True` to construct the topical clusters from llama-based semantic vector embeddings. Otherwise, the default `all-mpnet-base-v2` sentence-transformers model is used.

In [None]:
%load_ext autoreload
%autoreload 2
   
from polarlib.polar.topic_identifier import *

In [None]:
# llama_wv=True selects the llama-based embeddings mentioned above rather than the default
# sentence-transformers model.
topic_identifier = TopicIdentifier(output_dir = OUTPUT_DIR, llama_wv=True)

# Encode the noun phrases first, then cluster them.
# NOTE(review): the exact meaning of threshold=0.8 is defined inside TopicIdentifier — confirm there.
topic_identifier.encode_noun_phrases()
topic_identifier.noun_phrase_clustering(threshold=0.8)

#### Filter Topics according to **Contextual Relevance**

One way to improve the quality of the topics is to calculate their `contextual relevance` to the case study. To do so, we must extract the `seed words` and keep only the topical clusters that are relevant to our case study.

Specifically, we use the `KeyBERT` library to extract key phrases from each article. We then iterate over the topical clusters and identify those that are contextually relevant, according to their average semantic distance from each phrase.

In [None]:
def replace_entities_with_placeholder(data, placeholder="[ENTITY]"):
    """Return ``data['sentence']`` with every entity span replaced by ``placeholder``.

    ``data`` is a mapping with the keys ``'sentence'`` (the text), ``'entities'``
    (records carrying ``'begin'`` and ``'end'`` offsets) and ``'from'`` (the value
    subtracted from those offsets to obtain positions inside the sentence).
    ``'from'`` is only read when there is at least one entity.

    The input mapping is not modified; the rewritten sentence is returned.
    """

    text = data['sentence']

    # Work from the right-most span to the left-most one, so that a replacement never
    # shifts the offsets of the spans that are still to be processed.
    right_to_left = sorted(data['entities'], key=lambda span: span['begin'], reverse=True)

    for span in right_to_left:

        begin, end = span['begin'], span['end']
        offset     = data['from']

        prefix = text[:begin - offset]
        suffix = text[end - offset:]

        text = prefix + placeholder + suffix

    return text

In [None]:
import json

# One document per entity file: each record's sentence with its entity spans masked by
# "######", joined with blank lines.
docs = [
    '\n\n'.join(
        replace_entities_with_placeholder(sentence, placeholder="######")
        for sentence in load_article(path)['entities']
    )
    for path in noun_phrase_extractor.entity_paths
]

In [None]:
seed_phrases = TopicIdentifier.extract_seed_phrases(docs, top_n=20)

In [None]:
contextual_relevance_scores = TopicIdentifier.calculate_contextual_relevance_scores(OUTPUT_DIR, seed_phrases)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the contextual relevance scores; the count axis is logarithmic.
fig, ax = plt.subplots(figsize=(16, 4))

ax.hist(contextual_relevance_scores.values(), rwidth=0.95, edgecolor='black', log=True)
ax.set_title("Contextual Relevance Scores")
ax.set_xlabel("Score")
ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()

In [None]:
"""TopicIdentifier.contextual_relevance_filtering(output_dir, contextual_relevance_scores, thr=0.0)"""

In [None]:
# `gzip` and `os` are imported explicitly: the notebook never imports them and previously
# relied on them leaking in through a wildcard import.
import gzip
import json
import os

# Read OUTPUT_DIR/topics.json.gz as UTF-8 text. Later cells index the result by topic and
# read each entry's 'noun_phrases' list.
with gzip.open(os.path.join(OUTPUT_DIR, 'topics.json.gz'), 'rt', encoding='utf-8') as f:
    polar_topics = json.load(f)

## 4. Extract **Sentiment Attitudes**

In [None]:
import spacy

spacy_nlp  = spacy.load("en_core_web_sm")

In [None]:
%load_ext autoreload
%autoreload 2

from polarlib.polar.attitude.syntactical_sentiment_attitude import *

# Path of the MPQA subjectivity-clues lexicon (.tff file) handed to the pipeline as `mpqa_path`.
# NOTE(review): this relative path looks specific to the original author's directory layout —
# confirm it and point it at your local copy of the lexicon.
mpqa_lexicon_path = "PARALLAX/Secondary/../../Sentiment Attitude Classification/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff"

sentiment_attitude_pipeline = SyntacticalSentimentAttitudePipeline(
    output_dir=OUTPUT_DIR,
    nlp=spacy_nlp,
    mpqa_path=mpqa_lexicon_path,
)

In [None]:
sentiment_attitude_pipeline.calculate_sentiment_attitudes()

## 5. Construct **Sentiment Attitude Graph (SAG)**

In [None]:
%load_ext autoreload
%autoreload 2

from polarlib.polar.coalitions_and_conflicts import *
from polarlib.polar.sag_generator import *

In [None]:
sag_generator = SAGGenerator(OUTPUT_DIR)

sag_generator.load_sentiment_attitudes()

In [None]:
bins = sag_generator.calculate_attitude_buckets(verbose=True, figsize=(16, 4))

In [None]:
# For every pair, count how many of its recorded attitude values are non-zero.
pair_frequency_dict = {
    pair: sum(1 for attitude in pair_attitudes if attitude != 0)
    for pair, pair_attitudes in sag_generator.pair_sentiment_attitude_dict.items()
}

Apply the PARALLAX encoding: $\alpha (e_i, e_j) \geq thr$

We calculate $thr$ as the average of the median values of the positive ($A^+$) and negative ($A^-$) attitudes.

In [None]:
import itertools

attitudes = list(itertools.chain.from_iterable(sag_generator.pair_sentiment_attitude_dict.values()))

In [None]:
# `numpy` is imported explicitly: the notebook never imports it and previously relied on it
# leaking in through a wildcard import.
import numpy

# Split the pooled attitude values by sign; zeros belong to neither set.
a_plus  = [a for a in attitudes if a > 0]
a_minus = [a for a in attitudes if a < 0]

# Compute each median once and reuse it for both the threshold and the report below.
median_plus  = numpy.median(a_plus)
median_minus = numpy.median(a_minus)

# Threshold = average of the positive and the negative median.
a_thr = (median_plus + median_minus) / 2

print("median +:", median_plus)
print("median -:", median_minus)
print("thr     :", a_thr)

In [None]:
# Upper bound (exclusive) for the zoomed-in second histogram.
ZOOM_MAX_FREQUENCY = 50

# Number of non-zero attitude values recorded for each pair.
pair_frequencies = list(pair_frequency_dict.values())

# Same plot twice: once over all pairs, once restricted to the low-frequency range.
# Both figures get a title and axis labels so they can be told apart when skimming.
histogram_specs = (
    (pair_frequencies, "Non-zero attitudes per pair (all pairs)"),
    (
        [v for v in pair_frequencies if v < ZOOM_MAX_FREQUENCY],
        "Non-zero attitudes per pair (pairs with fewer than {0})".format(ZOOM_MAX_FREQUENCY),
    ),
)

for frequencies, title in histogram_specs:

    plt.figure(figsize=(16, 3))

    plt.hist(frequencies, rwidth=0.95, log=True)
    plt.title(title)
    plt.xlabel("Non-zero attitudes recorded for a pair")
    plt.ylabel("Number of pairs (log scale)")

    plt.show()

In [None]:
# Map attitude values to sign categories using the threshold `a_thr` computed above:
#   NEGATIVE is given the interval (-1.00, a_thr),
#   POSITIVE is given the interval (a_thr, 1.00),
#   NEUTRAL  is given the degenerate interval (a_thr, a_thr).
# NOTE(review): whether the interval edges are inclusive, and what exactly minimum_frequency=5
# filters, is decided inside convert_attitude_signs — confirm there.
sag_generator.convert_attitude_signs(
    bin_category_mapping = {
        "NEGATIVE":  [(-1.00, a_thr)],
        "NEUTRAL":   [(a_thr, a_thr)],
        "POSITIVE":  [(a_thr, 1.00)]
    },
    minimum_frequency    = 5,
    verbose              = True
)

In [None]:
G, node_to_int, int_to_node = sag_generator.construct_sag()

In [None]:
print('Number of Nodes:', G.number_of_nodes())
print('Number of Edges:', G.number_of_edges())

## 6. Generate the **Entity Fellowships**

In [None]:
%load_ext autoreload
%autoreload 2
    
from polarlib.polar.coalitions_and_conflicts import *

# Build the fellowship extractor on the shared output directory.
fellowship_extractor = FellowshipExtractor(OUTPUT_DIR)

# Extract the fellowships; the result is reused by later cells (e.g. `fellowships[9]`).
fellowships = fellowship_extractor.extract_fellowships(
    n_iter      = 10,
    resolution  = 0.075,
    merge_iter  = 10,
    # NOTE(review): absolute path from the original author's machine; it has to be changed
    # before this cell can run elsewhere. Presumably it is the directory holding the JAR the
    # extractor invokes — confirm.
    jar_path    ='/home/dpasch01/pycharm/polar-framework/',
    verbose     = True,
    output_flag = True
)

## 7. Generate the **Fellowship Dipoles**

In [None]:
import sys
 
# NOTE(review): absolute path from the original author's machine. Inserting it at position 0
# places it ahead of the './polarlib' entry added at the top of the notebook, so on a machine
# where it exists, `polarlib` imports may resolve to that checkout instead of the clone —
# confirm whether this cell is still needed.
sys.path.insert(0, '/home/dpasch01/pycharm/polar-framework')

In [None]:
# Same value as the OUTPUT_DIR assigned at the top of the notebook.
# NOTE(review): presumably repeated so the notebook can be resumed from this section — confirm,
# and keep the two assignments in sync.
OUTPUT_DIR="./polar-quickstart"

In [None]:
%load_ext autoreload
%autoreload 2
    
from polarlib.polar.coalitions_and_conflicts import *

dipole_generator = DipoleGenerator(OUTPUT_DIR)
dipoles          = dipole_generator.generate_dipoles(f_g_thr=0.7, n_r_thr=0.5)

In [None]:
# Rank the dipoles by the 'neg' value of their second element, highest first, and display
# the one at index 3 (the fourth in that ranking).
sorted(dipoles, key=lambda d: d[1]['neg'], reverse=True)[3]

## 8. Calculate the **Topical Attitudes**

In [None]:
%load_ext autoreload
%autoreload 2
    
from polarlib.polar.coalitions_and_conflicts import *

topic_attitude_calculator = TopicAttitudeCalculator(OUTPUT_DIR)

In [None]:
print('Number of NPs:   ', len(topic_attitude_calculator.np_topics_dict))
print('Number of Topics:', len(topic_attitude_calculator.topics))

In [None]:
print('Number of Attitude Paths:', len(topic_attitude_calculator.attitude_path_list))

In [None]:
topic_attitude_calculator.load_sentiment_attitudes()

In [None]:
dipole_topics_dict = topic_attitude_calculator.get_polarization_topics()

In [None]:
dipole_topics_dict = topic_attitude_calculator.dipole_topics_dict

In [None]:
dipoles = topic_attitude_calculator.dipoles

In [None]:
topic_attitudes = topic_attitude_calculator.get_topic_attitudes()

# **PRISM** Quickstart

In [None]:
%load_ext autoreload
%autoreload 2

from polarlib.prism.polarization_knowledge_graph import *

pkg = PolarizationKnowledgeGraph(output_dir = OUTPUT_DIR)

In [None]:
pkg.construct()

In [None]:
print('Nodes:', pkg.pkg.number_of_nodes())
print('Edges:', pkg.pkg.number_of_edges())

## **Entity-level** Polarization Analytics

In [None]:
%load_ext autoreload
%autoreload 2

from polarlib.prism.multi_level_polarization import POLEExecutor

In [None]:
!git clone https://github.com/zexihuang/POLE

In [None]:
%load_ext autoreload
%autoreload 2

from polarlib.prism.multi_level_polarization import EntityLevelPolarizationAnalyzer

entity_level_analyzer = EntityLevelPolarizationAnalyzer()

In [None]:
df = entity_level_analyzer.analyze(pkg, pole_path='./', output_dir=OUTPUT_DIR)

In [None]:
import pandas as pd

from tabulate import tabulate

df.sample(4)

#### Find **<span style="background-color:blue; color:white;">Protagonists</span>**

In [None]:
# Rows whose 'pos.' + 'neg.' is above 0, ordered by descending 'score'; print the first five.
has_attitudes = (df['pos.'] + df['neg.']) > 0
protagonists  = df[has_attitudes].sort_values(by=['score'], ascending=[False])

print(tabulate(protagonists[['entity', 'ssa', 'mu', 'pos.', 'neg.']].iloc[:5], headers='keys', tablefmt='grid'))

#### Find **<span style="background-color:red; color:white;">Antagonists</span>**

In [None]:
# Rows whose 'pos.' + 'neg.' is above 0, ordered by ascending 'score'; print the first five.
has_attitudes = (df['pos.'] + df['neg.']) > 0
antagonists   = df[has_attitudes].sort_values(by=['score'], ascending=[True])

print(tabulate(antagonists[['entity', 'ssa', 'mu', 'pos.', 'neg.']].iloc[:5], headers='keys', tablefmt='grid'))

#### Most **Polarized** Entities

In [None]:
# Rows whose 'pos.' + 'neg.' is above 5, ordered by descending 'mu'; print the first five.
has_enough_attitudes = (df['pos.'] + df['neg.']) > 5
most_polarized       = df[has_enough_attitudes].sort_values(by=['mu'], ascending=[False])

print(tabulate(most_polarized[['entity', 'ssa', 'mu', 'pos.', 'neg.']].iloc[:5], headers='keys', tablefmt='grid'))

## **Group-level** Polarization Analytics

In [None]:
import polarlib.prism.cohesiveness.cohesiveness as cohesiveness

Set `cohesiveness.DOWNLOAD_FLAG=True` if this is the first time you execute the pipeline for this approach. This fetches the wiki pages of the entities in order to determine their political affiliations (where possible).

In [None]:
cohesiveness.DOWNLOAD_FLAG = False

In [None]:
from polarlib.prism.multi_level_polarization import GroupLevelPolarizationAnalyzer

group_analyzer = GroupLevelPolarizationAnalyzer()

# Run the group-level analysis; it returns two data frames (`coh_df`, `att_df`).
# NOTE(review): `download_flag=True` is passed here although `cohesiveness.DOWNLOAD_FLAG` was
# set to False in the cell above — confirm which of the two settings is intended.
coh_df, att_df = group_analyzer.analyze(pkg, output_dir=OUTPUT_DIR, download_flag=True, wlpa_flag=True)

In [None]:
att_df['representation'] = att_df['topic'].apply(lambda t: ', '.join(polar_topics[t]['noun_phrases'][:3]))

In [None]:
att_df.sort_values(by=['attitude_population', 'member_ratio'], ascending=[False, False]).iloc[:10]

In [None]:
# List the members of fellowships 9 and 11, separated by one blank line.
for position, fellowship_index in enumerate((9, 11)):

    if position > 0: print()

    for e in fellowships[fellowship_index]: print('-', e)

## **Topic-level** Polarization Analytics

In [None]:
%load_ext autoreload
%autoreload 2

from polarlib.prism.multi_level_polarization import TopicLevelPolarizationAnalyzer

topic_analyzer = TopicLevelPolarizationAnalyzer()

In [None]:
local_df, global_df = topic_analyzer.analyze(pkg)

In [None]:
global_df['representation'] = global_df['topic'].apply(lambda t: ', '.join(polar_topics[t]['noun_phrases'][:3]))

In [None]:
# Keep the topics whose contextual relevance score is strictly positive.
relevant_topics = [topic for topic, score in contextual_relevance_scores.items() if score > 0.0]

In [None]:
# Restrict the global table to the relevant topics, order by descending 'score',
# and display the first 25 rows of the selected columns.
relevant_rows = global_df[global_df['topic'].isin(relevant_topics)]
ranked_rows   = relevant_rows.sort_values(by='score', ascending=False)

ranked_rows[['topic', 'obst', 'mt', 'score', 'representation']].iloc[:25]