# Aim

For each article, we extracted all keyword occurrences, identified the sentences containing them, and generated separate keyword-sentence pairs ("opinion contexts").

This dataset of keyword-sentence pairs was then manually coded for sentiment by researchers.

During the manual coding, the dataset was further cleaned and refined as, for instance, duplicate articles were still found and removed. This is because LexisNexis captured updates to the same article as separate entries.

The final dataset used for analysis is the dataset based on the keyword-sentence pairs (full text was also a column in that data) after manual coding and inspection.

# Set up and get data

In [None]:
import os
import pickle

In [None]:
import numpy as np
import pandas as pd

In [None]:
import psutil

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.offline as pyo
# Initialise plotly's offline notebook mode for this session.
pyo.init_notebook_mode()

In [None]:
# Show the full content of text columns instead of truncating long cells.
pd.options.display.max_colwidth = None

### Constant

In [None]:
# Directory of the interim data files, read from the environment
# (None when DIR_DATA_INTERIM is not set).
DIR_DATA = os.getenv("DIR_DATA_INTERIM")

In [None]:
# Directory of the extra data files, read from the environment
# (None when DIR_DATA_EXTRA is not set).
DIR_DATA_EXTRA = os.getenv("DIR_DATA_EXTRA")

In [None]:
# prominence: names of the csv files (inside DIR_DATA) that are loaded
# into `term_freqs`, `doc_freqs` and `metrics` in the import section
term_freqs_nm = "kword_rawfreq_2W-MON.csv"
doc_freqs_nm = "kword_docfreq_2W-MON.csv"
metrics_nm = "kword_rfrdf_2W-MON.csv"

In [None]:
# sentiment: name of the pickle file (inside DIR_DATA) that is loaded
# into `sentiments` in the import section
sentiment_nm = "preproc_kword_sent.pickle"

In [None]:
# Keywords dropped from the analysis because they occur too rarely in the corpus.
EXCLUDE_KWORDS = [
    "behav_insight",
    "behavioural_economist",
    "behav_analysis",
    "chater",
    "american_behav_scientists",
    "irrational_econ",
    "nudge_choice",
]

In [None]:
# Values of `kword` that are not treated as keywords; rows carrying them are
# filtered out of `sentiments` (and `opinions_data`) further down.
EXCLUDE_NONKWORDS = ["herd_immunity", "behavioural_fatigue"]

## Import data

In [None]:
# Load the csv named by `doc_freqs_nm` from DIR_DATA.
doc_freqs = pd.read_csv(os.path.join(DIR_DATA, doc_freqs_nm))

In [None]:
# Load the csv named by `term_freqs_nm` from DIR_DATA.
term_freqs = pd.read_csv(os.path.join(DIR_DATA, term_freqs_nm))

In [None]:
# Load the csv named by `metrics_nm` from DIR_DATA.
metrics = pd.read_csv(os.path.join(DIR_DATA, metrics_nm))

In [None]:
# Unpickle the sentiment table from DIR_DATA.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open(os.path.join(DIR_DATA, sentiment_nm), mode="rb") as pickle_file:
    sentiments = pickle.load(pickle_file)

In [None]:
# save as csv for visual inspection by others
# NOTE(review): the file name is spelled "preprc" while the pickle uses "preproc" - confirm which is intended
sentiments.to_csv(os.path.join(DIR_DATA, "preprc_kword_sent.csv"))


### Number of keywords per opinion context

Remove non keywords: "herd_immunity" and "behavioural_fatigue"

In [None]:
sentiments[sentiments.kword.isin(EXCLUDE_NONKWORDS)].shape

In [None]:
sentiments[~sentiments.kword.isin(EXCLUDE_NONKWORDS)].shape

In [None]:
nonkeyword_cases = sentiments[sentiments.kword.isin(EXCLUDE_NONKWORDS)]

In [None]:
sentiments = sentiments[~sentiments.kword.isin(EXCLUDE_NONKWORDS)].copy()

In [None]:
sentiments[['article_id', 'opinion_context_id']].drop_duplicates().nunique()

Before removing keywords based on low frequency

In [None]:
sentiments.columns

In [None]:
sentiments = sentiments[['article_id', 'title', 'pub_date', 'subkword', 'opinion_context', 'opinion_context_id',
       'keyword_sentiment', 'refers_to_gov', 'gov_sentiment', 'kword', 'pub_date_dt']].copy()

### After removing opinion context that contained "behavioural fatigue" or "herd immunity"
641 articles were left
This means that 6 articles only included "behavioural fatigue" or "herd immunity" as keywords

In [None]:
sentiments.article_id.nunique()

In [None]:
sentiments.shape

It means 6 articles only contained "behavioural fatigue" or "herd immunity". Which ones?

In [None]:
nonkeyword_cases[~nonkeyword_cases.title.isin(sentiments.title)].title.nunique()

In [None]:
nonkeyword_cases[~nonkeyword_cases.title.isin(sentiments.title)].title.unique()

#### Number of sentences-keyword pairs

In [None]:
# Number of distinct (non-missing) opinion context ids
sentiments["opinion_context_id"].nunique()

Doesn't match number of rows, why?

In [None]:
# Inspect the rows ordered by opinion context id
sentiments.sort_values(by="opinion_context_id")

Some `opinion_context_id` are NA values.

In [None]:
# Shape of the subset with a missing opinion_context_id.
# Series.isna() is used instead of np.isnan(): it also works when the column has
# object dtype and it recognises None / pd.NA as missing, where np.isnan raises.
sentiments[sentiments['opinion_context_id'].isna()].shape

In [None]:
# duplicate opinion_context_id
sentiments[sentiments.duplicated('opinion_context_id')].opinion_context_id.tolist()


In [None]:
len(sentiments[sentiments.duplicated('opinion_context_id')].opinion_context_id.tolist())

In [None]:
# NOTE(review): hard-coded numbers - presumably the unique-id count plus the
# duplicate count from the cells above; confirm against the current data.
1457+146

In [None]:
# every row whose opinion_context_id occurs more than once, ordered by id
sentiments.loc[
    sentiments["opinion_context_id"].isin(
        sentiments.loc[sentiments.duplicated("opinion_context_id"), "opinion_context_id"].tolist()
    )
].sort_values(by="opinion_context_id")

This may be because the two batches were merged, each with opinion context ids starting from 1.

### Give unique id for each sentence-subkeyword pair

In [None]:
import random

In [None]:
# Create the id column filled with NaN; the ids are assigned in the next cell.
sentiments['subkw_sentence_pair_id'] = np.nan

In [None]:
# One id per (opinion_context, subkword) group.
# ngroup() numbers the groups, so the ids are unique per group and identical on
# every run; the previous random.random() ids changed between runs and were not
# guaranteed to be unique.
# Rows with a missing grouping key (labelled -1 or NaN by ngroup(), depending on
# the pandas version) are set to NaN so nunique()/count() do not see them as a group.
sentiments['subkw_sentence_pair_id'] = (
    sentiments.groupby(['opinion_context', 'subkword'])
    .ngroup()
    .where(lambda group_no: group_no >= 0)
)

In [None]:
# Number of distinct sentence / sub-keyword pair ids
sentiments.loc[:, "subkw_sentence_pair_id"].nunique()

In [None]:
# Write every row whose pair id occurs more than once to a csv for inspection
sentiments.loc[
    sentiments["subkw_sentence_pair_id"].isin(
        sentiments.loc[sentiments.duplicated("subkw_sentence_pair_id"), "subkw_sentence_pair_id"]
    )
].to_csv("../../sandpit/news_still_duplicates.csv")

In [None]:
# Show the current working directory (relative output paths are resolved against it).
os.getcwd()

#### need to consider article_id as well

As some opinion contexts are repeated

In [None]:
# One id per (article_id, opinion_context, subkword) group.
# ngroup() numbers the groups, so the ids are unique per group and identical on
# every run; the previous random.random() ids changed between runs and were not
# guaranteed to be unique.
# Rows with a missing grouping key (labelled -1 or NaN by ngroup(), depending on
# the pandas version) are set to NaN so nunique()/count() do not see them as a group.
sentiments['subkw_sentence_pair_id'] = (
    sentiments.groupby(['article_id', 'opinion_context', 'subkword'])
    .ngroup()
    .where(lambda group_no: group_no >= 0)
)

In [None]:
# Number of distinct pair ids once article_id is part of the key
sentiments.loc[:, "subkw_sentence_pair_id"].nunique()

## ok

### Number of unique sentences

In [None]:
sentiments.opinion_context.nunique()

In [None]:
sorted(sentiments.opinion_context)

In [None]:
sentiments[['subkword', 'kword', 'opinion_context_id', 'opinion_context']].sort_values('opinion_context')

In [None]:
pd.DataFrame(sentiments.groupby("opinion_context").subkword.value_counts())

#### Apply an id to each sentence

We need to consider sentence - article for unique ids

In [None]:
import random

In [None]:
# Number of distinct sentence texts (ignores which article they come from)
sentiments["opinion_context"].nunique()

In [None]:
# Create the id column filled with NaN; the ids are assigned in the next cell.
sentiments['sentence_id'] = np.nan

In [None]:
# One id per (article_id, title, opinion_context, pub_date_dt) group, so the same
# sentence text in different articles gets different ids.
# ngroup() numbers the groups, so the ids are unique per group and identical on
# every run; the previous random.random() ids changed between runs and were not
# guaranteed to be unique.
# Rows with a missing grouping key (labelled -1 or NaN by ngroup(), depending on
# the pandas version) are set to NaN so nunique()/count() do not see them as a group.
sentiments['sentence_id'] = (
    sentiments.groupby(['article_id', 'title', 'opinion_context', 'pub_date_dt'])
    .ngroup()
    .where(lambda group_no: group_no >= 0)
)

In [None]:
# Number of distinct sentence ids
sentiments.loc[:, "sentence_id"].nunique()

In [None]:
from src.news_media.get_keywords_trend import *

In [None]:
uk_news = NewsArticles()

In [None]:
opinions_data = uk_news.data_raw.drop('full_text', axis=1).copy()

In [None]:
opinions_data = opinions_data[~opinions_data['kword'].isin(EXCLUDE_NONKWORDS)].copy()

In [None]:
opinions_data[['article_id', 'title', 'opinion_context', 'pub_date_dt']].drop_duplicates()

In [None]:
opinions_data.shape

In [None]:
opinions_data[['article_id', 'title', 'opinion_context', 'pub_date_dt']].drop_duplicates()

In [None]:
# number of subkeywords per sentence
sentiments.groupby("sentence_id").subkword.count().values

In [None]:
len(sentiments.groupby("sentence_id").subkword.count().values)

In [None]:
min(sentiments.groupby("sentence_id").subkword.count().values)

In [None]:
max(sentiments.groupby("sentence_id").subkword.count().values)

In [None]:
np.median(sentiments.groupby("sentence_id").subkword.count().values)

In [None]:
# histogram of the sub-keyword count per sentence id
fig, ax = plt.subplots(figsize=(10, 10))

ax.hist(sentiments.groupby("sentence_id")["subkword"].count().values, alpha=0.5)

plt.show()

# Frequency of kwords and subkeywords

In [None]:
sentiments.subkword.unique()

In [None]:
# named actors
sentiments[sentiments.subkword.isin(
    ['chater', 'michie', 'halpern', 'sunstein', 'thaler', 'kahneman'])].groupby('subkword').count()

In [None]:
term_freqs.iloc[:, 2:].sum(axis=0).sort_values(ascending=False)

In [None]:
sentiments.groupby('kword').sentence_id.count().sort_values(ascending=False)

In [None]:
term_freqs.drop(['herd_immunity', 'behavioural_fatigue', 'fortnight_starting', 'word_count'], axis=1).sum().sum()