In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
from scipy import stats

In [None]:
# Load the Pew survey dataset.
pew = pd.read_csv('pew.csv')

# Answers that do not express an opinion; they are turned into missing values.
Nan_keywords = ['Refused', 'Don’t know', 'Don’t know (DO NOT READ)', 'Refused (DO NOT READ)',
       '(VOL) Refused', "(VOL)\xa0Don't know"]

# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
pew = pew.replace(to_replace=Nan_keywords, value=np.nan)

# Map the favourability answers onto a symmetric numeric scale in [-1, 1].
# The replacement is applied to every column of the frame.
fav_dict = {'Somewhat favorable': 0.5, 'Very favorable':1,
       'Somewhat unfavorable':-0.5, 'Very unfavorable':-1}

pew = pew.replace(fav_dict)

# Countries of interest. Two entries ('Netherlands', 'United States') differ from
# the Wikidata labels used for the Quotebank data; presumably these are the
# spellings found in pew["country"] - TODO confirm against the CSV.
country_list_pew = ['Argentina', 'Australia', 'Brazil', 'Canada', 'France', 'Germany', 'Greece', 'Hungary', 'India',
                'Indonesia', 'Israel', 'Italy', 'Japan', 'Kenya', 'Lebanon', 'Mexico', 'Netherlands', 'Nigeria',
                'Philippines', 'Poland', 'Russia', 'South Africa', 'South Korea', 'Spain', 'Sweden', 'Turkey',
                'Ukraine', 'United Kingdom', 'United States']

In [None]:
def drop_none_speaker(df):
    """Return a copy of ``df`` without the rows whose speaker is the string "None".

    Rows are selected with a boolean mask rather than by index label, so the
    result is correct even when ``df`` has repeated index labels (for example
    after concatenating several frames without resetting the index). Dropping
    by label would also remove attributed quotes that merely share a label
    with an unattributed one.

    Args:
        df (pd.DataFrame): frame with a ``speaker`` column.

    Returns:
        pd.DataFrame: the rows of ``df`` whose speaker is not "None"; ``df``
        itself is left untouched.
    """
    is_unattributed = df.speaker == "None"
    df_dropped = df[~is_unattributed]
    print("{} quotes are removed".format(int(is_unattributed.sum())))
    return df_dropped

# Load one pickled frame per year and concatenate them once at the end.
# Concatenating inside the loop copies the growing frame on every iteration.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
yearly_frames = []
for year in range(2015, 2021):
    print(year)
    df1 = pd.read_pickle('us_{}media.pkl.bz2'.format(year), compression='bz2')
    yearly_frames.append(df1)

# ignore_index=True gives every quote a unique row label; without it the labels
# of the yearly frames are kept as-is and can repeat across years, which makes
# label-based operations ambiguous.
df = pd.concat(yearly_frames, ignore_index=True)

df = drop_none_speaker(df)

# Independent copy, so that columns added to US_data later do not appear on df.
US_data = df.copy()


In [None]:
# Only the speaker id and nationality are used below, so only those two columns
# are read from the parquet file instead of loading everything and slicing.
speaker_df = pd.read_parquet("speaker_attributes.parquet", columns=['id', 'nationality'])

# Each quote carries a sequence of candidate ids in `qids`; the first one is
# taken as the speaker's id.
US_data["speaker_id"] = US_data.qids.map(lambda x: x[0])

# Join US_data with the corresponding speakers' attributes. The inner join
# drops quotes whose speaker_id has no match in speaker_df.
US_merged = pd.merge(US_data, speaker_df, left_on=['speaker_id'], right_on=['id'], how='inner')


In [None]:
# SPARQL endpoint that the query below is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# SPARQL query selecting every ?country whose wdt:P31 is wd:Q3624078, together
# with its English label, ordered by label. The QID of each country is later
# extracted from the returned ?country value.
# NOTE(review): the header comment inside the query text mentions capitals, but
# only ?country and ?countryLabel are selected.
query = """#List of present-day countries and capital(s)
SELECT DISTINCT ?country ?countryLabel
WHERE
{
  ?country wdt:P31 wd:Q3624078 .

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ?countryLabel"""

def get_results(endpoint_url, query):
    """Run a SPARQL query against an endpoint and return the converted JSON result.

    Args:
        endpoint_url (str): URL of the SPARQL endpoint to query.
        query (str): SPARQL query text.

    Returns:
        The value of ``sparql.query().convert()`` with the return format set
        to JSON.
    """
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


# Send the country query to the endpoint.
results = get_results(endpoint_url, query)

# Country dictionary {QID: country name}. The QID is the text after the last
# '/' of each binding's "country" value.
country_dict = {
    binding["country"]["value"].rsplit("/", 1)[-1]: binding["countryLabel"]["value"]
    for binding in results["results"]["bindings"]
}

In [None]:
# Give every (quote, nationality) pair its own row, then translate the
# nationality ids into country names through country_dict. Ids that are not
# keys of country_dict become NaN.
US_with_nation = US_merged.explode('nationality').assign(
    nationality=lambda exploded: exploded["nationality"].map(country_dict)
)


In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
# The VADER lexicon has to be available before the analyzer is created.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the "compound" polarity score that `sia` assigns to `text`."""
    return sia.polarity_scores(text)["compound"]


# Sentiment of each quotation, in the range [-1, 1].
US_with_nation["sentiment"] = US_with_nation["quotation"].apply(_compound_score)

In [None]:
# Four-digit year of each quote, stored as a string (e.g. '2015').
US_with_nation["year"] = US_with_nation["date"].dt.strftime("%Y")

In [None]:
# Countries to analyse, spelled as in US_with_nation["nationality"] (the labels
# coming from country_dict).
country_list = ['Argentina', 'Australia', 'Brazil', 'Canada', 'France', 'Germany', 'Greece', 'Hungary', 'India',
                'Indonesia', 'Israel', 'Italy', 'Japan', 'Kenya', 'Lebanon', 'Mexico', 'Kingdom of the Netherlands', 'Nigeria',
                'Philippines', 'Poland', 'Russia', 'South Africa', 'South Korea', 'Spain', 'Sweden', 'Turkey',
                'Ukraine', 'United Kingdom', 'United States of America']

# country_list_pew holds the same countries in the same order, but two of them
# are spelled differently ('Netherlands', 'United States'). Filtering the Pew
# data with the labels above would therefore find no rows for those countries,
# so every label is paired with the name at the same position in
# country_list_pew.
# NOTE(review): assumes country_list_pew matches the spellings in pew["country"].
assert len(country_list) == len(country_list_pew), "country lists must be aligned"
pew_name_by_country = dict(zip(country_list, country_list_pew))

# Quotebank histogram: 50 equal-width bins over the sentiment range [-1, 1].
num_bin_quotebank = 50
bin_lims_quotebank = np.linspace(-1,1,num_bin_quotebank+1)
bin_centers_quotebank = 0.5*(bin_lims_quotebank[:-1]+bin_lims_quotebank[1:])
bin_widths_quotebank = bin_lims_quotebank[1:]-bin_lims_quotebank[:-1]

# Not used in this cell; kept in case other cells refer to them.
num_bin = 4
bin_lims = np.linspace(0,1,num_bin+1)

# Pew bar chart: one group of two side-by-side bars per answer level.
pew_levels = [-1.0, -0.5, 0.5, 1.0]
pew_level_labels = ['very unpopular', 'unpopular', 'popular', 'very popular']
pew_group_centers = [0.0, 0.5, 1.0, 1.5]
bin_widths = [0.125, 0.125, 0.125, 0.125]
bin_centers_obama = [center - 0.0625 for center in pew_group_centers]
bin_centers_trump = [center + 0.0625 for center in pew_group_centers]

# The two periods that are compared.
obama_years = [2015, 2016]
trump_years = [2019, 2020]
obama_label = 'Obama era (2015-2016)'
trump_label = 'Trump era (2019-2020)'


def _as_percentages(counts):
    """Scale counts so they sum to 100; return all zeros when there are no observations."""
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    if total == 0:
        return np.zeros_like(counts)
    return counts / total * 100


def _mean_median_text(values):
    """Format the mean and median of a Series for use in a subplot title."""
    return 'mean: ' + str(round(values.mean(), 3)) + '  median: ' + str(round(values.median(), 3))


def _pew_favourability(pew_country, years):
    """Return the non-missing numeric `fav_us` answers of one country for the given years."""
    selected = (pew["country"] == pew_country) & pew["year"].isin(years)
    return pd.to_numeric(pew.loc[selected, 'fav_us']).dropna()


def _quotebank_sentiment(country, years):
    """Return the sentiment of the quotes by speakers of `country` in the given years."""
    year_labels = [str(year) for year in years]  # the "year" column holds strings
    selected = (US_with_nation["nationality"] == country) & US_with_nation["year"].isin(year_labels)
    return US_with_nation.loc[selected, "sentiment"]


def _pew_level_percentages(values):
    """Return the share (in %) of each answer level in `pew_levels`.

    Counting the levels explicitly stays correct when a level is absent from
    the sample; picking fixed bins out of a default 10-bin histogram is only
    correct when the sample spans the full range from -1 to 1.
    """
    counts = values.value_counts().reindex(pew_levels, fill_value=0)
    return _as_percentages(counts)


for country in country_list:
    print(country)

    # ---- Pew survey ----
    pew_country = pew_name_by_country[country]
    fav_obama_pew = _pew_favourability(pew_country, obama_years)
    fav_trump_pew = _pew_favourability(pew_country, trump_years)

    t_statistic_pew, p_value_pew = ttest_ind(fav_obama_pew, fav_trump_pew)

    hist_obama_norm_pew = _pew_level_percentages(fav_obama_pew)
    hist_trump_norm_pew = _pew_level_percentages(fav_trump_pew)

    # ---- Quotebank ----
    obama_sentiment = _quotebank_sentiment(country, obama_years)
    trump_sentiment = _quotebank_sentiment(country, trump_years)
    _, p_value = ttest_ind(obama_sentiment, trump_sentiment)
    print("Country : {},\tObama: {:.2f}\tTrump: {:.2f}".format(country, obama_sentiment.mean(), trump_sentiment.mean()))
    print("P value: {}\n".format(p_value))

    # ---- Figure ----
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(13, 13), dpi=200)

    fig.suptitle('US fav seen by ' + country +
                 '\n' + 'p-value pew: ' + str(round(p_value_pew, 4)) +
                 '\n' + 'p-value quotebank: ' + str(round(p_value, 4)))

    ax1.bar(bin_centers_obama, hist_obama_norm_pew, width=bin_widths, align='center', label=obama_label)
    ax1.bar(bin_centers_trump, hist_trump_norm_pew, width=bin_widths, align='center', alpha=0.5,
            label=trump_label)

    # Fix the tick positions before labelling them, so that each label sits
    # under its own group of bars regardless of the automatic tick locator.
    ax1.set_xticks(pew_group_centers)
    ax1.set_xticklabels(pew_level_labels, rotation=45)
    ax1.set_ylabel('%')
    ax1.set_title('pew' + '\n' +
                  _mean_median_text(fav_obama_pew) + '\n' +
                  _mean_median_text(fav_trump_pew))
    ax1.legend()

    hist1, _ = np.histogram(obama_sentiment, bins=bin_lims_quotebank)
    hist2, _ = np.histogram(trump_sentiment, bins=bin_lims_quotebank)

    ax2.bar(bin_centers_quotebank, _as_percentages(hist1), width=bin_widths_quotebank, align='center',
            label=obama_label)
    ax2.bar(bin_centers_quotebank, _as_percentages(hist2), width=bin_widths_quotebank, align='center',
            alpha=0.5, label=trump_label)

    ax2.set_ylabel('%')
    ax2.set_title('quotebank' + '\n' +
                  _mean_median_text(obama_sentiment) + '\n' +
                  _mean_median_text(trump_sentiment))
    ax2.legend()

    fig.savefig('bar plot ' + country)
    # Close explicitly so the figures do not accumulate in memory across the loop.
    plt.close(fig)