In [1]:
import random
import numpy as np
import pandas as pd
import time
import re
import datetime
import os
import matplotlib.pyplot as plt
import scipy
from scipy import stats

In [2]:
def f(p,n=4):
    return '{0:.{1}f}'.format(p, n)

# LIWC Color Coding HTML

In [38]:
from bs4 import BeautifulSoup

In [35]:
with open('LIWCdata/Color Coding Sputnik.html', 'r') as f:
    htmlfile = f.read()

In [39]:
soup = BeautifulSoup(htmlfile, "html.parser")
body = soup.body

In [81]:
colorDict = {
    '#de911dff' : 'cogproc',
    '#4c82a3ff' : 'motion',
    '#a63d1bff' : 'socbehav',
    '#654b81ff' : 'emo_anx',
    '#30961eff' : 'tentat'
}
cat2words = dict()
cat2count = dict()
spans = body.find_all('span')
for span in spans:
    match = re.search(r"#[a-zA-Z0-9]+", str(span))
    start, end = match.span()
    color = str(span)[start:end]
    cat = colorDict[color]
    cat2words.setdefault(cat,set())
    cat2count.setdefault(cat, 0)
    cat2words[cat].add(span.text.lower())
    cat2count[cat] += 1

In [82]:
cat2count

{'cogproc': 43235,
 'motion': 6878,
 'socbehav': 34521,
 'tentat': 8155,
 'emo_anx': 911}

In [80]:
random.sample(cat2words['tentat'], 20)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(cat2words['tentat'], 20)


['mystery',
 'depends',
 'something',
 'hardly',
 'hope',
 'indirectly',
 'not really',
 "don't really",
 'alleged',
 'uncertainty',
 'may',
 'probably',
 'undecided',
 'marginalising',
 'if',
 'obscures',
 'mysterious',
 'doubting',
 'almost',
 'hopeful']

# LIWC Analysis CSV

In [7]:
df = pd.read_csv('LIWCdata/LIWC-22 Results - sputnik_interfax_USA_prewar_07___ - LIWC Analysis.csv', index_col=0)

In [9]:
df.tail()

Unnamed: 0,newsOutlet,dateSeen,url,title,language,sourceCountry,text,Segment,WC,Analytic,...,assent,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP
35,en.interfax.com.ua,2021-11-10 08:00:00,https://en.interfax.com.ua/news/general/778744...,Kuleba arrives in Washington to discuss Ukrain...,English,Ukraine,Ukrainian Foreign Minister Dmytro Kuleba has a...,1,326,99.0,...,0.0,0,0,18.1,7.67,5.21,0.0,0.0,0.61,4.6
36,en.interfax.com.ua,2021-11-01 07:00:00,https://en.interfax.com.ua/news/general/776826...,"Kubilius : EU , US should be present in format...",English,Ukraine,"The Normandy format (Ukraine, Germany, France,...",1,477,71.11,...,0.0,0,0,10.48,3.98,3.14,0.0,0.0,0.21,3.14
37,en.interfax.com.ua,2022-03-14 07:00:00,https://en.interfax.com.ua/news/general/812609...,Podoliak on murder of NYT journalist : how muc...,English,Ukraine,Adviser to the head of the President's Office ...,1,174,95.85,...,0.0,0,0,13.79,4.6,4.02,0.57,0.0,1.72,2.87
38,en.interfax.com.ua,2021-11-12 08:00:00,https://en.interfax.com.ua/news/general/779213...,US warns Europe that Russia may plan Ukraine i...,English,Ukraine,The U.S. is raising the alarm with European Un...,1,170,94.14,...,0.0,0,0,16.47,7.65,4.12,0.0,0.0,2.94,1.76
39,en.interfax.com.ua,2022-02-18 08:00:00,https://en.interfax.com.ua/news/general/799328...,US President organizes meeting of EU and NATO ...,English,Ukraine,U.S. President Joe Biden is organizing a meeti...,1,142,99.0,...,0.0,0,0,19.01,4.93,11.27,0.0,0.0,0.0,2.82


### independent t-test

The indepentent T-test is a parametric test used to test for a statistically significant difference in the means between 2 groups.

In [13]:
dfIF = df[df['newsOutlet'] == 'en.interfax.com.ua']
dfSP = df[df['newsOutlet'] == 'sputniknews.com']

In [18]:
LIWCcolumns = dfIF.columns[8:].tolist()

In [20]:
allCoeffs = []
for category in LIWCcolumns:
    ifarray = np.array(dfIF[category])
    sparray = np.array(dfSP[category])
    coeff = scipy.stats.ttest_ind(ifarray, sparray, axis=0)
    allCoeffs.append({
        'cat': category,
        'interfaxAvg' : dfIF[category].mean(),
        'sputnikAvg': dfSP[category].mean(),
        'interfaxArray' : ifarray,
        'sputnikArray' : sparray,
        't_statistic': coeff.statistic,
        'pvalue' : coeff.pvalue
    })

In [21]:
df = pd.DataFrame.from_records(allCoeffs)
df.to_excel('LIWCdata/sputnik_interax_preanalysis.xlsx')

#### Statistically significant LWC Categories

In [22]:
mostsig = []
for c in allCoeffs:
    if c['pvalue'] <= .005:
        mostsig.append(c['cat'])
print('Most significant (<= .005):', mostsig)

sig = []
for c in allCoeffs:
    if c['pvalue'] > .005 and c['pvalue'] <= .05:
        sig.append(c['cat'])
print('\nSignificant (<= .05):', sig)
print()

Most significant (<= .005): ['WC', 'Authentic', 'Tone', 'Cognition', 'cogproc', 'tentat', 'differ', 'prosocial', 'motion', 'Period']

Significant (<= .05): ['quantity', 'tone_pos', 'tone_neg', 'emo_anx', 'Social', 'socbehav', 'moral', 'Culture', 'politic', 'reward', 'feeling']



#### Printing news outlet averages of significant LIWC categories

In [34]:
# Average comparisons
print('Most significant'.upper())
for c in allCoeffs:
    if c['pvalue'] <= .005:
        print(f"\t{c['cat']} (t-statistic: {c['t_statistic']}, p-value: {c['pvalue']}):")
        print(f"\t\tAverages: Interfax: {f(dfIF[c['cat']].mean())}\tSputnik: {f(dfSP[c['cat']].mean())}\n")
print('Significant'.upper())
for c in allCoeffs:
    if c['pvalue'] > .005 and c['pvalue'] <= .05:
        print(f"\t{c['cat']} (t-statistic: {c['t_statistic']}, p-value: {c['pvalue']}):")
        print(f"\t\tAverages: Interfax: {f(dfIF[c['cat']].mean())}\tSputnik: {f(dfSP[c['cat']].mean())}\n")

MOST SIGNIFICANT
	WC (t-statistic: -3.720310230919176, p-value: 0.0006412816314693797):
		Averages: Interfax: 195.2000	Sputnik: 474.9000

	Authentic (t-statistic: -3.013106906020835, p-value: 0.00458438138040964):
		Averages: Interfax: 17.1500	Sputnik: 31.2765

	Tone (t-statistic: 3.9185508474936372, p-value: 0.00035930637251162137):
		Averages: Interfax: 39.5070	Sputnik: 14.0830

	Cognition (t-statistic: -3.1827724701409745, p-value: 0.002906702655174733):
		Averages: Interfax: 5.7490	Sputnik: 9.1540

	cogproc (t-statistic: -3.662331659610095, p-value: 0.0007581430805750102):
		Averages: Interfax: 5.0650	Sputnik: 8.6720

	tentat (t-statistic: -4.300979513666212, p-value: 0.00011448876221827072):
		Averages: Interfax: 0.5795	Sputnik: 1.8205

	differ (t-statistic: -3.5582088038644843, p-value: 0.001021507572647197):
		Averages: Interfax: 1.2835	Sputnik: 2.5350

	prosocial (t-statistic: 3.1928211510280877, p-value: 0.002828316897639968):
		Averages: Interfax: 2.2680	Sputnik: 0.5730

	mot

In [26]:
dfIF.columns

Index(['newsOutlet', 'dateSeen', 'url', 'title', 'language', 'sourceCountry',
       'text', 'Segment', 'WC', 'Analytic',
       ...
       'assent', 'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'QMark',
       'Exclam', 'Apostro', 'OtherP'],
      dtype='object', length=125)

In [32]:
print(random.choice(dfIF['text'].tolist()))

The United States intends to support defense reform in Ukraine until it reaches the criteria for NATO membership, U.S. Secretary of Defense Lloyd Austin has said. Austin said at a briefing after the talks with Ukrainian Defense Minister Andriy Taran in Kyiv on Tuesday that following the NATO summit in June 2014, NATO allies pledged support for Ukraine in its intentions to become a full member of NATO. The United States remains unwavering in support of Ukraine's defense reform, which will help achieve the criteria for full NATO membership. He said that during the visit, he had effective negotiations with Taran on a framework agreement signed in Washington in August 2021. "This agreement provides us with a strategic framework for a defense partnership, and we are going to strengthen our strategic partnership," the head of the Ukrainian defense department said. Austin said the United States has a duty to help Ukraine, and U.S. President Joe Biden has noted that on several occasions. Austi

In [33]:
print(random.choice(dfSP['text'].tolist()))

WASHINGTON (Sputnik) - The new US sanctions against Russia, imposed in response to Moscow’s recognition of the breakaway Donetsk and Lugansk people’s republics (DPR and LPR), will affect Ukraine and Europe more than Russia itself, US-based Eurasia Center Executive Vice President Earl Rasmussen told Sputnik. The United States and its European allies imposed new sanctions against Russia after President Vladimir Putin signed decrees recognizing the independence of the DPR and LPR. Under new treaties, Moscow pledged to ensure the security of the two Russian-speaking republics. The sanctions target Russia’s Vnesheconombank (VEB) and the Promsvyazbank Public Joint Stock Company (PSB), the country's sovereign debt, its elites and their families. “Will the new sanctions impact Russia? Yes, to some degree, but Russia will recover and is fairly sanctions proof following the multiple rounds of sanctions they had already received,” Rasmussen said. However, the European and Ukrainian economies will