In [1]:
import random
import numpy as np
import pandas as pd
import time
import re
import datetime
import os
import matplotlib.pyplot as plt
import scipy
from scipy import stats
import nltk

In [2]:
from nltk import pos_tag, word_tokenize
from nltk.tokenize import sent_tokenize

In [3]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/paigelee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Load data

#### Penn Treebank

In [4]:
treebankstr = '''CC	Coordinating conjunction
DT	Determiner
EX	Existential there 
IN	Preposition or subordinating conjunction
LS	List item marker
MD	Modal
PDT	Predeterminer
PRP	Personal pronoun
PRP$	Possessive pronoun
RP	Particle
WDT	Wh-determiner
WP	Wh-pronoun
WP$	Possessive wh-pronoun
WRB	Wh-adverb
TO	to
CD	Cardinal number
FW	Foreign word
JJ	Adjective
JJR	Adjective, comparative
JJS	Adjective, superlative
NN	Noun, singular or mass
NNS	Noun, plural
NNP	Proper noun, singular
NNPS	Proper noun, plural
POS	Possessive ending
RB	Adverb
RBR	Adverb, comparative
RBS	Adverb, superlative
SYM	Symbol
UH	Interjection
VB	Verb, base form
VBD	Verb, past tense
VBG	Verb, gerund or present participle
VBN	Verb, past participle
VBP	Verb, non-3rd person singular present
VBZ	Verb, 3rd person singular present'''

In [5]:
pos2str = dict()
for m in treebankstr.split('\n'):
    a,b = m.split('\t')
    pos2str[a]=b

#### Dataframe

In [6]:
df = pd.read_excel('data/Jul21_3379_articles.xlsx',index_col=0)
df = df.reset_index()
df = df.drop_duplicates(subset=['title'])
print(len(df))
df.head()

3379


Unnamed: 0,index,newsOutlet,dateSeen,url,title,language,sourceCountry,text,category,textCharCt
0,0,apnews.com,2021-10-12 07:00:00,https://apnews.com/article/technology-business...,"EU , Ukraine to discuss military training and ...",English,United States,BRUSSELS (AP) — The European Union is consider...,western,3318
1,1,apnews.com,2021-10-12 07:00:00,https://apnews.com/cdfd9b277f6bade5697e0427429...,Blinken reaffirms US support for Ukraine amid ...,English,United States,"KYIV, Ukraine (AP) — U.S. Secretary of State A...",western,6032
2,2,apnews.com,2021-10-11 07:00:00,https://apnews.com/90b50e622cefcbaeba96cb5e716...,"EU , Ukraine to discuss military training and ...",English,United States,BRUSSELS (AP) — The European Union is consider...,western,3318
3,3,apnews.com,2021-10-06 07:00:00,https://apnews.com/0b689a7baaac58603e4d7453aa5...,Ukraine Holocaust center names Nazi Babi Yar k...,English,United States,"KYIV, Ukraine (AP) — The presidents of Ukraine...",western,3646
4,4,apnews.com,2021-10-13 07:00:00,https://apnews.com/89d2455274aad5fe166445a5f82...,Ukraine separatist areas overwhelmed by soarin...,English,United States,"KYIV, Ukraine (AP) — The Russia-backed separat...",western,2184


In [7]:
records = df.to_dict('records')

### Initialize StanfordPOSTagger

In [8]:
# from nltk import StanfordTagger
# from nltk.tag import StanfordPOSTagger

In [9]:
# model_filename = 'stanford-postagger-full-2020-11-17/models/english-bidirectional-distsim.tagger'
# path_to_jar = 'stanford-postagger-full-2020-11-17/stanford-postagger.jar'

In [10]:
# st = StanfordPOSTagger(model_filename=model_filename,path_to_jar=path_to_jar)

In [11]:
# text_tok = nltk.word_tokenize('What is the airspeed of an unladen swallow ?')

In [12]:
# st.tag(text_tok)

## Parse dataframe text

In [13]:
###
tokenCount = 0
for rec in records:
    tokenCount += len(word_tokenize(rec['text']))
print(tokenCount, 'total tokens in dataframe.')

1799389 total tokens in dataframe.


## Function words

In [20]:
function_str = '''CC
DT
EX
IN
LS
MD
PDT
PRP
PRP$
RP
WDT
WP
WP$
WRB
TO'''
function_pos = [f.strip() for f in function_str.split('\n')]

In [21]:
%%time
# get function words used above a certain total threshold
threshold = 5

fwCts = dict()
for recIdx, rec in enumerate(records):
    if recIdx % 500 == 0:
        print(f'{recIdx}/{len(records)} recs parsed with NLTK.')
    tokenizedTaggedText = pos_tag(word_tokenize(rec['text']))
    for token, pos in tokenizedTaggedText:        
        token = token.lower()
        if not token.isalpha():
            continue
        if pos in function_pos:
            fwCts.setdefault(token, {'western' : 0, 'russian' : 0})
            fwCts[token][rec['category']] += 1
to_delete = []
for fw in fwCts:
    if fwCts[fw]['western'] + fwCts[fw]['russian'] < 25:
        to_delete.append(fw)
for td in to_delete:
    del fwCts[td]

0/3379 recs parsed with NLTK.
500/3379 recs parsed with NLTK.
1000/3379 recs parsed with NLTK.
1500/3379 recs parsed with NLTK.
2000/3379 recs parsed with NLTK.
2500/3379 recs parsed with NLTK.
3000/3379 recs parsed with NLTK.
CPU times: user 50.1 s, sys: 154 ms, total: 50.3 s
Wall time: 50.3 s


In [22]:
fwCts.keys()

dict_keys(['the', 'a', 'to', 'amid', 'between', 'and', 'its', 'on', 'from', 'for', 'with', 'will', 'during', 'in', 'of', 'how', 'could', 'which', 'since', 'this', 'his', 'against', 'what', 'it', 'all', 'those', 'or', 'where', 'than', 'after', 'near', 'at', 'they', 'their', 'both', 'that', 'we', 'an', 'around', 'by', 'while', 'out', 'within', 'who', 'before', 'any', 'no', 'he', 'about', 'as', 'these', 'some', 'but', 'back', 'if', 'i', 'can', 'you', 'there', 'would', 'over', 'whether', 'should', 'why', 'when', 'under', 'down', 'us', 'despite', 'them', 'except', 's', 'so', 'because', 'whose', 'our', 'him', 'without', 'off', 'she', 'across', 'along', 'into', 'next', 'onto', 'though', 'although', 'such', 'himself', 'through', 'another', 'themselves', 'up', 'among', 'until', 'unlike', 'her', 'unless', 'toward', 'must', 'per', 'via', 'may', 'whatever', 'behind', 'either', 'every', 'might', 'your', 'each', 'like', 'ourselves', 'throughout', 'whom', 'me', 'my', 'once', 'inside', 'yet', 'itself'

In [23]:
# process texts
def tm_tokenize(txt):
    word_corrections = [
    # ('said',''),
    ('Reuters',''),
    ('REUTERS',''),
    ('Thomson',''),
    ('AP', ''),
    ('Sputnik',''),
    ('TASS',''),
    ('BBC',''),
    ('Kiev ','Kyiv '),
    ('Lugansk','Luhansk'),
    ('Donbas','Donbass'),
    ('U.S.','US'),
    ('United States','US'),
    ('Aleksandr', 'Alexander')
    ]
    takeoutif = [
        'This story has been corrected to show',
        'contributed to this report',
        'All rights reserved',
        'not responsible for the material quoted in these press',
        'Follow all AP stories',
        'https',
        'dedicated page',
        'Follow AP’s coverage',
        '©',
        'This video can not be played',
        '© 2022 Reuters.'
        'contributed reporting',
        'Please include your name, age and location with any submission.',
        'See here for',
        'contributed.',
        'reported from',
        'reaching billions of people worldwide every day',
        'quotes delayed a minimum',
        'contributed to this story.',
        'pic.twitter.com',
        'radio@sputniknews.com',
        'Screen for heightened risk individual and entities globally to help uncover hidden risks in business relationships and human networks.',
        'Browse an unrivalled portfolio of',
        'Access unmatched financial',
        'Reuters',
        'Build the strongest argument relying on',
        'The most comprehensive solution to manage all',
        'Access unmatched financial data',
        'The industry leader for online information for tax'
        ]
    for a, b in word_corrections:
        txt = txt.replace(a, b)
    for string in takeoutif:
        if string in txt:
            return None
    if not txt[0].isalpha() and txt[0] not in ['"',"'",'“']:
        return None
    if len(txt) <= 50:
        return None
    return txt

In [42]:
%%time
# function word spread across W and R

funcWordsPerSent = []
allTokenCt = dict()
allTokenCt['russian'] = 0
allTokenCt['western'] = 0

for recIdx, rec in enumerate(records):
    if recIdx % 500 == 0:
        print(f'{recIdx}/{len(records)} recs parsed with NLTK.')
    sents = sent_tokenize(rec['text'])
    for text in sents:
        text = tm_tokenize(text)
        if text == None:
            continue
        tokenizedTaggedText = pos_tag(word_tokenize(text))
        fwinSent = set()
        funcWordCt = 0
        ctDict = dict()
        for fw in fwCts:
            ctDict.setdefault(fw, 0)
        for token, pos in tokenizedTaggedText:
            allTokenCt[rec['category']] += 1
            token = token.lower()
            if token in fwCts:
                funcWordCt += 1
                fwinSent.add(token)
                ctDict[token] += 1
        recDict = {
            'category' : rec['category'],
            'percentFunction' : funcWordCt/len(tokenizedTaggedText),
            'funcTokens' : list(fwinSent),
            'sentTokens' : tokenizedTaggedText,
            'sentText' : text
        }
        for fw in fwCts:
            recDict[fw] = ctDict[fw]/len(tokenizedTaggedText)
            
        funcWordsPerSent.append(recDict)
fwDf = pd.DataFrame.from_records(funcWordsPerSent)
fwDf.head()

0/3379 recs parsed with NLTK.
500/3379 recs parsed with NLTK.
1000/3379 recs parsed with NLTK.
1500/3379 recs parsed with NLTK.
2000/3379 recs parsed with NLTK.
2500/3379 recs parsed with NLTK.
3000/3379 recs parsed with NLTK.
CPU times: user 1min 9s, sys: 2.47 s, total: 1min 12s
Wall time: 1min 14s


Unnamed: 0,category,percentFunction,funcTokens,sentTokens,sentText,the,a,to,amid,between,...,towards,neither,above,half,plus,away,alongside,shall,wo,ca
0,western,0.241379,"[its, a, and, between, amid, the, to]","[(BRUSSELS, NNP), ((, (), (), )), (—, VBZ), (T...",BRUSSELS () — The European Union is considerin...,0.034483,0.034483,0.034483,0.034483,0.034483,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,western,0.333333,"[on, a, from, for, the, with, to]","[(Acting, VBG), (on, IN), (a, DT), (request, N...",Acting on a request from Ukraine for help with...,0.074074,0.074074,0.037037,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,western,0.4,"[will, on, in, a, of, during, the, with]","[(The, DT), (topic, NN), (will, MD), (be, VB),...",The topic will be discussed during a summit Tu...,0.08,0.04,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,western,0.454545,"[in, how, of, yet, could, the, to]","[(The, DT), (results, NNS), (of, IN), (the, DT...",The results of the mission have yet to be anal...,0.136364,0.0,0.090909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,western,0.375,"[will, s, in, and, back, for, the, which, to]","[(One, CD), (official, NN), (said, VBD), (the,...",One official said the EU’s political and secur...,0.09375,0.0,0.03125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
allTokenCt

{'russian': 915736, 'western': 746780}

In [47]:
915736 - 746780

168956

In [45]:
# t-test in differences of function word frequency
# 2 t-test
for funcToken in sorted(list(fwCts.keys())):
    westernPOSdf = fwDf[fwDf['category'] == 'western'][funcToken]
    russianPOSdf = fwDf[fwDf['category'] == 'russian'][funcToken]

    coeff = scipy.stats.ttest_ind(np.array(westernPOSdf), np.array(russianPOSdf), axis=0)
    if coeff.pvalue < .01:
    # if True:
        print(funcToken)
        print('western mean/median', westernPOSdf.mean(),westernPOSdf.median())
        print('russian mean/median', russianPOSdf.mean(),russianPOSdf.median())
        print('p-value', coeff.pvalue)
        print('t-statistic', coeff.statistic)
        print()

a
western mean/median 0.018990850330677104 0.0
russian mean/median 0.013757725613512546 0.0
p-value 8.72381989257547e-143
t-statistic 25.51669924702577

across
western mean/median 0.0002631760043879862 0.0
russian mean/median 9.079210594300266e-05 0.0
p-value 1.3124045162144377e-15
t-statistic 7.995779061448345

after
western mean/median 0.0017818895122195726 0.0
russian mean/median 0.0010401601601166264 0.0
p-value 2.9723441446471693e-34
t-statistic 12.211944727200718

against
western mean/median 0.0011676832733672913 0.0
russian mean/median 0.0016397786600950942 0.0
p-value 1.9659856522572611e-13
t-statistic -7.352924736501121

all
western mean/median 0.0011595439892439217 0.0
russian mean/median 0.0013271947530772055 0.0
p-value 0.008577811071266008
t-statistic -2.628531827536733

and
western mean/median 0.021366907618023855 0.0
russian mean/median 0.022140690041414113 0.017543859649122806
p-value 0.0005415811201702623
t-statistic -3.4595035971420893

around
western mean/median 0.00

## All Penn Treebank POS

In [98]:
%%time
# get function words used above a certain total threshold
threshold = 25

counts = dict()
for recIdx, rec in enumerate(records):
    if recIdx % 500 == 0:
        print(f'{recIdx}/{len(records)} recs parsed with NLTK.')
        
    sents = sent_tokenize(rec['text'])
    for text in sents:
        text = tm_tokenize(text)
        if text == None:
            continue
        tokenizedTaggedText = pos_tag(word_tokenize(text))
        for token, pos in tokenizedTaggedText:        
            token = token.lower()
            if not token.isalpha():
                continue
            counts.setdefault(pos, dict())
            counts[pos].setdefault(token, {'western' : 0, 'russian' : 0})
            counts[pos][token][rec['category']] += 1
to_delete = []
for pos in counts:
    for token in counts[pos]:
        if counts[pos][token]['western'] + counts[pos][token]['russian'] < threshold:
            to_delete.append((pos, token))
for dpos, dtoken in to_delete:
    del counts[dpos][dtoken]

0/3379 recs parsed with NLTK.
500/3379 recs parsed with NLTK.
1000/3379 recs parsed with NLTK.
1500/3379 recs parsed with NLTK.
2000/3379 recs parsed with NLTK.
2500/3379 recs parsed with NLTK.
3000/3379 recs parsed with NLTK.
CPU times: user 53.7 s, sys: 689 ms, total: 54.4 s
Wall time: 55.1 s


In [103]:
%%time
# function word spread across W and R

posCatPerSent = []

for recIdx, rec in enumerate(records):
    if recIdx % 500 == 0:
        print(f'{recIdx}/{len(records)} recs parsed with NLTK.')
        
    sents = sent_tokenize(rec['text'])
    for text in sents:
        text = tm_tokenize(text)
        if text == None:
            continue
        tokenizedTaggedText = pos_tag(word_tokenize(text))
        ctDict = dict()
        for posType in counts:
            ctDict.setdefault(posType, 0)
        for token, pos in tokenizedTaggedText:
            token = token.lower()
            if not token.isalpha():
                continue
            if token in counts[pos]:
                ctDict[pos] += 1
        recDict = {
            'category' : rec['category'],
            'sentTokens' : tokenizedTaggedText,
            'sentText' : text
        }
        for posType in counts:
            recDict[posType] = ctDict[posType]/len(tokenizedTaggedText)
            
        posCatPerSent.append(recDict)
posDf = pd.DataFrame.from_records(posCatPerSent)
posDf.head()

0/3379 recs parsed with NLTK.
500/3379 recs parsed with NLTK.
1000/3379 recs parsed with NLTK.
1500/3379 recs parsed with NLTK.
2000/3379 recs parsed with NLTK.
2500/3379 recs parsed with NLTK.
3000/3379 recs parsed with NLTK.
CPU times: user 57.9 s, sys: 8.85 s, total: 1min 6s
Wall time: 2min 2s


Unnamed: 0,category,sentTokens,sentText,NNP,DT,VBZ,VBG,JJ,NN,TO,...,RBS,NNPS,JJS,PDT,WP$,FW,POS,UH,$,SYM
0,western,"[(BRUSSELS, NNP), ((, (), (), )), (—, VBZ), (T...",BRUSSELS () — The European Union is considerin...,0.206897,0.068966,0.034483,0.068966,0.034483,0.103448,0.034483,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,western,"[(Acting, VBG), (on, IN), (a, DT), (request, N...",Acting on a request from Ukraine for help with...,0.074074,0.148148,0.0,0.037037,0.111111,0.222222,0.037037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,western,"[(The, DT), (topic, NN), (will, MD), (be, VB),...",The topic will be discussed during a summit Tu...,0.12,0.12,0.0,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,western,"[(The, DT), (results, NNS), (of, IN), (the, DT...",The results of the mission have yet to be anal...,0.0,0.136364,0.0,0.0,0.0,0.181818,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,western,"[(One, CD), (official, NN), (said, VBD), (the,...",One official said the EU’s political and secur...,0.0625,0.09375,0.03125,0.0,0.125,0.1875,0.03125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Run t-test on percent POS in sentences

In [114]:
# 2 t-test
for posType in sorted(list(counts.keys())):
    westernPOSdf = posDf[posDf['category'] == 'western'][posType]
    russianPOSdf = posDf[posDf['category'] == 'russian'][posType]

    coeff = scipy.stats.ttest_ind(np.array(westernPOSdf), np.array(russianPOSdf), axis=0)
    if coeff.pvalue < .05:
        print(pos2str[posType])
        print('western mean/median', westernPOSdf.mean(),westernPOSdf.median())
        print('russian mean/median', russianPOSdf.mean(),russianPOSdf.median())
        print('p-value', coeff.pvalue)
        print('t-statistic', coeff.statistic)
        print()

Coordinating conjunction
western mean/median 0.026841754446511608 0.024390243902439025
russian mean/median 0.02520580988624096 0.023809523809523808
p-value 1.1139448470807881e-11
t-statistic 6.792405742023669

Cardinal number
western mean/median 0.004178703254552122 0.0
russian mean/median 0.0026837135513907424 0.0
p-value 3.870810810270129e-43
t-statistic 13.781753081943485

Determiner
western mean/median 0.08315524550550957 0.08
russian mean/median 0.09377784787170222 0.09090909090909091
p-value 3.7404121044567827e-149
t-statistic -26.09093059531603

Preposition or subordinating conjunction
western mean/median 0.11114691380696354 0.1111111111111111
russian mean/median 0.11740974841495727 0.11764705882352941
p-value 1.5716440298164395e-45
t-statistic -14.17508662203616

Adjective, comparative
western mean/median 0.002518202587299472 0.0
russian mean/median 0.0010489112549767088 0.0
p-value 4.756391983865838e-88
t-statistic 19.928029891958847

Adjective, superlative
western mean/median