In this notebook we classify the extracted keywords into meaningful categories. We do this by hand-labelling a small subset of keywords (the 50 most frequent keywords of each year) and bootstrapping labels for the full keyword set from them.

The idea is as follows: each quotation has 5 keywords. We hand-label the top 50 keywords and assume that all keywords extracted from the same quotation as a hand-labelled keyword belong to the same topic. Keywords that occur only once in a year's dataset and do not co-occur with one of the top 50 keywords are labelled as outliers. We then train a PCA + RandomForestClassifier pipeline on the labelled keywords to predict the labels. Finally, we predict on all keywords and take the predictions as the topic labels.

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import spacy 
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def classify_keywords(df, topics, others=None, random_state=None):
    """Bootstrap a topic label for every unique keyword found in ``df.keywords``.

    Each quotation is labelled from the overlap of its keywords with the
    hand-labelled ``topics`` groups; that label is propagated to all keywords
    of the quotation, and a PCA + random-forest pipeline trained on the
    labelled keywords' word vectors then predicts a label for every keyword.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``keywords`` column whose entries are iterables of
        tuples; the first element of each tuple is used as the keyword string.
        NOTE(review): the remaining tuple elements are ignored here.
    topics : sequence of sets of str
        Hand-labelled keyword groups. Group ``k`` (0-based) maps to label ``k + 1``.
    others : set of str, optional
        Keywords marking a quotation as an explicit outlier (label ``-1``).
        A unique best-matching topic still takes precedence over this mark,
        and a tie between topics resets the quotation to ``0`` (unknown).
    random_state : int or None, optional
        Forwarded to ``PCA`` and ``RandomForestClassifier``. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``keyword`` (unique keywords in sorted order) and
        ``keyword_label`` (predicted label).

    Raises
    ------
    ValueError
        If no keyword received a non-zero label, so there is nothing to train on.
    """
    # Keyword strings of every quotation, in the original order.
    per_quote = [[kw[0] for kw in entry] for entry in df.keywords]
    flat_keywords = np.array([kw for quote in per_quote for kw in quote], dtype=object)

    quote_labels = np.zeros(len(per_quote))

    print('Generating labels')

    for i, quote in tqdm(enumerate(per_quote), total=len(per_quote)):
        quote_set = set(quote)

        # Label explicit outliers (may be overridden by a topic match below)
        if others and quote_set.intersection(others):
            quote_labels[i] = -1

        # Size of the overlap with each pre-labelled topic group
        overlaps = np.array([len(quote_set.intersection(group)) for group in topics])
        if overlaps.size == 0:
            continue

        best = overlaps.max()
        # Only a strictly positive, unique maximum identifies a topic. Without the
        # `best > 0` guard a single-group `topics` list would label every
        # non-matching quotation as topic 1.
        if best > 0 and (overlaps == best).sum() == 1:
            quote_labels[i] = np.argmax(overlaps) + 1
        # Several topics tie: exclude from training and let the classifier decide
        elif (overlaps > 0).sum() > 1:
            quote_labels[i] = 0

    # Propagate each quotation label to its keywords. Repeat by the actual number
    # of keywords per quotation so labels stay aligned with `flat_keywords` even
    # when a quotation does not have exactly 5 keywords.
    labels = np.repeat(quote_labels, np.array([len(quote) for quote in per_quote], dtype=int))

    # Keep unique keywords (sorted); each takes the label of its first occurrence
    unique_keywords, first_idx, occurrences = np.unique(
        flat_keywords, return_index=True, return_counts=True)
    labels = labels[first_idx]

    # Keywords seen only once and not tied to any topic are treated as outliers
    print('Generating outlier labels')
    labels[(occurrences == 1) & (labels == 0)] = -1

    # Disable spacy functionality that is not needed for the vectors
    dis = ['ner', 'tagger', 'parser', 'entity_linker', 'entity_ruler',
           'textcat', 'textcat_multilabel', 'lemmatizer', 'morphologizer',
           'attribute_ruler', 'senter', 'sentencizer']

    # GPU selection has to happen before the pipeline is loaded; prefer_gpu()
    # falls back to CPU instead of raising when no GPU is available.
    spacy.prefer_gpu()
    nlp = spacy.load('en_core_web_md')

    # Rows are allocated as 300-wide vectors, as in the original code
    embeddings = np.zeros([len(unique_keywords), 300])

    # Generate keyword embeddings
    print('Generating word embeddings')
    docs = nlp.pipe(unique_keywords.tolist(), batch_size=600, disable=dis)
    for i, doc in tqdm(enumerate(docs), total=len(unique_keywords)):
        vector = doc.vector
        # Non-numpy vectors are converted with .get() (e.g. GPU arrays)
        embeddings[i] = vector if isinstance(vector, np.ndarray) else vector.get()

    # Train on non-zero labels (0 labels are unknown)
    known = labels != 0
    X_train = embeddings[known]
    y_train = labels[known]
    if len(y_train) == 0:
        raise ValueError('No labelled keywords to train on: check `topics` and `others`.')

    print('Training classifier')
    # PCA cannot extract more components than there are samples or features
    n_components = min(50, X_train.shape[0], X_train.shape[1])
    pipe = Pipeline([
        ('pca', PCA(n_components=n_components, random_state=random_state)),
        ('random forest', RandomForestClassifier(n_jobs=-1, random_state=random_state)),
    ])
    pipe.fit(X_train, y_train)

    # Predict on all embeddings
    preds = pipe.predict(embeddings)

    return pd.DataFrame(np.column_stack((unique_keywords, preds)),
                        columns=['keyword', 'keyword_label'])

# 2020

In [3]:
# Hand-labelled seed keywords for 2020, one set per topic.
trade = {
    'china trade',
    'trade deal',
    'china deal',
}

covid = {
    'covid 19', 'coronavirus outbreak', 'china coronavirus', 'china virus',
    'coronavirus cases', 'chinese virus', 'new coronavirus', 'death toll',
    'new virus', 'travel ban', 'coronavirus fears', 'travel china',
    'coronavirus death', 'coronavirus patients', 'coronavirus pandemic',
    'virus outbreak', 'flights china', 'world health', 'public health',
    'spread coronavirus', 'amid coronavirus', 'tested coronavirus',
    'safe coronavirus', 'coronavirus crisis', 'virus china', 'patients uk',
    'receiving treatment', 'response coronavirus', 'cases coronavirus',
    'novel coronavirus',
}

yuan = {'currency manipulator', 'china currency'}

huawei = {'huawei 5g', '5g role', 'huawei decision'}

# The position in this list determines the label: trade=1, covid=2, yuan=3, huawei=4.
groups1 = [trade, covid, yuan, huawei]
# No explicit outlier keywords for 2020.
others1 = None

In [4]:
# Location of the pre-processed 2020 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2020.json.bz2'
quote_df = pd.read_json(path_or_buf=path_to_out)

In [5]:
# Bootstrap labels for all 2020 keywords from the seed groups.
res = classify_keywords(df=quote_df, topics=groups1, others=others1)

Generating labels


  0%|          | 0/146753 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/134175 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/134175 [00:00<?, ?it/s]

Training classifier


In [6]:
# Persist the 2020 keyword -> label table.
res.to_csv(path_or_buf='C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2020.csv')

In [7]:
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

                 keyword keyword_label
24795      clamour grows          -1.0
75391    market billings          -1.0
75779     markets soared          -1.0
58327      heat jennifer          -1.0
19469  challenging japan          -1.0
                       keyword keyword_label
20578              china began           2.0
86123       outbreaks previous           2.0
22860            china textile           2.0
25325            closest loved           2.0
43394  emergencies ineffective           2.0
                        keyword keyword_label
79458           monitor signing           1.0
97845          real enforcement           1.0
100085  representatives chinese           1.0
21032          china delivering           1.0
40873               doubt trump           1.0
                         keyword keyword_label
128679               wake huawei           4.0
129423         warns information           4.0
123268  trustworthiness provider           4.0
60938              huawei result

# 2019

In [8]:
# Location of the pre-processed 2019 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2019.json.bz2'
quote_df = pd.read_json(path_or_buf=path_to_out)

In [9]:
# Hand-labelled seed keywords for 2019, one set per topic.
hk = {'hong kong', 'kong protesters', 'kong protests', 'stand hong'}
# 'china tariff' previously carried a trailing space ('china tariff '), which
# presumably never matched an extracted keyword.
trade = {'china trade', 'trade talks', 'trade war', 'deal china',
         'trade deal', 'china tariffs', 'tariffs china',
         'trump trade', 'china deal', 'talks china',
         'chinese goods', 'china talks', 'deal trump',
         'war china', 'trade tensions', 'tariffs chinese',
         'tariff hike', 'new tariffs', 'trump tariffs', 'china tariff'}
climate = {'climate change'}
china_sea = {'china sea'}
north_korea = {'north korea'}
human_rights = {'human rights'}
# No explicit outlier keywords for 2019.
others2 = None
# The position in this list determines the label (hk=1, ..., human_rights=6).
groups2 = [hk, trade, climate, china_sea, north_korea, human_rights]

res = classify_keywords(quote_df, groups2, others2)

Generating labels


  0%|          | 0/444118 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/350023 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/350023 [00:00<?, ?it/s]

Training classifier


In [10]:
# Persist the 2019 keyword -> label table.
res.to_csv(path_or_buf='C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2019.csv')

In [11]:
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

                   keyword keyword_label
9411         agrees russia          -1.0
5887    activists speaking          -1.0
49636           cdl pietro          -1.0
168352   italy netherlands          -1.0
219137             open ap          -1.0
                     keyword keyword_label
119611  farmers bankruptcies           2.0
198307       members liberal           2.0
278517            shift asia           2.0
137541       going sacrifice           2.0
145206            harbor ben           2.0
                 keyword keyword_label
202966     mistake trump           1.0
139980     govt hongkong           1.0
58064     china rejected           1.0
121966   feel threatened           1.0
110169  environment hong           1.0
                   keyword keyword_label
131066  fundamental rights           6.0
295563     subjects linked           6.0
235759    pompeo violation           6.0
56470      china including           6.0
14140      americans right           6.0
                

# 2018

In [12]:
# Location of the pre-processed 2018 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2018.json.bz2'
quote_df = pd.read_json(path_or_buf=path_to_out)

In [13]:
# Hand-labelled seed keywords for 2018, one set per topic.
trade = {
    'china trade', 'trade war', 'deal china', 'trade talks', 'trade deal',
    'trump trade', 'china tariffs', 'tariffs china', 'war china',
    'trump tariffs', 'trade tensions', 'new tariffs', 'trade dispute',
    'chinese goods', 'trade china', 'tariffs chinese', 'trade deficit',
    'trade wars', 'trade truce', 'global trade', 'intellectual property',
}
huawei = {'huawei cfo', 'huawei executive'}
north_korea = {'north korea'}
china_sea = {'china sea'}
climate = {'climate change'}

# No explicit outlier keywords for 2018.
others3 = None
# The position in this list determines the label (trade=1, ..., climate=5).
groups3 = [trade, huawei, north_korea, china_sea, climate]

res = classify_keywords(df=quote_df, topics=groups3, others=others3)

Generating labels


  0%|          | 0/272829 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/327933 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/327933 [00:00<?, ?it/s]

Training classifier


In [14]:
# Persist the 2018 keyword -> label table.
res.to_csv(path_or_buf='C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2018.csv')

In [15]:
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

                 keyword keyword_label
44883           cars run          -1.0
144044  illegal machines          -1.0
174304        logo issue          -1.0
285297     tariffs steel          -1.0
64089   commissioner jay          -1.0
                keyword keyword_label
301325      trump react           1.0
19740      assess trade           1.0
71839     cost nebraska           1.0
314578       war losing           1.0
137865  hiccups houston           1.0
                keyword keyword_label
24247       bail crisis           2.0
19389       ask chinese           2.0
312850     wait chinese           2.0
240282  removing huawei           2.0
42035   canada angering           2.0
                        keyword keyword_label
164710              korea total           3.0
87802           different north           3.0
84856   denuclearziation korean           3.0
164085          korea exclusive           3.0
52779        china implementing           3.0
                 keyword keyword_l

# 2017

In [16]:
# Location of the pre-processed 2017 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2017.json.bz2'
quote_df = pd.read_json(path_or_buf=path_to_out)

In [17]:
# Hand-labelled seed keywords for 2017, one set per topic.
north_korea = {
    'north korea', 'kim jong', 'north korean', 'korea trump',
    'korea china', 'korea nuclear', 'sanctions north', 'korea sanctions',
    'korea problem', 'korea missile', 'new sanctions', 'nuclear weapons',
    'korean regime', 'korea just', 'korean peninsula', 'missile test',
}
china_sea = {'china sea', 'south china'}
trade = {'china trade', 'trade china', 'china currency'}
hk = {'hong kong'}
climate = {'paris agreement', 'paris climate', 'climate change'}
human_rights = {'human rights'}

# No explicit outlier keywords for 2017.
others4 = None
# The position in this list determines the label (north_korea=1, ..., human_rights=6).
groups4 = [north_korea, china_sea, trade, hk, climate, human_rights]

res = classify_keywords(df=quote_df, topics=groups4, others=others4)

Generating labels


  0%|          | 0/166833 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/272221 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/272221 [00:00<?, ?it/s]

Training classifier


In [18]:
# Persist the 2017 keyword -> label table.
res.to_csv(path_or_buf='C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2017.csv')

In [19]:
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

                  keyword keyword_label
172790    opportunity ibm          -1.0
8755        allies fleets          -1.0
204241  return investment          -1.0
16798     auburn olympics          -1.0
196316     rains tomorrow          -1.0
                 keyword keyword_label
140477     lawyer jailed           6.0
203108   respect freedom           6.0
216829   sentenced years           6.0
25264         biden says           6.0
205678  rights continues           6.0
                   keyword keyword_label
223801         source glue           5.0
213502  scientists calling           5.0
234574  synagogues streets           5.0
50031      clear countries           5.0
212707    says withdrawing           5.0
                   keyword keyword_label
137761  koreans empowering           1.0
251454       trump talking           1.0
124900    interview russia           1.0
135408       korea changed           1.0
131069         jong marine           1.0
                  keyword keyword_

# 2016

In [20]:
# Location of the pre-processed 2016 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2016.json.bz2'
quote_df = pd.read_json(path_or_buf=path_to_out)

In [21]:
# Hand-labelled seed keywords for 2016, one set per topic.
china_sea = {'south china', 'china sea', 'sea ruling', 'sea china'}
olympics = {
    'rio olympics', 'usain bolt', 'gold medal', 'michael phelps',
    'olympics 2016', 'olympic gold', 'olympic games',
}
north_korea = {'north korea'}
climate = {'climate change', 'climate deal', 'paris climate'}
trade = {'free trade', 'china trade'}
hk = {'hong kong'}
economy = {'global economy', 'chinese market', 'chinese investment'}
human_rights = {'human rights'}

# No explicit outlier keywords for 2016.
others5 = None
# The position in this list determines the label (china_sea=1, ..., human_rights=8).
groups5 = [china_sea, olympics, north_korea, climate, trade, hk, economy, human_rights]

res = classify_keywords(df=quote_df, topics=groups5, others=others5)

Generating labels


  0%|          | 0/73221 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/160641 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/160641 [00:00<?, ?it/s]

Training classifier


In [22]:
# Persist the 2016 keyword -> label table.
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2016.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

                    keyword keyword_label
25746         china culprit          -1.0
59716         fw38 positive          -1.0
135311  stock manipulations          -1.0
43736        directors exit          -1.0
67917        high livestock          -1.0
                    keyword keyword_label
100923  olympics continuing           2.0
73297          injury scare           2.0
21645               cap won           2.0
150844            usa begin           2.0
40402         debbie hilary           2.0
                      keyword keyword_label
6633    announce ratification           4.0
24001           change arctic           4.0
7536     approach regulations           4.0
99542      obama unilaterally           4.0
105529         people climate           4.0
                keyword keyword_label
51751   excess capacity           7.0
137373  summit shanghai           7.0
7512     approach china           7.0
395         1914 moment           7.0
47982       economy tpp           7.0
    

# 2015

In [23]:
# Load the pre-processed 2015 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2015.json.bz2'
quote_df = pd.read_json(path_to_out)

# Hand-labelled seed keywords for 2015, one set per topic.
china_sea = {'south china', 'china sea'}
climate = {'climate change', 'climate talks', 'climate deal', 'paris climate'}
economy = {'global economy', 'chinese economy', 'china economic', 'rate hike', 'stock market',
           'chinese market', 'economy china', 'china market', 'china currency', 'china economy'}
hk = {'hong kong'}
trade = {'emerging markets', 'trade agreement', 'free trade'}
iran = {'iran deal', 'nuclear deal', 'iran nuclear'}
north_korea = {'north korea'}
winter_olympics = {'winter olympics'}
human_rights = {'human rights', 'women rights'}

# Keywords that mark a quotation as an explicit outlier.
others6 = {'taipei times'}
# The position in this list determines the label (china_sea=1, ..., human_rights=9).
groups6 = [china_sea, climate, economy, hk, trade, iran, north_korea, winter_olympics, human_rights]


res = classify_keywords(quote_df, groups6, others6)
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2015.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

Generating labels


  0%|          | 0/99917 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/226419 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/226419 [00:00<?, ?it/s]

Training classifier
               keyword keyword_label
208471   trudeau mania          -1.0
167722     reform euro          -1.0
1040         2015 lego          -1.0
187064    snowden lies          -1.0
191001  stars saturday          -1.0
                   keyword keyword_label
127601      market develop           3.0
127629     market equation           3.0
9337       angered warship           3.0
6223         ahead closing           3.0
65011   domestically china           3.0
                 keyword keyword_label
96378   harassment ships           1.0
138296   navy australian           1.0
179671           sea way           1.0
62020    differences sea           1.0
31386        castles sea           1.0
                      keyword keyword_label
134973          money markets           5.0
128575       markets universe           5.0
166256          reality china           5.0
193471  strategizing emerging           5.0
156647     positive australia           5.0
              

# 2014

In [24]:
# Load the pre-processed 2014 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2014.json.bz2'
quote_df = pd.read_json(path_to_out)

# Hand-labelled seed keywords for 2014, one set per topic.
hk = {'hong kong', 'kong protests', 'kong protesters'}
climate_change = {'climate change', 'climate deal', 'global warming', 'china climate'}
trade = {'china trade', 'chinese market', 'emerging markets', 'free trade'}
china_sea = {'china sea', 'china territorial', 'south china', 'territorial claims'}
north_korea = {'north korea'}
human_rights = {'human rights'}

# Keywords that mark a quotation as an explicit outlier.
others7 = {'channel newsasia', 'taipei times'}
# The position in this list determines the label (hk=1, ..., human_rights=6).
groups7 = [hk, climate_change, trade, china_sea, north_korea, human_rights]


res = classify_keywords(quote_df, groups7, others7)
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2014.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

Generating labels


  0%|          | 0/85135 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/206785 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/206785 [00:00<?, ?it/s]

Training classifier
                      keyword keyword_label
71929   federal investigation          -1.0
116390        market improves          -1.0
159076            roach chris          -1.0
147191           profits amid          -1.0
148186   protectionist public          -1.0
                 keyword keyword_label
165262  security council           5.0
82921      gop interview           5.0
105719       korea exist           5.0
130768     obama release           5.0
105660       korea check           5.0
                      keyword keyword_label
145150  president ideological           2.0
145093        president clear           2.0
64170         emission target           2.0
198615           warming year           2.0
170231       skipping climate           2.0
                  keyword keyword_label
54670   demonstrators bbc           1.0
137227          paul hong           1.0
105352     kong difficult           1.0
114223  mainlanders small           1.0
105340      kong d

# 2013

In [26]:
# Load the pre-processed 2013 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2013.json.bz2'
quote_df = pd.read_json(path_to_out)

# Hand-labelled seed keywords for 2013, one set per topic.
north_korea = {'north korea', 'north korean'}
china_sea = {'china sea', 'east china'}
hk = {'hong kong'}
trade = {'china trade', 'chinese market', 'emerging markets', 'free trade', 'solar panels', 'chinese solar'}
# Keywords that mark a quotation as an explicit outlier.
others8 = {'sarawak daily', 'news singapore'}
# The position in this list determines the label (north_korea=1, ..., trade=4).
groups8 = [north_korea, china_sea, hk, trade]


res = classify_keywords(quote_df, groups8, others8)
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2013.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

Generating labels


  0%|          | 0/84712 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/213046 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/213046 [00:00<?, ?it/s]

Training classifier
                keyword keyword_label
35495         china lay          -1.0
148038  potential obama          -1.0
203519   walmart raises          -1.0
114626     line rosberg          -1.0
183349      stuff china          -1.0
              keyword keyword_label
108847      kong exit           3.0
118187  mainland hong           3.0
39251      chose hong           3.0
115853    london hsbc           3.0
75298     film dramas           3.0
                   keyword keyword_label
153723  pyongyang treasury           1.0
107958             kim say           1.0
193729        topics obama           1.0
36435           china role           1.0
109425         korea quite           1.0
                     keyword keyword_label
36980            china trade           4.0
178986  sports organizations           4.0
71835              f1 global           4.0
128400      movie storylines           4.0
202575         visitors hong           4.0
                  keyword keywor

# 2012


In [27]:
# Load the pre-processed 2012 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2012.json.bz2'
quote_df = pd.read_json(path_to_out)

# Hand-labelled seed keywords for 2012, one set per topic.
london_olympics = {'london olympics', 'gold medal', 'michael phelps', '2012 olympics',
                   'usain bolt', 'olympic games', 'olympic gold', 'olympics 2012'}
china_sea = {'china sea', 'south china'}
north_korea = {'north korea'}
trade = {'china trade', 'chinese market', 'emerging markets'}
iran = {'iran oil', 'iran sanctions'}
yuan = {'china currency', 'currency manipulator'}
human_rights = {'human rights', 'chen guangcheng'}
# Keywords that mark a quotation as an explicit outlier.
others9 = {'news australian', 'channel newsasia'}
# The position in this list determines the label (london_olympics=1, ..., human_rights=7).
groups9 = [london_olympics, china_sea, north_korea, trade, iran, yuan, human_rights]


res = classify_keywords(quote_df, groups9, others9)
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2012.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

Generating labels


  0%|          | 0/151151 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/347616 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/347616 [00:00<?, ?it/s]

Training classifier
                    keyword keyword_label
241995  presidency election          -1.0
333612         war receding          -1.0
208750         nasdaq china          -1.0
138932         granted swim          -1.0
53636        cheers gosford          -1.0
             keyword keyword_label
340710      won beat           1.0
344233   wvu alumnus           1.0
247617  proud really           1.0
36044    bolt eyeing           1.0
196872   medal allen           1.0
                   keyword keyword_label
140046            greet xi           4.0
217090           obama war           4.0
83209         dakota trade           4.0
61873   chinese travellers           4.0
318075      trade tensions           4.0
                keyword keyword_label
208146  myanmar reforms           7.0
29287    beijing arrive           7.0
216666     obama people           7.0
54112         chen wish           7.0
179024    leaders human           7.0
                       keyword keyword_label

# 2011

In [28]:
# Load the pre-processed 2011 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2011.json.bz2'
quote_df = pd.read_json(path_to_out)

# Hand-labelled seed keywords for 2011, one set per topic.
china_sea = {'china sea'}
yuan = {'china currency', 'exchange rate', 'china yuan', 'currency manipulation'}
trade = {'china trade', 'emerging markets', 'trade war', 'chinese market'}
north_korea = {'north korea'}
dalai_lama = {'dalai lama'}
human_rights = {'human rights'}
# Keywords that mark a quotation as an explicit outlier.
others10 = {'news australian', 'channel newsasia'}
# The position in this list determines the label (china_sea=1, ..., human_rights=6).
groups10 = [china_sea, yuan, trade, north_korea, dalai_lama, human_rights]


res = classify_keywords(quote_df, groups10, others10)
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2011.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

Generating labels


  0%|          | 0/136548 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/317897 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/317897 [00:00<?, ?it/s]

Training classifier
                      keyword keyword_label
65914   concerned development          -1.0
28580            better green          -1.0
16261           art exhibited          -1.0
184047         meetings seven          -1.0
225854         problems today          -1.0
                  keyword keyword_label
76852          cuts china           3.0
194433     nations brazil           3.0
221613  potential markets           3.0
180670   markets expected           3.0
33901     bounce equities           3.0
                keyword keyword_label
165032    lama politics           5.0
287036    tibetan issue           5.0
88005   discussed dalai           5.0
126279        god dalai           5.0
286929    tibet nations           5.0
                 keyword keyword_label
66272      condemn china           6.0
53919      china tibetan           6.0
80010     debate anxiety           6.0
244457      rights women           6.0
282012  teachers farmers           6.0
              

# 2010

In [29]:
# Load the pre-processed 2010 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2010.json.bz2'
quote_df = pd.read_json(path_to_out)

# Hand-labelled seed keywords for 2010, one set per topic.
yuan = {'china currency', 'exchange rate', 'china yuan', 'currency war', 'chinese currency',
        'currency manipulation', 'exchange rates'}
north_korea = {'north korea'}
dalai_lama = {'dalai lama', 'meet dalai', 'obama dalai'}
iran_sanctions = {'iran sanctions', 'sanctions iran'}
china_sea = {'china sea'}

# Keywords that mark a quotation as an explicit outlier.
others11 = {'canada news', 'times qatar', 'news australian', 'free encyclopedia'}
# The position in this list determines the label (yuan=1, ..., china_sea=5).
groups11 = [yuan, north_korea, dalai_lama, iran_sanctions, china_sea]


res = classify_keywords(quote_df, groups11, others11)
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2010.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

Generating labels


  0%|          | 0/125127 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/277363 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/277363 [00:00<?, ?it/s]

Training classifier
                   keyword keyword_label
64192     creating pension          -1.0
266895       way hollywood          -1.0
201004        quayle worst          -1.0
265482          war israel          -1.0
9798    america guantanamo          -1.0
                    keyword keyword_label
181013  overwhelmingly yuan           1.0
263867         vote support           1.0
66977     currency usatoday           1.0
47886       chinese believe           1.0
64726       crisis eurozone           1.0
                      keyword keyword_label
146269       leader secretary           3.0
245838       telephoned obama           3.0
176319           offers dalai           3.0
59651   constructive dialogue           3.0
146301          leaders china           3.0
                 keyword keyword_label
262550     views nations           4.0
70862    decisions obama           4.0
62532    countries align           4.0
217552  sanctions siasat           4.0
243603   talks sanction

# 2009

In [30]:
# Load the pre-processed 2009 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2009.json.bz2'
quote_df = pd.read_json(path_to_out)

# Hand-labelled seed keywords for 2009, one set per topic.
climate_change = {'climate change', 'climate talks', 'climate deal', 'global warming',
                  'climate summit', 'climate pact', 'copenhagen climate'}
beijing_olympics = {'olympics yahoo', 'world record', 'michael phelps', 'usain bolt', 'beijing olympics'}
north_korea = {'north korea'}
dalai_lama = {'dalai lama'}
human_rights = {'human rights'}
# Keywords that mark a quotation as an explicit outlier.
others12 = {'myspace blog', 'com blogs', 'taipei times'}
# The position in this list determines the label (climate_change=1, ..., human_rights=5).
groups12 = [climate_change, beijing_olympics, north_korea, dalai_lama, human_rights]


res = classify_keywords(quote_df, groups12, others12)
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2009.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

Generating labels


  0%|          | 0/141055 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/315137 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/315137 [00:00<?, ?it/s]

Training classifier
               keyword keyword_label
50830       china halt          -1.0
284822    ties jakarta          -1.0
286481       today car          -1.0
264302     sport model          -1.0
302323  wants dialogue          -1.0
                 keyword keyword_label
96216       effect obama           1.0
285365       time herald           1.0
58645   climate politics           1.0
302872   warming fiction           1.0
109594     feeling china           1.0
                keyword keyword_label
232427   rebounds world           2.0
212079   phelps rattles           2.0
201161  olympics german           2.0
207212        park bolt           2.0
153841    jamaica usain           2.0
                     keyword keyword_label
8811          agrees nuclear           3.0
283665         threats calls           3.0
183246        military korea           3.0
15669   armistice agreements           3.0
160866    korea denuclearise           3.0
                keyword keyword_label


# 2008

In [32]:
# Load the pre-processed 2008 quotes (read here as input, despite the variable name).
path_to_out = 'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2008.json.bz2'
quote_df = pd.read_json(path_to_out)

# Hand-labelled seed keywords for 2008, one set per topic.
beijing_olympics = {'beijing olympics', 'olympics 2008', 'olympics yahoo', 'world racing',
                    'racing news', 'runner world', 'rugby cricket', 'michael phelps',
                    'sport rugby', 'gold medal', 'olympic games', 'sport olympics'}
dalai_lama = {'dalai lama', 'meeting dalai'}
# Keywords that mark a quotation as an explicit outlier
# ('channelnewsasia com' was listed twice; the duplicate had no effect in a set).
others13 = {'philippine news', 'channelnewsasia com', 'stuff nz',
            'myspace blog', 'taipei times', 'com blogs', 'india news'}
# The position in this list determines the label (beijing_olympics=1, dalai_lama=2).
groups13 = [beijing_olympics, dalai_lama]


res = classify_keywords(quote_df, groups13, others13)
res.to_csv('C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_2008.csv')
# Print up to five random keywords per predicted label as a sanity check.
# A label can have fewer than five keywords, in which case sample(5) would raise.
for label in res['keyword_label'].unique():
    members = res[res['keyword_label'] == label]
    print(members.sample(min(5, len(members))))

Generating labels


  0%|          | 0/26748 [00:00<?, ?it/s]

Generating outlier labels


  0%|          | 0/78256 [00:00<?, ?it/s]

Generating word embeddings


  0%|          | 0/78256 [00:00<?, ?it/s]

Training classifier
                  keyword keyword_label
39947  lafferty announces          -1.0
8887          break amber          -1.0
70050           text iran          -1.0
3537       applied second          -1.0
74856       want designer          -1.0
               keyword keyword_label
23259         eat gold           1.0
38875    just olympics           1.0
44587  medal christine           1.0
44661     medal sports           1.0
53586      plans twins           1.0
               keyword keyword_label
70660  tibet situation           2.0
50519       open dalai           2.0
13670      china urges           2.0
56338    protest china           2.0
40033       lama urges           2.0
