In [170]:
import json
import spacy
import os
import numpy as np

In [171]:
all_comments = []

for fn in os.listdir():
    
    if "json" in fn:
        
        with open(fn, "r") as f:
            
            as_string = f.read()
            
            if len(as_string.strip()) > 0:
            
                comment = json.loads(as_string)

                all_comments += comment

In [172]:
# hypothesis: users that end with exactly four digits are Russian bots
def is_potential_russian_bot(username):
    return len(username) > 4 and username[-4:].isdigit() and not username[-5:].isdigit()

In [173]:
bot_comments = list(map(lambda c: c[1], list(filter(lambda c: is_potential_russian_bot(c[0]), all_comments))))

In [174]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

In [175]:
# multiple nouns if a noun can be spelled multiple different ways
# e.g. the name of the Ukrainian president
def find_adjectives_and_distances_to_noun(text, nouns):
    
    nouns = list(map(lambda n: n.lower(), nouns))
    
    adjectives_and_distances = []
    
    if all(noun not in text.lower() for noun in nouns):
        return adjectives_and_distances
    
    noun_indices = []
    
    doc = nlp(text)
    
    for i, token in enumerate(doc):
        
        if token.pos_ == "PROPN" and token.text.lower() in nouns:
            
            noun_indices.append(i)
            
    if len(noun_indices) > 0:
        
        adjectives_and_positions = [(j, token.text) for j, token in enumerate(doc) if token.pos_ == "ADJ"]
        
        for index in noun_indices:
            
            for adj_index, adj in adjectives_and_positions:
                
                adjectives_and_distances.append((adj, abs(index - adj_index)))

    return adjectives_and_distances

In [176]:
def find_all_distances_to_adjectives(comments, nouns):
    
    adjectives_and_distances = []
    
    for comment in comments:
        
        adjectives_and_distances += find_adjectives_and_distances_to_noun(comment, nouns)
        
    return adjectives_and_distances

In [177]:
def reduce_distances_to_adjectives(distances_to_adjectives):
    
    unique_word_distances = dict()
    
    # take mean distance if word appears multiple times
    for word, distance in distances_to_adjectives:
        
        if word not in unique_word_distances:
            
            unique_word_distances[word] = [distance]
            
        else:
            unique_word_distances[word].append(distance)
    
    metrics = []
    
    for word, distances in unique_word_distances.items():
        
        mean_distance = sum(distances) / len(distances)
        std_distance = np.std(np.array(distances))
        median_distance = sorted(distances)[len(distances) // 2]
        
        metrics.append((word, mean_distance, std_distance, median_distance))
        
    return metrics

In [178]:
def get_metrics(comments, nouns):
    
    adjectives_and_distances = find_all_distances_to_adjectives(comments, nouns)
    return reduce_distances_to_adjectives(adjectives_and_distances)

In [181]:
def print_metrics(comments, word, top_k = 10):
    
    metrics = get_metrics(comments, word)
    
    # we need a non-zero standard deviation to asses uncertainty
    metrics = list(filter(lambda t: t[2] > 0, metrics))
    
     # Assuming `top_k`, `word`, and `metrics` are defined
    print(f"The {top_k} nearest adjectives to the word {word}")
    print("By mean distance:")

    by_mean_distance = sorted(metrics, key=lambda t: t[1])[:top_k]

    for i, (w, mean, std, median) in enumerate(by_mean_distance):
        print(f"{i + 1}. {w:20}: mean distance = {mean:6.2f}, std distance = {std:6.2f}")

    print()
    print("By median distance:")

    by_median_distance = sorted(metrics, key=lambda t: t[3])[:top_k]

    for i, (w, mean, std, median) in enumerate(by_median_distance):
        print(f"{i + 1}. {w:20}: median = {median:6.2f}, std distance = {std:6.2f}")

In [186]:
print_metrics(bot_comments, ["Vladimir", "Putin"])

The 10 nearest adjectives to the word ['Vladimir', 'Putin']
By mean distance:
1. pure                : mean distance =   3.50, std distance =   1.50
2. best                : mean distance =   4.00, std distance =   4.05
3. glad                : mean distance =   4.33, std distance =   1.70
4. lifelong            : mean distance =   4.50, std distance =   3.50
5. New                 : mean distance =   4.50, std distance =   2.50
6. personal            : mean distance =   4.50, std distance =   0.50
7. scared              : mean distance =   5.25, std distance =   3.34
8. ok                  : mean distance =   7.50, std distance =   2.50
9. jealous             : mean distance =   7.50, std distance =   0.50
10. wise                : mean distance =   8.00, std distance =   5.00

By median distance:
1. best                : median =   2.00, std distance =   4.05
2. scared              : median =   4.00, std distance =   3.34
3. timid               : median =   4.00, std distance =  11.1

In [206]:
print_metrics(bot_comments, ["Tucker", "Carlson"])

The 10 nearest adjectives to the word ['Tucker', 'Carlson']
By mean distance:
1. exact               : mean distance =   4.50, std distance =   0.50
2. western             : mean distance =   5.50, std distance =   0.50
3. Interesting         : mean distance =   5.50, std distance =   0.50
4. weird               : mean distance =   5.50, std distance =   0.50
5. calm                : mean distance =   7.50, std distance =   0.50
6. normal              : mean distance =   7.50, std distance =   0.50
7. popular             : mean distance =   8.50, std distance =   2.50
8. controversial       : mean distance =   8.50, std distance =   7.50
9. only                : mean distance =   9.50, std distance =   8.99
10. bad                 : mean distance =  10.00, std distance =   9.19

By median distance:
1. meager              : median =   4.00, std distance =  23.81
2. only                : median =   5.00, std distance =   8.99
3. exact               : median =   5.00, std distance =   0.5

In [207]:
print_metrics(bot_comments, ["Zelensky", "Zelenskiy", "Zelenskyy", "Volodymyr"])

The 10 nearest adjectives to the word ['Zelensky', 'Zelenskiy', 'Zelenskyy', 'Volodymyr']
By mean distance:
1. more                : mean distance =  11.50, std distance =   6.50
2. ready               : mean distance =  13.50, std distance =   6.50
3. first               : mean distance =  17.50, std distance =   4.50
4. former              : mean distance =  22.50, std distance =  17.50
5. real                : mean distance =  24.50, std distance =   7.50
6. Western             : mean distance =  35.33, std distance =  24.23

By median distance:
1. more                : median =  18.00, std distance =   6.50
2. ready               : median =  20.00, std distance =   6.50
3. first               : median =  22.00, std distance =   4.50
4. real                : median =  32.00, std distance =   7.50
5. Western             : median =  39.00, std distance =  24.23
6. former              : median =  40.00, std distance =  17.50


In [208]:
print_metrics(bot_comments, ["Trump"])

The 10 nearest adjectives to the word ['Trump']
By mean distance:
1. first               : mean distance =   4.00, std distance =   1.41
2. able                : mean distance =   4.00, std distance =   2.00
3. intelligent         : mean distance =   8.50, std distance =   5.50
4. successful          : mean distance =  10.00, std distance =   6.00
5. decisive            : mean distance =  10.50, std distance =   5.50
6. latter              : mean distance =  10.50, std distance =   0.50
7. afraid              : mean distance =  10.50, std distance =   8.50
8. diplomatic          : mean distance =  11.50, std distance =   5.50
9. strong              : mean distance =  12.00, std distance =   4.00
10. constant            : mean distance =  12.00, std distance =   7.00

By median distance:
1. first               : median =   5.00, std distance =   1.41
2. able                : median =   6.00, std distance =   2.00
3. deep                : median =  11.00, std distance =  36.81
4. latter 

In [209]:
print_metrics(bot_comments, ["Joe", "Biden"])

The 10 nearest adjectives to the word ['Joe', 'Biden']
By mean distance:
1. competent           : mean distance =   3.00, std distance =   1.00
2. Corrupt             : mean distance =   3.50, std distance =   0.50
3. Open                : mean distance =   4.50, std distance =   0.50
4. inside              : mean distance =   4.50, std distance =   1.50
5. worse               : mean distance =   5.50, std distance =   0.50
6. sleepy              : mean distance =   7.00, std distance =   5.52
7. deep                : mean distance =   9.50, std distance =   0.50
8. loyal               : mean distance =  10.50, std distance =   7.50
9. terrible            : mean distance =  12.50, std distance =   0.50
10. current             : mean distance =  13.50, std distance =   0.50

By median distance:
1. illegal             : median =   2.00, std distance =  27.58
2. competent           : median =   4.00, std distance =   1.00
3. Corrupt             : median =   4.00, std distance =   0.50
4. 

In [210]:
print_metrics(bot_comments, ["Scholz", "Olaf"])

The 10 nearest adjectives to the word ['Scholz', 'Olaf']
By mean distance:
1. scared              : mean distance =   2.50, std distance =   0.50
2. weak                : mean distance =   5.50, std distance =   0.50
3. western             : mean distance =   9.50, std distance =   0.50
4. nein                : mean distance =  15.50, std distance =   0.50
5. little              : mean distance =  18.00, std distance =  16.00
6. drei                : mean distance =  27.50, std distance =   0.50

By median distance:
1. scared              : median =   3.00, std distance =   0.50
2. weak                : median =   6.00, std distance =   0.50
3. western             : median =  10.00, std distance =   0.50
4. nein                : median =  16.00, std distance =   0.50
5. drei                : median =  28.00, std distance =   0.50
6. little              : median =  34.00, std distance =  16.00


In [211]:
print_metrics(bot_comments, ["Johnson", "Boris"])

The 10 nearest adjectives to the word ['Johnson', 'Boris']
By mean distance:
1. Fucking             : mean distance =   1.50, std distance =   0.50
2. busy                : mean distance =   2.50, std distance =   0.50
3. 8th                 : mean distance =   4.50, std distance =   0.50
4. ill                 : mean distance =   5.50, std distance =   0.50
5. Responsible         : mean distance =   5.50, std distance =   0.50
6. proxy               : mean distance =   7.40, std distance =   3.20
7. 27th                : mean distance =   7.50, std distance =   0.50
8. many                : mean distance =   8.50, std distance =   0.50
9. more                : mean distance =   9.50, std distance =   3.04
10. Best                : mean distance =  12.50, std distance =   0.50

By median distance:
1. Fucking             : median =   2.00, std distance =   0.50
2. busy                : median =   3.00, std distance =   0.50
3. real                : median =   4.00, std distance =  74.25

In [212]:
print_metrics(bot_comments, ["Alexei", "Alexej", "Navalny"])

The 10 nearest adjectives to the word ['Alexei', 'Alexej', 'Navalny']
By mean distance:
1. sad                 : mean distance =   6.00, std distance =   4.32
2. covid               : mean distance =   6.50, std distance =   0.50
3. same                : mean distance =  11.89, std distance =  10.69
4. long                : mean distance =  12.50, std distance =  10.50
5. most                : mean distance =  13.50, std distance =   0.50
6. sure                : mean distance =  14.50, std distance =   0.50
7. full                : mean distance =  15.50, std distance =   0.50
8. brave               : mean distance =  15.75, std distance =   3.83
9. young               : mean distance =  17.00, std distance =   1.00
10. many                : mean distance =  18.00, std distance =   0.82

By median distance:
1. sad                 : median =   4.00, std distance =   4.32
2. same                : median =   6.00, std distance =  10.69
3. covid               : median =   7.00, std distan

In [213]:
print_metrics(bot_comments, ["Russia", "russian"])

The 10 nearest adjectives to the word ['Russia', 'russian']
By mean distance:
1. certain             : mean distance =   5.50, std distance =   1.50
2. afraid              : mean distance =   6.71, std distance =   4.95
3. stronger            : mean distance =   7.00, std distance =   5.00
4. crazy               : mean distance =   7.00, std distance =   5.00
5. average             : mean distance =   7.00, std distance =   6.00
6. horrible            : mean distance =   7.50, std distance =   4.50
7. strict              : mean distance =   8.50, std distance =   1.50
8. weak                : mean distance =   8.75, std distance =   5.89
9. timid               : mean distance =  10.00, std distance =   7.00
10. single              : mean distance =  10.33, std distance =   4.78

By median distance:
1. afraid              : median =   5.00, std distance =   4.95
2. last                : median =   6.00, std distance =  23.74
3. responsible         : median =   6.00, std distance =  53.9

In [214]:
print_metrics(bot_comments, ["Ukraine", "ukrainian"])

The 10 nearest adjectives to the word ['Ukraine', 'ukrainian']
By mean distance:
1. transparent         : mean distance =   5.00, std distance =   2.00
2. dangerous           : mean distance =   5.50, std distance =   2.50
3. More                : mean distance =   6.00, std distance =   2.00
4. poor                : mean distance =   6.50, std distance =   3.50
5. previous            : mean distance =   8.33, std distance =   4.92
6. last                : mean distance =   8.40, std distance =  10.23
7. full                : mean distance =   9.00, std distance =   3.74
8. future              : mean distance =  10.75, std distance =   8.79
9. socially            : mean distance =  12.00, std distance =   6.98
10. smart               : mean distance =  12.33, std distance =   3.86

By median distance:
1. last                : median =   3.00, std distance =  10.23
2. transparent         : median =   7.00, std distance =   2.00
3. full                : median =   8.00, std distance =   

In [215]:
print_metrics(bot_comments, ["United states", "USA", "US", "America"])

The 10 nearest adjectives to the word ['United states', 'USA', 'US', 'America']
By mean distance:
1. western             : mean distance =   4.33, std distance =   2.05
2. UNDERSTAND????THIS  : mean distance =   5.00, std distance =   2.00
3. anti                : mean distance =   6.00, std distance =   3.08
4. awesome             : mean distance =   6.50, std distance =   1.50
5. about               : mean distance =   7.00, std distance =   5.00
6. primary             : mean distance =   7.50, std distance =   0.50
7. crooked             : mean distance =   8.00, std distance =   7.00
8. critical            : mean distance =   8.00, std distance =   1.00
9. Syrian              : mean distance =   8.50, std distance =   0.50
10. drunk               : mean distance =   9.00, std distance =   7.00

By median distance:
1. western             : median =   4.00, std distance =   2.05
2. willing             : median =   5.00, std distance =  13.02
3. better              : median =   6.00, 

In [216]:
print_metrics(bot_comments, ["EU", "european"])

The 10 nearest adjectives to the word ['EU', 'european']
By mean distance:
1. American            : mean distance =   5.50, std distance =   3.50
2. Nuclear             : mean distance =  10.00, std distance =   3.00
3. foreign             : mean distance =  10.00, std distance =   7.00
4. direct              : mean distance =  13.00, std distance =   7.00
5. smart               : mean distance =  13.00, std distance =   5.00
6. wise                : mean distance =  13.00, std distance =   7.00
7. sad                 : mean distance =  13.00, std distance =  10.00
8. whole               : mean distance =  13.50, std distance =  10.50
9. Russian             : mean distance =  13.67, std distance =   8.96
10. stupid              : mean distance =  14.00, std distance =   8.00

By median distance:
1. American            : median =   9.00, std distance =   3.50
2. Russian             : median =  10.00, std distance =   8.96
3. Nuclear             : median =  13.00, std distance =   3.00
4

In [200]:
print_metrics(bot_comments, ["NATO"])

The 10 nearest adjectives to the word ['NATO']
By mean distance:
1. direct              : mean distance =   7.00, std distance =   3.00
2. proxy               : mean distance =   7.40, std distance =   1.96
3. Norwegian           : mean distance =   7.50, std distance =   1.50
4. full                : mean distance =   7.67, std distance =   4.50
5. agressive           : mean distance =   8.50, std distance =   6.50
6. regional            : mean distance =   8.67, std distance =   4.92
7. Ukrainian           : mean distance =   9.20, std distance =   3.49
8. peaceful            : mean distance =   9.50, std distance =   3.50
9. impossible          : mean distance =  10.00, std distance =   3.00
10. less                : mean distance =  11.00, std distance =   6.00

By median distance:
1. diplomatic          : median =   7.00, std distance =  12.28
2. full                : median =   8.00, std distance =   4.50
3. regional            : median =   8.00, std distance =   4.92
4. proxy   

In [217]:
print_metrics(bot_comments, ["Germany", "German"])

The 10 nearest adjectives to the word ['Germany', 'German']
By mean distance:
1. afraid              : mean distance =   8.50, std distance =   4.72
2. largest             : mean distance =   9.50, std distance =   5.50
3. reliant             : mean distance =  10.00, std distance =   5.72
4. last                : mean distance =  10.67, std distance =   5.44
5. true                : mean distance =  12.33, std distance =   8.34
6. same                : mean distance =  12.50, std distance =   8.50
7. Russias             : mean distance =  13.50, std distance =   9.50
8. Western             : mean distance =  16.50, std distance =   8.50
9. political           : mean distance =  18.50, std distance =   9.50
10. small               : mean distance =  19.00, std distance =   6.00

By median distance:
1. true                : median =   8.00, std distance =   8.34
2. last                : median =   9.00, std distance =   5.44
3. reliant             : median =  10.00, std distance =   5.7

In [218]:
print_metrics(bot_comments, ["UK", "United Kingdom", "Britain"])

The 10 nearest adjectives to the word ['UK', 'United Kingdom', 'Britain']
By mean distance:
1. covid               : mean distance =   9.00, std distance =   4.00
2. few                 : mean distance =  18.50, std distance =  15.50
3. missing             : mean distance =  23.00, std distance =   4.00
4. more                : mean distance =  31.33, std distance =  37.95
5. nuclear             : mean distance =  33.00, std distance =   3.00
6. Ukrainian           : mean distance =  37.50, std distance =  30.50
7. American            : mean distance =  38.00, std distance =  26.00
8. Russian             : mean distance =  56.50, std distance =  43.50
9. European            : mean distance =  83.50, std distance =  80.50

By median distance:
1. more                : median =   5.00, std distance =  37.95
2. covid               : median =  13.00, std distance =   4.00
3. missing             : median =  27.00, std distance =   4.00
4. few                 : median =  34.00, std distance =

In [220]:
print_metrics(bot_comments, ["media", "CNN", "MSNBC", "NBC", "mainstream"])

The 10 nearest adjectives to the word ['media', 'CNN', 'MSNBC', 'NBC', 'mainstream']
By mean distance:
1. fake                : mean distance =   2.80, std distance =   1.83
2. much                : mean distance =   3.00, std distance =   1.00
3. delusional          : mean distance =   5.00, std distance =   1.00
4. liberal             : mean distance =   5.00, std distance =   1.00
5. full                : mean distance =   5.67, std distance =   0.94
6. good                : mean distance =   6.00, std distance =   2.00
7. western             : mean distance =   6.50, std distance =   6.18
8. real                : mean distance =  10.00, std distance =   3.74
9. stupid              : mean distance =  12.33, std distance =   9.84
10. Russian             : mean distance =  13.80, std distance =   5.56

By median distance:
1. fake                : median =   2.00, std distance =   1.83
2. much                : median =   4.00, std distance =   1.00
3. western             : median =   5

In [199]:
# what's up with the COVID in the comments?
for covid_comment in list(filter(lambda c: "covid" in c.lower(), bot_comments)):
    print(covid_comment)

Trump faced Covid that hurt him.
Trump was useful to Putin and Putin played him like a father plays a stupid child. But Trump could not give Putin the fracking technology he wanted.  

Unlike Russia, the American President is a lot more controlled by Congress and the Constitution than Putin.

Still Trump feasted on the idea of himself as the  miraculous President....Until a real problem taking real leadership and real intelligence and real experience came along. Suddenly all the showboating, all the ballyhoo, all the lying meant nothing. Covid said hello to The Donald and The Donald was stuck for an answer.

Only in America could such a man be elected.  It was the outdated and dangerous Electoral College,  Russia and racial spite that put Trump into office. All three are mistakes we should have avoided. 

What happened to America's ability to detect a charlatan? What blinding force of nincompoopery controls the thinking of voters who just don't  know the facts but persist in voting for