In [2]:
import requests
import os
import json
import pandas as pd
import glob
import numpy as np

In [3]:
# Twitter API Keys read from file
with open("C:/STUFF/RESEARCH/TENet/SECURITY/keys.json", 'r') as f:
    tokens = json.load(f)

bearer_token = tokens['BEARERTOKEN']

In [4]:
def create_url_find_followers(user_id):
    # Replace with user ID below
    #user_id = 2244994945 #1561044881161994240 #2244994945
    return "https://api.twitter.com/2/users/{}/followers".format(user_id)

def create_url_tweet_lookup(tweet_ids):
    tweet_fields = "tweet.fields=lang,author_id"
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    ids = f"ids={tweet_ids}"
    # You can adjust ids to include a single Tweets.
    # Or you can add to up to 100 comma-separated IDs
    url = f"https://api.twitter.com/2/tweets?{ids}&{tweet_fields}"
    return url

def create_url_user_lookup(user_id):
    url = f"https://api.twitter.com/2/users/{user_id}"
    return url

def get_params():
    return {"user.fields": "created_at", "max_results": "1000"}


def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2FollowersLookupPython"
    return r


def connect_to_endpoint(url, params=""):
    response = requests.request("GET", url, auth=bearer_oauth, params=params)
    print(url, params, response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()


def main():
    url = create_url_find_followers()
    params = get_params()
    json_response = connect_to_endpoint(url, params)
    print(json.dumps(json_response, indent=4, sort_keys=True))

In [183]:
connect_to_endpoint(create_url_tweet_lookup(1563638942968467456), {"media.fields":"url"})

https://api.twitter.com/2/tweets?ids=1563638942968467456&tweet.fields=lang,author_id {'media.fields': 'url'} 200


{'data': [{'id': '1563638942968467456',
   'text': 'GOOD SCRAP 🥊💥 @FaZe_Sensei https://t.co/6fJ6LN3xGl',
   'lang': 'en',
   'author_id': '183205850'}]}

In [97]:
connect_to_endpoint(create_url_tweet_lookup(1498310370661834754))

200


{'data': [{'text': 'In new @ipcc_ch report, we found key risks of human-caused #climatechange: species #extinctions, #tree death, #wildfire increase, #ecosystem #carbon loss, yet people cutting #carbon pollution can avert most drastic future risks https://t.co/dBfVAYeshH #ipcc #ar6 #climatereport https://t.co/K6oS4AWeXg',
   'lang': 'en',
   'id': '1498310370661834754',
   'author_id': '767859059793989632'}]}

In [100]:
connect_to_endpoint(create_url_user_lookup(767859059793989632))

200


{'data': {'id': '767859059793989632',
  'name': 'Patrick Gonzalez',
  'username': 'pgonzaleztweet'}}

In [101]:
connect_to_endpoint(create_url_user_lookup(2498932848))

200


{'data': {'id': '2498932848',
  'name': 'Berkeley Forests',
  'username': 'berkeleyforests'}}

In [98]:
connect_to_endpoint(create_url_find_followers(2244994945),get_params())

200


{'data': [{'name': '$GME🏴\u200d☠️🚀🟣🇺🇦🇺🇦🇺🇦🇺🇦🇪🇺🇺🇸',
   'created_at': '2022-08-22T00:33:08.000Z',
   'id': '1561511723881873408',
   'username': 'DRS_GME_FOREVER'},
  {'name': 'Alan Castro',
   'created_at': '2021-12-07T21:03:47.000Z',
   'id': '1468325355043831822',
   'username': 'AlanCas49083754'},
  {'name': 'James Torley',
   'created_at': '2022-08-27T16:11:59.000Z',
   'id': '1563559897262071808',
   'username': 'enablec_ch'},
  {'name': 'jovem.dev',
   'created_at': '2021-10-04T17:57:14.000Z',
   'id': '1445085570951221257',
   'username': 'ronn001'},
  {'name': 'أبو العالم شهارة',
   'created_at': '2021-01-28T01:40:12.000Z',
   'id': '1354604944523980806',
   'username': 'Abormah_772'},
  {'name': 'Sharon Hanger',
   'created_at': '2011-01-18T02:12:48.000Z',
   'id': '239629714',
   'username': 'sharonhanger'},
  {'name': 'Vasudev Patel',
   'created_at': '2021-12-17T09:56:14.000Z',
   'id': '1471780297142243330',
   'username': 'Vasudev65506890'},
  {'name': 'Jabour',
   'created

## prep for athena 

In [130]:
import re
import csv

folder = "C:\\STUFF\\RESEARCH\\Brandwatch\\DATA\\MainQuery\\2033735572"

all_files = glob.glob(folder + "\\*.csv")

detect_chars = r"[\n,.]+"
replace_chars = " "

for this_file in all_files:
    os.path.splittext(os.path.basename(this_file))
    df = pd.read_csv(this_file)

    df.rename(columns={ col: col.replace(" ","_") for col in df.columns }, inplace=True)

    df['Title'] = df['Title'].apply(lambda x: re.sub(detect_chars, replace_chars, x) if type(x) == str else x)
    df['Full_Text'] = df['Full_Text'].apply(lambda x: re.sub(detect_chars, replace_chars, x) if type(x) == str else x)
    df.to_csv(folder + f"\\fixed_{os.path.basename(this_file)}", index=False, quoting=csv.QUOTE_ALL)
    df.to_parquet(folder + f"\\fixed_{}.parquet")



  df = pd.read_csv(this_file)
  df = pd.read_csv(this_file)


In [197]:
# datatypes mapping for columns
dtype_to_output = {np.dtype('int64') : 'int', np.dtype('<M8[ns]'): 'timestamp', np.dtype('float64') : 'float',
       np.dtype('bool') : 'boolean', np.dtype('O'): 'string'}

cd = df.dtypes.to_dict()
mapping_string = ",".join([f"{k} {dtype_to_output[cd[k]]}" for k in cd])
print(mapping_string)

Query Id int,Query Name string,Date timestamp,Title string,Url string,Domain string,Sentiment string,Page Type string,Language string,Country Code string,Continent Code string,Continent string,Country string,City Code string,Account Type string,Added string,Assignment float,Author string,Avatar string,Category Details string,Checked boolean,City string,Display URLs float,Expanded URLs string,Facebook Author ID float,Facebook Comments int,Facebook Likes int,Facebook Role float,Facebook Shares int,Facebook Subtype float,Full Name string,Full Text string,Gender string,Hashtags string,Impact float,Impressions int,Instagram Comments int,Instagram Followers int,Instagram Following int,Instagram Interactions Count int,Instagram Likes int,Instagram Posts int,Interest string,Last Assignment Date float,Latitude float,Location Name string,Longitude float,Media Filter float,Media URLs string,Mentioned Authors string,Original Url string,Priority float,Professions string,Resource Id string,Short URL

## actor identification

In [12]:
dummy_data_dir = 'C:\\STUFF\\RESEARCH\\TENet\\DummyData\\'

file_table_1 = 'dummy_news_domain.csv'
file_table_2 = 'dummy_user_data.csv'
file_table_3 = 'dummy_social_media_data.csv'

file_table_5 = 'dummy_gdelt_events.csv'
file_table_6 = 'dummy_actor.csv'
file_table_7 = 'dummy_indv_actor.csv'
file_table_8 = 'dummy_comm_actor.csv'
file_table_9 = 'dummy_plat_actor.csv'

In [13]:
users = pd.read_csv(dummy_data_dir + file_table_2)
print(users.head())

msgs = pd.read_csv(dummy_data_dir + file_table_3)
print(msgs.head())

   user_id source_user_id platform
0        0     u0_twitter  twitter
1        1     u1_twitter  twitter
2        2     u2_twitter  twitter
3        3     u3_twitter  twitter
4        4     u4_twitter  twitter
   msg_id  user_id platform   plat_msg_id parent_plat_msg_id  \
0       0       81  twitter  m100_twitter        m35_twitter   
1       1      197   reddit   m100_reddit         m80_reddit   
2       2      231      gab      m100_gab            m63_gab   
3       3      371    4chan    m100_4chan          m85_4chan   
4       4       62  twitter  m101_twitter        m21_twitter   

                   time              article_url          domain_name  
0  2018-03-01T00:54:00Z    click2houston.com/a/1    click2houston.com  
1  2018-03-01T00:58:00Z             kwch.com/c/0             kwch.com  
2  2018-03-01T01:57:00Z          vulture.com/a/2          vulture.com  
3  2018-03-01T02:36:00Z  theconversation.com/e/2  theconversation.com  
4  2018-03-01T02:58:00Z        navytimes.com/

In [16]:
# assumption: source_msg_id is unique inside it's platform
src_msg_id_to_msg_id = msgs.set_index(['plat_msg_id','platform'])['msg_id'].to_dict()
src_msg_id_to_msg_id 

{('m100_twitter', 'twitter'): 0,
 ('m100_reddit', 'reddit'): 1,
 ('m100_gab', 'gab'): 2,
 ('m100_4chan', '4chan'): 3,
 ('m101_twitter', 'twitter'): 4,
 ('m101_reddit', 'reddit'): 5,
 ('m101_gab', 'gab'): 6,
 ('m101_4chan', '4chan'): 7,
 ('m102_twitter', 'twitter'): 8,
 ('m102_reddit', 'reddit'): 9,
 ('m102_gab', 'gab'): 10,
 ('m102_4chan', '4chan'): 11,
 ('m103_twitter', 'twitter'): 12,
 ('m103_reddit', 'reddit'): 13,
 ('m103_gab', 'gab'): 14,
 ('m103_4chan', '4chan'): 15,
 ('m104_twitter', 'twitter'): 16,
 ('m104_reddit', 'reddit'): 17,
 ('m104_gab', 'gab'): 18,
 ('m104_4chan', '4chan'): 19,
 ('m105_twitter', 'twitter'): 20,
 ('m105_reddit', 'reddit'): 21,
 ('m105_gab', 'gab'): 22,
 ('m105_4chan', '4chan'): 23,
 ('m106_twitter', 'twitter'): 24,
 ('m106_reddit', 'reddit'): 25,
 ('m106_gab', 'gab'): 26,
 ('m106_4chan', '4chan'): 27,
 ('m107_twitter', 'twitter'): 28,
 ('m107_reddit', 'reddit'): 29,
 ('m107_gab', 'gab'): 30,
 ('m107_4chan', '4chan'): 31,
 ('m108_twitter', 'twitter'): 32,


In [21]:
msgs['parent_msg_id'] = msgs.apply(lambda row: src_msg_id_to_msg_id[(row['parent_plat_msg_id'],row['platform'])] if (row['parent_plat_msg_id'],row['platform']) in src_msg_id_to_msg_id else -1, axis=1)

In [24]:
msg_id_to_user_id = msgs.set_index('msg_id')['user_id'].to_dict()
msg_id_to_user_id

{0: 81,
 1: 197,
 2: 231,
 3: 371,
 4: 62,
 5: 167,
 6: 252,
 7: 316,
 8: 85,
 9: 122,
 10: 218,
 11: 317,
 12: 79,
 13: 148,
 14: 279,
 15: 322,
 16: 81,
 17: 154,
 18: 263,
 19: 399,
 20: 60,
 21: 140,
 22: 260,
 23: 303,
 24: 79,
 25: 173,
 26: 225,
 27: 398,
 28: 71,
 29: 150,
 30: 202,
 31: 386,
 32: 21,
 33: 110,
 34: 288,
 35: 376,
 36: 76,
 37: 180,
 38: 252,
 39: 320,
 40: 93,
 41: 147,
 42: 236,
 43: 341,
 44: 91,
 45: 188,
 46: 256,
 47: 392,
 48: 32,
 49: 182,
 50: 201,
 51: 359,
 52: 86,
 53: 132,
 54: 241,
 55: 372,
 56: 45,
 57: 173,
 58: 282,
 59: 337,
 60: 21,
 61: 113,
 62: 265,
 63: 375,
 64: 21,
 65: 197,
 66: 286,
 67: 361,
 68: 54,
 69: 190,
 70: 273,
 71: 398,
 72: 38,
 73: 115,
 74: 291,
 75: 320,
 76: 90,
 77: 165,
 78: 214,
 79: 330,
 80: 80,
 81: 172,
 82: 226,
 83: 387,
 84: 82,
 85: 142,
 86: 211,
 87: 316,
 88: 86,
 89: 121,
 90: 285,
 91: 305,
 92: 58,
 93: 150,
 94: 240,
 95: 319,
 96: 35,
 97: 199,
 98: 277,
 99: 369,
 100: 77,
 101: 106,
 102: 227,
 10

In [27]:
msgs['parent_user_id'] = msgs['parent_msg_id'].apply(lambda x: msg_id_to_user_id[x] if x in msg_id_to_user_id else -1)

In [29]:
msgs.set_index('msg_id')

Unnamed: 0_level_0,user_id,platform,plat_msg_id,parent_plat_msg_id,time,article_url,domain_name,parent_msg_id,parent_user_id
msg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,81,twitter,m100_twitter,m35_twitter,2018-03-01T00:54:00Z,click2houston.com/a/1,click2houston.com,-1,-1
1,197,reddit,m100_reddit,m80_reddit,2018-03-01T00:58:00Z,kwch.com/c/0,kwch.com,-1,-1
2,231,gab,m100_gab,m63_gab,2018-03-01T01:57:00Z,vulture.com/a/2,vulture.com,-1,-1
3,371,4chan,m100_4chan,m85_4chan,2018-03-01T02:36:00Z,theconversation.com/e/2,theconversation.com,-1,-1
4,62,twitter,m101_twitter,m21_twitter,2018-03-01T02:58:00Z,navytimes.com/e/2,navytimes.com,-1,-1
...,...,...,...,...,...,...,...,...,...
3995,353,4chan,m1098_4chan,m253_4chan,2018-05-21T20:34:00Z,wgrz.com/b/3,wgrz.com,615,326
3996,26,twitter,m1099_twitter,m972_twitter,2018-05-21T20:46:00Z,jewishpress.com/a/2,jewishpress.com,3488,59
3997,147,reddit,m1099_reddit,m888_reddit,2018-05-21T21:05:00Z,feeds.reuters.com/e/2,feeds.reuters.com,3153,154
3998,297,gab,m1099_gab,m668_gab,2018-05-21T21:25:00Z,mydaytondailynews.com/a/4,mydaytondailynews.com,2274,206


In [45]:
edges = msgs.groupby(['parent_user_id','user_id'],as_index=False).size().rename(columns={'parent_user_id':'Source','user_id':'Target','size':'Count'})
edges = edges[edges['Source'] != -1]
edges

Unnamed: 0,Source,Target,Count
358,0,1,2
359,0,40,1
360,0,51,1
361,0,56,1
362,0,61,1
...,...,...,...
3292,399,349,1
3293,399,369,1
3294,399,378,1
3295,399,382,1


In [54]:
edges.to_csv(dummy_data_dir + 'dummy_user_retweet_network.csv', index=False)

In [60]:
import networkx as nx
from infomap import Infomap

In [48]:
G = nx.from_pandas_edgelist(edges,'Source','Target')

In [58]:
r = nx.betweenness_centrality(G)

In [59]:
r

{0: 0.0007829220310367549,
 1: 0.0006542059993259361,
 40: 0.000791011692758799,
 51: 0.000811866351595515,
 56: 0.0004246698107170283,
 61: 0.0003673558895914732,
 65: 0.0001862290854771629,
 70: 0.000835867883054417,
 91: 0.0016608186733488512,
 96: 0.0010481216651098938,
 99: 0.0009458519185422565,
 5: 0.0007157948645802112,
 8: 0.001747203245380321,
 18: 0.0006564684314334322,
 24: 0.0006241155331423227,
 58: 0.0014953406570878344,
 63: 0.0011606390681645118,
 75: 0.0005397634442463042,
 2: 0.0005689481140893983,
 15: 0.00021955836904943968,
 28: 0.00011811461686730488,
 54: 0.0009614291633556987,
 73: 0.00025973948894703915,
 77: 0.0015832243924213825,
 3: 0.00025451198423342606,
 37: 0.0010383977407165455,
 71: 0.0005161001808882788,
 84: 0.000514073521478161,
 4: 0.0005180161956770534,
 17: 0.000628489707138859,
 19: 0.00033618678274224,
 31: 0.0004471973520079348,
 44: 0.001063725506935903,
 53: 0.0005295964794276986,
 83: 0.0008336471898316442,
 11: 0.00016376998070343556,
 13

In [61]:
im = Infomap()

In [64]:
edges.apply(lambda row: im.add_link(row['Source'],row['Target']), axis=1)

358     None
359     None
360     None
361     None
362     None
        ... 
3292    None
3293    None
3294    None
3295    None
3296    None
Length: 2939, dtype: object

In [70]:
im.num_nodes

400

In [72]:
%%time
im.run()

Wall time: 39.7 ms


In [73]:
im.num_top_modules

4

In [74]:
im.codelength

6.5391060753649874

In [75]:
for node in im.tree:
    if node.is_leaf:
        print(node.node_id, node.module_id)

369 1
390 1
326 1
339 1
314 1
305 1
354 1
342 1
303 1
349 1
358 1
398 1
306 1
307 1
319 1
329 1
352 1
331 1
399 1
318 1
364 1
355 1
377 1
343 1
387 1
333 1
379 1
301 1
310 1
344 1
383 1
385 1
388 1
362 1
346 1
334 1
372 1
341 1
363 1
380 1
375 1
361 1
347 1
374 1
340 1
337 1
302 1
357 1
397 1
313 1
325 1
376 1
321 1
373 1
367 1
370 1
371 1
311 1
396 1
382 1
360 1
304 1
332 1
393 1
365 1
330 1
359 1
336 1
335 1
328 1
323 1
384 1
381 1
300 1
308 1
353 1
392 1
394 1
356 1
320 1
316 1
345 1
324 1
351 1
309 1
317 1
386 1
322 1
389 1
391 1
327 1
395 1
350 1
315 1
348 1
312 1
368 1
366 1
338 1
378 1
52 2
91 2
34 2
39 2
49 2
8 2
77 2
13 2
22 2
58 2
93 2
76 2
99 2
54 2
96 2
79 2
64 2
62 2
42 2
25 2
0 2
32 2
24 2
40 2
72 2
44 2
26 2
63 2
47 2
87 2
33 2
5 2
37 2
17 2
82 2
18 2
92 2
90 2
81 2
1 2
68 2
71 2
61 2
70 2
95 2
88 2
83 2
48 2
53 2
55 2
2 2
78 2
31 2
51 2
75 2
84 2
94 2
4 2
56 2
20 2
80 2
38 2
89 2
46 2
86 2
57 2
85 2
97 2
50 2
69 2
41 2
67 2
19 2
66 2
74 2
3 2
21 2
98 2
15 2
36 2
45 2
7 

In [84]:
import json

with open('C:\\STUFF\\RESEARCH\\TENet\\Source\\ten\\input_config_v1.json') as json_file:
    data = json.load(json_file)
data

{'osn_msgs_dir': 'C:\\STUFF\\RESEARCH\\Brandwatch\\DATA\\MainQuery\\All'}

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


Unnamed: 0,a,b,c,b2,a2
0,1,2,3,4,1
1,4,5,6,25,16
