In [10]:
import pandas as pd
%run ./connect_to_api.ipynb

In [14]:
def get_hit(hit_type, hit_value, how_many = 10,sort_by = 'id',get_clusters = True):
    """ 
    Fetch rows of a hits_twitter_<hit_type> table that match a hit value,
    joined with map_nodes on node_id.
  
    Parameters: 
    hit_type (str): Suffix of the hits_twitter_<hit_type> table to query
        (e.g. 'urls'). Only letters, digits and underscores are accepted.
    hit_value (str): Value compared against the table's hit_value column
    how_many (int): LIMIT applied to the query, i.e. before duplicates are dropped
    sort_by (str): Despite the name, no sorting happens: this is the column on
        which duplicate rows are dropped (the first occurrence is kept)
    get_clusters (bool): When true, add a cluster_name column obtained from
        clustername_lookup
    
    Returns: 
    hits_df (DataFrame): Indexed by hit_value, with columns node_id and
        message_id (both as str), hit_time, map_id, cluster_id,
        cluster_name (only when get_clusters is true) and hit_type
  
    Raises:
    ValueError: If hit_type is not a plain identifier-like string, or how_many
        cannot be converted to an int
  
    """

    # hit_type becomes part of a table name, which cannot be quoted as a value,
    # so refuse anything that is not identifier-like.
    if not isinstance(hit_type, str) or not hit_type.replace("_", "").isalnum():
        raise ValueError(
            "hit_type must contain only letters, digits and underscores: {!r}".format(hit_type)
        )

    table = "hits_twitter_{}".format(hit_type)
    # Double embedded single quotes so the value cannot terminate its SQL literal.
    # NOTE(review): `cur` comes from outside this cell (presumably the %run of
    # connect_to_api.ipynb); once its DB-API paramstyle is confirmed, prefer
    # binding hit_value and how_many as query parameters instead.
    quoted_value = str(hit_value).replace("'", "''")
    row_limit = int(how_many)

    query = (
        "SELECT * FROM ( "
        "SELECT * FROM {table} "
        "JOIN map_nodes ON map_nodes.node_id = {table}.node_id "
        "WHERE {table}.hit_value = '{value}' "
        ") s "
        "LIMIT {limit}"
    ).format(table=table, value=quoted_value, limit=row_limit)
    
    cur.execute(query)
    hits = cur.fetchall()
    # Column names are assigned positionally to the joined result: the hit
    # table's columns first, then the map_nodes columns.
    hits_df = pd.DataFrame(hits, columns=["id","hit_value", "node_id", "message_id","hit_time","params","id_2","node_id2","map_id","cluster_id"])
    
    hits_df["node_id"] = hits_df["node_id"].astype("str")
    hits_df["message_id"] = hits_df["message_id"].astype("str")
    hits_df["hit_type"] = hit_type
    hits_df.drop_duplicates([sort_by],inplace = True)
    
    if get_clusters:
        # clustername_lookup makes one HTTP request per call; rows sharing a
        # (map_id, cluster_id) pair reuse the first answer instead of re-asking.
        name_cache = {}

        def _cached_cluster_name(row):
            key = (row['map_id'], row['cluster_id'])
            if key not in name_cache:
                name_cache[key] = clustername_lookup(row)
            return name_cache[key]

        clustername_values = hits_df.apply(_cached_cluster_name,axis = 1,result_type = 'reduce')
        # Place cluster_name just before the trailing hit_type column.
        hits_df.insert(loc = hits_df.shape[1]-1,column = 'cluster_name', value = clustername_values)
    
    hits_df.drop(columns = ['node_id2','id_2','id','params'],inplace = True)
    
    return hits_df.set_index(['hit_value'])

In [3]:
# hit_type = input('Hit type: ')
# hit_value = input('Hit value: ')

In [4]:
def clustername_lookup(rcvd_series):
    """
    Look up a cluster's name from the api.graphika.com maps endpoint.

    Parameters:
    rcvd_series (Series or mapping): Row providing 'map_id' and 'cluster_id'

    Returns:
    str: The 'name' field of the JSON response, or 'Map access needed' when
        the response body is not JSON or carries no such field
    """
    map_id = rcvd_series['map_id']
    cluster_id = rcvd_series['cluster_id']
    url = 'https://api.graphika.com/maps/{}/clusters/{}'.format(map_id,cluster_id)
    # `requests`, `username` and `pswd` are not defined in this cell; presumably
    # they come from the %run of connect_to_api.ipynb — TODO confirm.
    r = requests.get(url, auth = (username,pswd))
    try:
        return r.json()['name']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError: no 'name' key; TypeError: the
        # JSON is not an object. Anything else (notably KeyboardInterrupt) is
        # allowed to propagate so a long per-row lookup can still be interrupted.
        return 'Map access needed'

In [151]:
# hits_to_check_dict = {'urls':['theringer.com'],'retweets':['1227561237782855680'],'hashtags':['MAGA'],'mentions':['1343351467']}

# results_dict = {'urls':{},'retweets':{},'hashtags':{},'mentions':{}}


In [152]:
# for hit_type in hits_to_check_dict.keys():
#     for hit_value in hits_to_check_dict[hit_type]:
# #         print('{}: {}'.format(hit_type,hit_value))
#         results_dict[hit_type][hit_value] = (get_hit(hit_type,hit_value,50,'map_id'))
    

In [5]:
# results_dict

In [6]:
# # group influence: total map outdegs (excluding in group connections)
# # 4:07
# # group density: group connections/(n-1)^2
# # 4:07
# # analyst_tool.segment_stat (map_id = 100)

# node-level
#     get maps
#     get clusters
    
# map-level
#     group/cluster level
#         get influence
#         get connections
#             in
#             out


In [36]:
# Run the query once and reuse the frame; calling get_hit inside the loop
# repeated the identical database query twice for every column.
usatoday_hits = get_hit('urls', 'www.usatoday.com',500,get_clusters = False)

# For each column of interest store [number of distinct values, per-value counts].
hitcounts_dict = {
    k: [usatoday_hits[k].nunique(), usatoday_hits[k].value_counts()]
    for k in ('node_id', 'message_id', 'map_id', 'hit_time')
}


In [32]:
# Count the distinct node IDs in `temp` (`temp` is not defined in the visible cells).
temp["node_id"].nunique()

39

In [37]:
# Display the collected summary: each key maps to a list holding the distinct-value
# count followed by the value_counts() Series for that column.
hitcounts_dict

{'node_id': [39, 792173743              3
  51141130               2
  812091868272017409     2
  1031917907364605953    2
  15754281               2
  3040234038             2
  1969343886             2
  1034868411354861568    1
  263504949              1
  24016767               1
  347271473              1
  142009437              1
  789088080653201409     1
  33278902               1
  1004784664748359680    1
  2828496562             1
  3103213542             1
  166802501              1
  250742408              1
  887331727021981701     1
  1884977676             1
  30217192               1
  802352830036250624     1
  223539098              1
  123943277              1
  964644965950377987     1
  980534370539921408     1
  320909953              1
  259801469              1
  765883014              1
  2352600679             1
  1120562007994970115    1
  14859559               1
  2458812056             1
  20733757               1
  18249616               1
  2594232213 

In [38]:
# Report the distinct-value count (the first list entry) recorded for each column.
for k, recorded in hitcounts_dict.items():
    print('For the given hit value, there are {} unique {}s'.format(recorded[0], k))

For the given hit value, there are 39 unique node_ids
For the given hit value, there are 47 unique message_ids
For the given hit value, there are 29 unique map_ids
For the given hit value, there are 47 unique hit_times


In [62]:
from datetime import datetime

# Format the first hit time as day-month-year plus the hour followed by a literal "00".
first_hit_time = temp.hit_time.iloc[0]
datetime.strftime(first_hit_time, '%d-%b-%Y %H00')

'11-Feb-2020 2300'

In [58]:
# Show the first hit_time value of `temp` (`temp` is not defined in the visible cells).
temp["hit_time"].iloc[0]

Timestamp('2020-02-11 23:35:00')