In [8]:
import pandas as pd
from datetime import datetime,timedelta
# %run ./connect_to_api.ipynb

In [3]:
def write_query(table,value,limit = 10,col = 'hit_value'):
    """
    This function writes a query for the hitcache based on given inputs
    Inputs:
        table: Suffix of the hits table to read (the query targets
            hits_twitter_{table}), ie. replies, hashtags, etc
        value: Value to match in `col`; it is embedded as a quoted SQL string
        limit: Maximum number of rows to return. A falsy value (False, None,
            0) leaves the LIMIT clause out entirely
        col: Column of the hits table that is compared against `value`
    Outputs:
        q: Written query
    Raises:
        ValueError: If `table` or `col` is not a plain identifier, or if
            `limit` is negative or cannot be converted to an integer
    """
    import re  # local import so this cell does not depend on the import cell

    # table and col are spliced into the statement as identifiers and cannot
    # be quoted like values, so only plain names are let through.
    for label, identifier in (('table', table), ('col', col)):
        if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', str(identifier)):
            raise ValueError(
                '{} must be a plain SQL identifier, got {!r}'.format(label, identifier))

    # Double embedded single quotes so the value cannot close its own literal.
    # NOTE(review): passing the value as a bound parameter to cursor.execute
    # would be more robust, but this function's contract is to return a string.
    value = str(value).replace("'", "''")

    if not limit:
        limit = ''
    else:
        row_cap = int(limit)  # also rejects non-numeric text
        if row_cap < 0:
            raise ValueError('limit must not be negative, got {!r}'.format(limit))
        limit = 'LIMIT ' + str(row_cap)
    q = "SELECT * FROM \
    \
    ( \
    SELECT \
    * \
    FROM \
    hits_twitter_{0} \
    JOIN \
    map_nodes on map_nodes.node_id = hits_twitter_{0}.node_id \
    WHERE \
    hits_twitter_{0}.{3} = '{1}' \
    ) s \
    \
    {2};;".format(table, value, limit, col)
    return q

In [4]:
def get_hit(hit_type, hit_value, how_many = 10,sort_by = 'id',get_groups = True,get_clusters = True,get_nodenames = True):
    """
    This function gets hits by accessing the hitcache
    Inputs:
        hit_type: The type of hit to get. 'nodes' is answered from the tweets
            table by matching hit_value against node_id
        hit_value: The value of the hit
        how_many: How many hits to retrieve (passed to write_query as limit)
        sort_by: Column used to drop duplicate rows. Despite the name, the
            result is not sorted
        get_groups: Whether to append group names or not
        get_clusters: Whether to append cluster names or not
        get_nodenames: Whether to append node names or not
    Outputs:
        hits_df: Returned dataframe of hits with associated data, indexed by
            hit_value
    Notes:
        Expects a cursor object named `cur` (with execute/fetchall) to already
        exist in the notebook namespace. Each enabled name lookup is applied
        row by row.
    """
    if hit_type == 'nodes':
        # Node lookups read the tweets table, so the hit type is relabelled to
        # match the table queried. The original line used `==` here, which
        # compared and discarded the result instead of assigning.
        hit_type = 'tweets'
        query = write_query(hit_type,hit_value,how_many,'node_id')
    else:
        query = write_query(hit_type,hit_value,how_many)

    cur.execute(query)
    hits = cur.fetchall()
    hits_df = pd.DataFrame(hits, columns=["id","hit_value", "node_id", "message_id","hit_time","params","id_2","node_id2","map_id","cluster_id"])

    hits_df["node_id"] = hits_df["node_id"].astype("str")
    hits_df["message_id"] = hits_df["message_id"].astype("str")
    hits_df["hit_type"] = hit_type
    # De-duplicate before the bookkeeping columns (which may include sort_by)
    # are removed.
    hits_df = hits_df.drop_duplicates(subset = [sort_by])
    hits_df = hits_df.drop(columns = ['node_id2','id_2','id','params'])

    def add_row_derived_column(name, row_func):
        # Compute one value per row and insert it just before the last column,
        # so hit_type (added last above) stays the final column.
        values = hits_df.apply(row_func,axis = 1,result_type = 'reduce')
        hits_df.insert(loc = hits_df.shape[1]-1,column = name, value = values)

    add_row_derived_column('date', date_extractor)

    if get_groups:
        add_row_derived_column('group_name', groupname_lookup)

    if get_clusters:
        add_row_derived_column('cluster_name', clustername_lookup)

    if get_nodenames:
        add_row_derived_column('node_name', nodename_lookup)

    return hits_df.set_index(['hit_value'])

In [5]:
def clustername_lookup(rcvd_series):
    """
    This function returns the cluster name for a single hit row
    Inputs:
        rcvd_series: One row (Series or other mapping) holding 'map_id' and
            the cluster number under either 'cluster_no' or 'cluster_id'
    Outputs:
        r.json()['name']: Cluster name, or '**Map access needed**' when the
            response body cannot be decoded or carries no 'name'
    """
    map_id = rcvd_series['map_id']
    # get_hit labels this column 'cluster_id'; reading only 'cluster_no' raised
    # KeyError on those rows. 'cluster_no' is still preferred when present.
    cluster_key = 'cluster_no' if 'cluster_no' in rcvd_series else 'cluster_id'
    cluster_id = rcvd_series[cluster_key]
    url = 'https://api.graphika.com/maps/{}/clusters/{}'.format(map_id,cluster_id)
    r = requests.get(url, auth = (username,pswd))
    try:
        return r.json()['name']
    except Exception:
        # Best-effort fallback, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare except does.
        return '**Map access needed**'

In [6]:
def nodename_lookup(rcvd_series):
    """
    This function returns the node name for a single hit row
    Inputs:
        rcvd_series: One row (Series or other mapping) holding 'map_id' and
            'node_id'
    Outputs:
        r.json()['name']: Node name, or '**Map access needed**' when the
            response body cannot be decoded or carries no 'name'
    """
    map_id = rcvd_series['map_id']
    node_id = rcvd_series['node_id']
    url = 'https://api.graphika.com/maps/{}/nodes/{}'.format(map_id,node_id)
    r = requests.get(url, auth = (username,pswd))
    try:
        return r.json()['name']
    except Exception:
        # Best-effort fallback, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare except does.
        return '**Map access needed**'

In [7]:
def groupname_lookup(rcvd_series):
    """
    This function returns the group name for a single hit row
    Inputs:
        rcvd_series: One row (Series or other mapping) holding 'map_id' and
            the cluster number under either 'cluster_no' or 'cluster_id'
    Outputs:
        r.json()['name']: Group name, or '**Map access needed**' when the
            cluster response has no 'group_no' or the group request/response
            fails
    """
    map_id = rcvd_series['map_id']
    # get_hit labels this column 'cluster_id'; reading only 'cluster_no' raised
    # KeyError on those rows. 'cluster_no' is still preferred when present.
    cluster_key = 'cluster_no' if 'cluster_no' in rcvd_series else 'cluster_id'
    cluster_id = rcvd_series[cluster_key]
    url = 'https://api.graphika.com/maps/{}/clusters/{}'.format(map_id,cluster_id)
    r = requests.get(url, auth = (username,pswd))
    try:
        # The cluster record names its parent group; a second request turns
        # that group number into a display name.
        group_id = r.json()['group_no']
        url = 'https://api.graphika.com/maps/{}/groups/{}'.format(map_id,group_id)
        r = requests.get(url, auth = (username,pswd))
        return r.json()['name']
    except Exception:
        # Best-effort fallback, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare except does.
        return '**Map access needed**'

In [8]:
def date_extractor(rcvd_series):
    """
    This function formats a row's hit time as an hour-level label
    Inputs:
        rcvd_series: A row exposing a datetime as its hit_time attribute
    Outputs:
        Text of the form 'DD-Mon-YYYY HH00': day, abbreviated month name,
        year, then the hour followed by a literal '00'
    """
    hour_label_format = '%d-%b-%Y %H00'
    hit_moment = rcvd_series.hit_time
    return datetime.strftime(hit_moment, hour_label_format)

In [11]:
def get_summary(result):
    """
    This function summarises a hit dataframe and prints the unique counts
    Inputs:
        result: Resulting dataframe from a hit search; it must contain the
            node_id, message_id, map_id, hit_time and date columns
    Outputs:
        hitcounts_dict: Dict keyed by column name; each entry is a two-item
            list of [number of unique values, value_counts() of the column]
    """
    summarised_columns = ('node_id', 'message_id', 'map_id', 'hit_time', 'date')

    hitcounts_dict = {}
    for column in summarised_columns:
        column_values = result[column]
        hitcounts_dict[column] = [column_values.nunique(), column_values.value_counts()]

    print('For the given hit value, there are:')
    for column, (unique_total, _counts) in hitcounts_dict.items():
        print('---> {} unique {}s'.format(unique_total, column))

    return hitcounts_dict


In [119]:
def get_segment_data(level,get_density = True,map_id = 2232,group_id = 0,cluster_id = 1):
    """
    This function returns segment data on a group/cluster/map level
    Inputs:
        level: The level on which to get data ('group', 'cluster' or 'map')
        get_density: Whether or not to run the group density notebook and
            call get_group_density for the map. The density result is not
            part of the return value
        map_id: Map to query (defaults to the previously hard-coded 2232)
        group_id: Group to query when level is 'group' (defaults to 0)
        cluster_id: Cluster to query when level is 'cluster' (defaults to 1)
    Outputs:
        r.json()['num_nodes']: Number of nodes in a segment, or
            '**Map access needed**' when the response body cannot be decoded
            or carries no 'num_nodes'
    Raises:
        ValueError: If level is not one of 'map', 'group' or 'cluster'
    """
    # Resolve the endpoint first so an unsupported level fails with a clear
    # message instead of an unbound `url` further down.
    if level == 'map':
        url = 'https://api.graphika.com/maps/{}'.format(map_id)
    elif level == 'group':
        # Plural 'groups', matching the endpoint groupname_lookup uses.
        url = 'https://api.graphika.com/maps/{}/groups/{}'.format(map_id,group_id)
    elif level == 'cluster':
        url = 'https://api.graphika.com/maps/{}/clusters/{}'.format(map_id,cluster_id)
    else:
        raise ValueError("level must be 'map', 'group' or 'cluster', got {!r}".format(level))

    if get_density:
        # Plain-Python spelling of the `%run group_density_getter.ipynb` line
        # magic, so the cell parses outside IPython's input transformer.
        # Presumably that notebook defines get_group_density - confirm.
        get_ipython().run_line_magic('run', 'group_density_getter.ipynb')
        get_group_density(map_id)

    r = requests.get(url, auth = (username,pswd))
    try:
        return r.json()['num_nodes']
    except Exception:
        # Best-effort fallback, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare except does.
        return '**Map access needed**'