In [4]:
# %run misc_functions.ipynb
from datetime import datetime
# %run tweepy_tweets_getter.ipynb

In [17]:
# Notebook-wide state shared by the report functions defined in later cells.

# When truthy, get_hits_data appends " LIMIT 10000" to its query.
debug_opt = True
# Id of the map most recently entered at a report prompt; None until a report runs.
map_id = None

# Hits for the "tweets" feature, filled in by run_map_activity_report.
map_tweets = pd.DataFrame()
# Placeholders that are not referenced by the cells visible in this part of the notebook.
map_hashtags = pd.DataFrame()
map_urls = pd.DataFrame()
map_retweets = pd.DataFrame()
# Frame for the feature selected by the latest run_feature_activity_report call.
map_feature_data = pd.DataFrame()
# One cached hits frame per searchable feature; an empty frame means "not fetched yet".
map_features = {feature: pd.DataFrame() for feature in ('hashtags', 'urls', 'retweets')}

In [6]:
#This function returns a dataframe with all nodes of a map after fetching their segment data

def get_node_data(map_id, grouped = True):
    """
    Return every node of a map together with its cluster (and optionally group) labels.

    Three Graphika endpoints are queried for the map -- ``nodes``, ``clusters`` and
    ``groups`` -- and the results are inner-joined on ``cluster_id`` and, when
    ``grouped`` is set, on ``group_id``.

    Relies on ``requests``, ``pd``, ``username`` and ``pswd`` being defined by an
    earlier cell of the notebook.

    Inputs:
        map_id: Map ID for which to fetch node data
        grouped: When True (default) the result is also merged with the group
            names and trimmed to the columns screen_name, node_id, cluster_name,
            group_name, cluster_id. When False the node/cluster merge is
            returned with all of its columns.
    Outputs:
        df0: Dataframe with one row per node that matched a cluster; node_id is
            always cast to str
    Raises:
        requests.HTTPError: if any of the three API calls answers with an error status
    """
    base_url = "https://api.graphika.com/clustermaps/{}".format(map_id)

    def fetch(resource):
        # Fail loudly on 4xx/5xx instead of parsing an error body and crashing
        # later with an unrelated KeyError/TypeError.
        response = requests.get("{}/{}".format(base_url, resource), auth=(username, pswd))
        response.raise_for_status()
        return response.json()

    #This data should also be saved
    node_data = fetch("nodes")
    clusters = fetch("clusters")["clusters"]
    group_data = fetch("groups")

    df_nodes = pd.DataFrame({
        "screen_name": [node["screenname"] for node in node_data],
        "node_id": [node["service_user_id"] for node in node_data],
        "cluster_id": [node["attentive_cluster_id"] for node in node_data],
    })
    df_clusters = pd.DataFrame({
        "cluster_id": [cluster["id"] for cluster in clusters],
        "cluster_name": [cluster["name"] for cluster in clusters],
        "group_id": [cluster["group"] for cluster in clusters],
    })
    # Cluster ids are cast to int so they can be joined with the nodes' cluster ids.
    df_clusters["cluster_id"] = df_clusters["cluster_id"].astype("int")
    # The groups payload is a mapping of group id -> group attributes.
    df_group = pd.DataFrame({
        "group_id": list(group_data),
        "group_name": [group["name"] for group in group_data.values()],
    })

    df0 = pd.merge(df_nodes, df_clusters, on = "cluster_id")

    if grouped:
        df0 = pd.merge(df0, df_group, on = "group_id")
        df0 = df0[["screen_name","node_id","cluster_name","group_name","cluster_id"]]

    # node_id is kept as text so it joins cleanly with the hit data, which is
    # also cast to str in get_hits_data.
    return df0.assign(node_id=df0["node_id"].astype("str"))


def get_screen_names(map_id):
    """
    Return the screen name and service user id of every node in a map.

    Relies on ``requests``, ``pd``, ``username`` and ``pswd`` being defined by an
    earlier cell of the notebook.

    Inputs:
        map_id: Map ID for which to fetch node data
    Outputs:
        df: Dataframe with one row per node and the columns screen_name and id
    Raises:
        requests.HTTPError: if the API answers with an error status
    """
    url = "https://api.graphika.com/clustermaps/%s/nodes" %map_id
    response = requests.get(url,auth=(username, pswd))
    # Surface authentication/lookup failures here rather than as a confusing
    # TypeError while iterating over an error payload.
    response.raise_for_status()
    nodes = response.json()

    return pd.DataFrame({
        'screen_name': [node['screenname'] for node in nodes],
        'id': [node["service_user_id"] for node in nodes],
    })

In [7]:
def get_hits_data(feature, map_id, use_map_dates = True, date_from = False, date_to = False, case = "default"):
    """
    Query the hitcache for one feature of a map and return the hits as a dataframe.

    Relies on ``cur`` (an open database cursor), ``pd`` and the notebook global
    ``debug_opt`` being defined by earlier cells. When ``debug_opt`` is truthy
    the query is capped with ``LIMIT 10000``.

    Inputs:
        feature: Which feature to look at, ie. hashtags, URLs, mentions. It is
            used as the suffix of the ``hits_twitter_<feature>`` table name, so
            only letters, digits and underscores are accepted.
        map_id: Map ID for which to fetch hitcache data (must be an integer or
            a string holding one)
        use_map_dates: Whether or not to use the first and last dates of the data collected in the map
        date_from: Manually entered beginning date (required when use_map_dates is False)
        date_to: Manually entered ending date (required when use_map_dates is False)
        case: "default" returns hit values as stored; "standardize" lower-cases
            them in the query
    Outputs:
        hits_df: Dataframe with the columns message_id, node_id, time,
            hit_value, map_id and hit_type; message_id and node_id are cast to str
    Raises:
        ValueError: for an unknown ``case``, an unsafe ``feature`` or ``map_id``,
            or missing/unsafe dates
    """
    # Local import: the notebook's import cell is not guaranteed to provide ``re``.
    import re

    # Validate everything that ends up inside the SQL text *before* touching the
    # database. Table names cannot be bound as query parameters, so the inputs
    # are restricted to safe shapes instead.
    hit_expressions = {"default": "hit_value", "standardize": "lower(hit_value)"}
    if case not in hit_expressions:
        raise ValueError(
            "case must be one of {}, got {!r}".format(sorted(hit_expressions), case))
    if not re.fullmatch(r"[A-Za-z0-9_]+", str(feature)):
        raise ValueError("feature must be a plain identifier, got {!r}".format(feature))
    try:
        map_id_sql = int(map_id)
    except (TypeError, ValueError):
        raise ValueError("map_id must be an integer, got {!r}".format(map_id))

    if use_map_dates:
        date_from, date_to = get_map_dates(map_id)
    elif not date_from or not date_to:
        raise ValueError("date_from and date_to are required when use_map_dates is False")
    for boundary in (date_from, date_to):
        # The dates are embedded in quoted SQL literals; a quote would break out of them.
        if "'" in str(boundary):
            raise ValueError("dates must not contain quotes, got {!r}".format(boundary))

    limit = ' LIMIT 10000' if debug_opt else ''

    query = (
        "SELECT * FROM "
        "(SELECT message_id, {table}.node_id, hit_time, {hit} as hit, map_nodes.map_id "
        "FROM {table} "
        "join map_nodes on map_nodes.node_id = {table}.node_id) s "
        "where s.map_id = {map_id} "
        "and s.hit_time BETWEEN '{date_from}'::TIMESTAMP AND '{date_to}'::TIMESTAMP{limit};"
    ).format(
        table="hits_twitter_{}".format(feature),
        hit=hit_expressions[case],
        map_id=map_id_sql,
        date_from=date_from,
        date_to=date_to,
        limit=limit,
    )

    print('...Querying database')
    cur.execute(query)
    hits = cur.fetchall()

    print('...Morphing dataframe')
    hits_df = pd.DataFrame(hits, columns=["message_id","node_id", "time", "hit_value", "map_id"])
    # Ids are kept as text so they merge with the node data, whose node_id is
    # cast to str in get_node_data.
    hits_df["node_id"] = hits_df["node_id"].astype("str")
    hits_df["message_id"] = hits_df["message_id"].astype("str")
    hits_df["hit_type"] = feature

    return hits_df


def get_map_dates(map_id):
    """
    Return the first and last dates of the data collected in a map.

    The map's ``date_range`` (read from the Graphika API as a pair of Unix
    timestamps) is rendered in UTC as ``YYYY-MM-DD HH:MM:SS`` strings.

    Relies on ``requests``, ``username`` and ``pswd`` being defined by an
    earlier cell of the notebook.

    Inputs:
        map_id: Map for which to fetch dates
    Outputs:
        earlier_date: the first date of data collected in the map
        later_date: the last date of data collected in the map
    Raises:
        requests.HTTPError: if the API answers with an error status
    """
    # Local import keeps the block self-contained; ``timezone`` is not part of
    # the notebook's import cell.
    from datetime import datetime, timezone

    url = "https://api.graphika.com/clustermaps/{}".format(map_id)
    response = requests.get(url,auth=(username, pswd))
    response.raise_for_status()
    date_range = response.json()["date_range"]

    def as_utc_text(timestamp):
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp;
        # the formatted text is identical.
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    earlier_date = as_utc_text(date_range[0])
    later_date = as_utc_text(date_range[1])
    return earlier_date,later_date

In [8]:
def get_map_hits(feature,map_id):
    """
    Build the combined node/hit table for one feature of a map.

    The map's nodes (get_node_data) are merged with the feature's hits
    (get_hits_data) on the columns the two frames share, and the merged frame
    is then passed through merge_tags, which is defined outside this cell.

    Inputs:
        feature: Which feature to look at, ie. hashtags, URLs, mentions
        map_id: Map ID for which to fetch hitcache data
    Outputs:
        map_hits: Whatever merge_tags returns for the merged node/hit dataframe
    """
    print('...Getting map nodes')
    nodes_df = get_node_data(map_id)

    print('...Getting hits')
    hits_df = get_hits_data(feature,map_id)

    print('...Merging nodes with hits')
    nodes_with_hits = nodes_df.merge(hits_df)

    print('...Merging nodes with tags')
    tagged_hits = merge_tags(nodes_with_hits)

    print('...Done!')
    return tagged_hits

In [9]:
def run_map_activity_report(debug = False):
    """
    Run the interactive Map Activity Report (tweet counts per segment).

    Prompts for a map id, fetches that map's "tweets" hits when they are not
    already cached, asks sort_by_count to aggregate them, displays the
    aggregate and optionally hands it to print_csv.

    Caching: the fetched frame lives in the notebook global ``map_tweets``. The
    (map id, debug) pair it was fetched for is remembered on this function, so
    the cache is only reused when both match. The cache therefore cannot be
    confused by other reports that change the shared ``map_id`` global, nor by
    switching between the capped debug query and the full one.

    Inputs:
        debug: A Boolean to limit the amount of data pulled and processed for debugging purposes
    Outputs:
        None. The aggregated dataframe is displayed (and optionally saved);
        nothing is displayed when the aggregation level entered is invalid.
    """
    global debug_opt, map_id, map_tweets, map_features
    debug_opt = debug

    input_map_id = input(">> Enter map id: ")
    wanted_key = (input_map_id, bool(debug))
    cached_key = getattr(run_map_activity_report, "_tweets_cache_key", None)

    if wanted_key != cached_key or map_tweets.empty:
        print('...Fetching map data')
        # Fetch first and publish afterwards: if the fetch raises, the previous
        # cache and map id stay untouched instead of pointing at a map whose
        # data never arrived.
        fetched = get_map_hits("tweets", input_map_id)
        map_tweets = fetched
        run_map_activity_report._tweets_cache_key = wanted_key
        if input_map_id != map_id:
            # The feature reports treat "same map_id" as "map_features is
            # valid", so their cache has to be cleared when the map changes here.
            map_features = {'hashtags':pd.DataFrame(),'urls':pd.DataFrame(),'retweets':pd.DataFrame()}
            map_id = input_map_id
    else:
        print('...Map data found!')

    result = sort_by_count(map_tweets)
    if result is None:
        # sort_by_count has already told the user the level was not valid.
        return None

    display(result)
    if input('>> Do you want to save this result to a CSV? (y/n) \n') == 'y':
        print_csv(result)

In [10]:
def sort_by_count(df):
    """
    Count messages per segment of the user's choice and sort the counts descending.

    The user is prompted for the aggregation level (tag, group, cluster or
    account); the matching column of ``df`` is grouped and the ``message_id``
    values in each group are counted.

    Inputs:
        df: The input dataframe of nodes and hits data; needs a ``message_id``
            column plus the column belonging to the chosen level
    Outputs:
        activity_counts: One-column dataframe named "<level>_tweet_count",
            indexed by segment and sorted from most to least active. None is
            returned (after printing a notice) when the level entered is not
            recognised or its column is missing from ``df``.
    """
    count_by = input(">> Please enter which to aggregate the data by – tag, group, cluster, or account: ")
    countby_choice = {'group':'group_name','cluster':'cluster_name','account':'screen_name','tag':'tag'}

    try:
        # Count on the frame that was passed in, not on the notebook global.
        activity_counts = df.groupby(countby_choice[count_by])["message_id"].count()
    except KeyError:
        # Raised for an unknown level as well as for a column absent from df.
        print ("**Not a valid level to count tweets by**")
        return None

    activity_counts.name = count_by + "_tweet_count"
    return pd.DataFrame(activity_counts.sort_values(ascending = False))

In [11]:
def run_feature_activity_report(debug = False):
    """
    Run the interactive Feature Activity Report (focused on hits data).

    Prompts for a map id, a feature (hashtags, urls or retweets), whether the
    search is case sensitive and a comma-separated list of search terms. The
    feature's hits are fetched through get_map_hits unless they are already
    cached in the notebook global ``map_features``. Hits whose ``hit_value``
    contains any of the terms are displayed, followed by a per-account count
    of matching hits; each table can be handed to print_csv.

    Notes:
        * Leaving the search terms empty selects every hit of the feature.
        * A hit that matches several terms is reported once.
        * Terms are passed to ``Series.str.contains`` with its default pattern
          handling, so regular-expression metacharacters in a term are
          interpreted as such.
        * NOTE(review): the cache is keyed on the shared ``map_id`` global only;
          data fetched with ``debug=True`` is reused by a later ``debug=False``
          run for the same map.

    Inputs:
        debug: A Boolean to limit the amount of data pulled and processed for debugging purposes
    Outputs:
        None. The matching hits and the per-account counts are displayed and
        optionally saved.
    """
    global debug_opt, map_id, map_features, map_feature_data
    debug_opt = debug

    input_map_id = input(">> Enter map id: ")
    feature_type = input(">> Search for hashtags, urls, or retweets: ")
    # Anything other than an explicit 'n' keeps the search case sensitive.
    case_sensitive = input(">> Is this search case sensitive? (y/n) \n") != 'n'
    raw_terms = input (">> Comma separate search parameters, or hit enter for all: ")
    # "".split(",") yields [""], so blank entries are dropped explicitly; an
    # empty list then means "no filter".
    search_terms = [term.strip() for term in raw_terms.split(",") if term.strip()]

    known_features = ('hashtags', 'urls', 'retweets')
    if feature_type not in known_features:
        print("**Not a valid feature to search: {}**".format(feature_type))
        return None

    if input_map_id != map_id:
        print('...Fetching map data')
        map_id = input_map_id
        map_features = {name: pd.DataFrame() for name in known_features}

    cached_hits = map_features.get(feature_type)
    if cached_hits is None or cached_hits.empty:
        print('...Fetching {} data'.format(feature_type))
        map_features[feature_type] = get_map_hits(feature_type,map_id)

    map_feature_data = map_features[feature_type]
    if map_feature_data.empty:
        print ("**No results for search term entered**")
        return None

    if search_terms:
        matches = None
        for searchterm in search_terms:
            print('...Searching {} for <{}>'.format(feature_type,searchterm))
            # na=False: hits without a value simply do not match.
            term_matches = map_feature_data["hit_value"].str.contains(
                searchterm, case=case_sensitive, na=False).to_numpy(dtype=bool)
            # OR the masks together so a hit matching several terms is kept once.
            matches = term_matches if matches is None else (matches | term_matches)
        feature_results = map_feature_data[matches]
    else:
        print('...No search parameters entered, using all {}'.format(feature_type))
        feature_results = map_feature_data

    if feature_results.empty:
        print ("**No results for search term entered**")
        return None

    user_counts = pd.DataFrame(feature_results.groupby("screen_name")["hit_value"].count())
    user_counts.columns = ["number_of_tweets"]
    user_counts = user_counts.sort_values(by = "number_of_tweets", ascending = False)

    display(feature_results)
    if input('>> Do you want to save the search results to a CSV? (y/n) \n') == 'y':
        print_csv(feature_results)
    display(user_counts)
    if input('>> Do you want to save the above table to a CSV? (y/n) \n') == 'y':
        print_csv(user_counts)
        

In [12]:
# def run_full_report(debug = False):
# #     feature,map_id, q, search_type, date_from = "map_from", date_to = "map_to"
#     global debug_opt
#     debug_opt = debug
#     global map_id
#     global map_features
#     global map_feature_data
#     input_map_id = input(">> Enter map id: ")
#     feature_type = input(">> Search for hashtags, urls, or retweets: ")
#     case_sensitive = input(">> Is this search case sensitive? (y/n) \n")
# #     feature_value = (input (">> Comma separate search parameters, or hit enter for all: "))
    
#     if case_sensitive == 'n':
#         case_sensitive = False
#     else:
#         case_sensitive = True
# #     feature_value = feature_value.split(",")

#     #Case sensitivity option should be available
        
#     if input_map_id != map_id:
#         print('...Fetching map data')
#         map_id = input_map_id
#         map_features = {'hashtags':pd.DataFrame(),'urls':pd.DataFrame(),'retweets':pd.DataFrame()}
#     if map_features[feature_type].empty:
#         print('...Fetching {} data'.format(feature_type))
#         map_features[feature_type] = get_map_hits(feature_type,map_id)
    
#     map_feature_data = map_features[feature_type]
    
# #     hits_df = get_hits_data(feature, map_id, date_from, date_to)
#     hits_df = maps_feature_data

# #     if search_type == "contains":
# #         results_df = hits_df[hits_df.hit_value.str.lower().apply(lambda x: any(term in x for term in q))]

# #     if search_type == "exact":
# #         results_df = hits_df[hits_df.hit_value.str.lower().isin(q)]

# #     nodes = get_node_data(map_id)
# #     nodes.node_id = nodes.node_id.astype(str)    

# #     df = pd.merge(results_df,nodes,left_on="node_id",right_on="node_id")

#     count_by = input(">> Please enter which to aggregate the data by – tag, group, cluster, or account: ")
#     countby_choice = {'group':'group_name','cluster':'cluster_name','account':'screen_name','tag':'tag'}
    
#     feature_count_tweets = df.pivot_table(index="hit_value",columns=countby_choice,aggfunc="count",values="message_id").fillna(0)
#     feature_count_tweets["total"] = feature_count_tweets.sum(axis=1)
#     feature_count_tweets.sort_values(by="total", ascending=False)

#     feature_count_part = df.pivot_table(index="hit_value",columns=countby_choice,aggfunc=pd.Series.nunique,values="node_id").fillna(0)
#     feature_count_part["total"] = feature_count_tweets.sum(axis=1)
#     feature_count_part.sort_values(by="total", ascending=False)

#     df.drop_duplicates(["message_id"], inplace=True)

#     tweet_counts = pd.DataFrame(df.groupby(countby_choice).count()["message_id"])
#     participation_counts = pd.DataFrame(df.groupby(countby_choice).nunique()["screen_name"])
#     users_counts = pd.DataFrame(df.groupby("screen_name").count()["message_id"]\
#     .sort_values(ascending = False)) 


#     if feature != "retweets":
#         retweets_df = get_hits_data("retweets",map_id, date_from, date_to)
#         retweets_df = retweets_df[["message_id","hit_value"]]
#         retweets_df.columns = ["message_id", "retweet_id"]


#         df = pd.merge(results_df,retweets_df,left_on="message_id", right_on="message_id",how = "left")

#         #save hit values for later join

#         hits_retweets = df[["retweet_id","hit_value"]]
#         hits_retweets.drop_duplicates(inplace=True)


#         df2 = pd.merge(df,nodes,left_on="node_id",right_on="node_id")


#         pivot_df = pd.pivot_table(data=df2, columns=countby_choice, values="message_id", aggfunc=pd.Series.nunique, index="retweet_id")\
#             .fillna(0)
#         pivot_df["map_total"] = pivot_df.sum(axis=1)
#         pivot_df.sort_values(by = "map_total", ascending=False, inplace=True)

#         pivot_df = pd.merge(pivot_df,hits_retweets,left_index=True, right_on="retweet_id", how="left")

#         tweets_json = [tweet for tweet in t.hydrate(list(set(pivot_df.index[0:5000])))]


#         all_rows = []
#         for tweet in tweets_json:
#             row_dict = {}
#             row_dict["id_str"] = tweet["id_str"]
#             row_dict["created_at"] = tweet["created_at"]
#             row_dict["full_text"] = tweet["full_text"]
#             row_dict["retweet_count"] = tweet["retweet_count"]
#             row_dict["favorite_count"] = tweet["favorite_count"]
#             row_dict["screen_name"] = tweet["user"]["screen_name"]

#             all_rows.append(row_dict)

#         tweets_df = pd.DataFrame(all_rows)

#         pivot_df.index = pivot_df.index.astype(str)
#         top_tweets_df = pd.merge(pivot_df,tweets_df,left_index=True,right_on="id_str")

#     writer = pd.ExcelWriter("./feature_report_{}.xlsx".format(map_id))


#     feature_count_tweets.to_excel(writer,"feature_count_tweets")
#     feature_count_part.to_excel(writer,"feature_count_partic")
#     tweet_counts.to_excel(writer,"tweet_counts")
#     participation_counts.to_excel(writer, "partic_counts")
#     users_counts.to_excel(writer,"users_counts")
#     top_tweets_df.to_excel(writer,"top_tweets")

#     writer.save()

In [13]:
# run_full_report(debug = True)

In [27]:
def _full_report_select_hits(hits, search_terms, case_sensitive, feature_type):
    """Return the rows of ``hits`` whose hit_value contains any search term (all rows if no terms)."""
    if not search_terms:
        return hits
    matches = None
    for searchterm in search_terms:
        print('...Searching {} for <{}>'.format(feature_type, searchterm))
        # na=False: hits without a value simply do not match.
        term_matches = hits["hit_value"].str.contains(
            searchterm, case=case_sensitive, na=False).to_numpy(dtype=bool)
        matches = term_matches if matches is None else (matches | term_matches)
    return hits[matches]


def _full_report_pivot_with_total(hits, segment_column, values, aggfunc):
    """Pivot hit_value x segment, add a row-wise "total" column and sort by it, largest first."""
    table = hits.pivot_table(
        index="hit_value", columns=segment_column, values=values, aggfunc=aggfunc).fillna(0)
    table["total"] = table.sum(axis=1)
    return table.sort_values(by="total", ascending=False)


def _full_report_top_retweets(hits, retweets, segment_column, max_tweets=5000):
    """
    Build the "top tweets" table: most-retweeted tweets among ``hits``, per segment.

    ``retweets`` is the map's retweet hit table; its hit_value is used as the id
    of the retweeted tweet. Returns None when none of the hits is a retweet.
    Tweet details are loaded with ``t.hydrate`` -- ``t`` is defined outside this
    cell (presumably a Twitter API client; confirm against its defining notebook).
    """
    if retweets.empty:
        return None

    retweet_ids = (
        retweets[["message_id", "hit_value"]]
        .rename(columns={"hit_value": "retweet_id"})
        .drop_duplicates()
    )
    # Inner join keeps only hits that are retweets and avoids the float
    # conversion (and id precision loss) a left join would cause via NaN.
    linked = hits.merge(retweet_ids, on="message_id", how="inner")
    if linked.empty:
        return None
    linked = linked.assign(retweet_id=linked["retweet_id"].astype(str))

    pivot = linked.pivot_table(
        index="retweet_id", columns=segment_column, values="message_id", aggfunc="nunique").fillna(0)
    pivot["map_total"] = pivot.sum(axis=1)
    pivot = pivot.sort_values(by="map_total", ascending=False)

    # Hydrate the retweeted tweet ids themselves, most retweeted first.
    top_ids = pivot.index[:max_tweets].tolist()
    tweet_columns = ["id_str", "created_at", "full_text", "retweet_count", "favorite_count", "screen_name"]
    tweet_rows = [
        {
            "id_str": tweet["id_str"],
            "created_at": tweet["created_at"],
            "full_text": tweet["full_text"],
            "retweet_count": tweet["retweet_count"],
            "favorite_count": tweet["favorite_count"],
            "screen_name": tweet["user"]["screen_name"],
        }
        for tweet in t.hydrate(top_ids)
    ]
    # Explicit columns keep the merge below valid even when nothing was hydrated.
    tweets_df = pd.DataFrame(tweet_rows, columns=tweet_columns)

    # Re-attach the hit values (saved per retweet id) to the per-segment counts.
    hits_by_retweet = linked[["retweet_id", "hit_value"]].drop_duplicates()
    counts_with_hits = pivot.reset_index().merge(hits_by_retweet, on="retweet_id", how="left")
    return counts_with_hits.merge(tweets_df, left_on="retweet_id", right_on="id_str")


def run_full_activity_report(debug = False):
    """
    Run the interactive full activity report and write it to an Excel workbook.

    Prompts for a map id, a feature (hashtags, urls or retweets), case
    sensitivity, search terms and the segment to aggregate by (tag or group).
    The feature's hits -- and the map's retweets, which the top-tweets sheet
    needs -- are fetched through get_map_hits unless cached in the notebook
    global ``map_features``. Hits are filtered by the search terms (all hits
    when none are entered) and summarised into
    ``./feature_report_<map_id>.xlsx`` with the sheets:

        feature_count_tweets  messages per hit value and segment, plus total
        feature_count_partic  distinct nodes per hit value and segment, plus total
        tweet_counts          distinct messages per segment
        partic_counts         distinct screen names per segment
        users_counts          distinct messages per screen name
        top_tweets            most-retweeted tweets per segment (omitted when the
                              feature is retweets or no hit is a retweet)

    Notes:
        * Terms are passed to ``Series.str.contains`` with its default pattern
          handling, so regular-expression metacharacters are interpreted.
        * NOTE(review): the cache is keyed on the shared ``map_id`` global only;
          data fetched with ``debug=True`` is reused by a later ``debug=False``
          run for the same map.

    Inputs:
        debug: A Boolean to limit the amount of data pulled and processed for debugging purposes
    Outputs:
        None. The report is written to disk; nothing is written when an
        unknown feature or segment is entered or no hit matches.
    """
    global debug_opt, map_id, map_features
    debug_opt = debug

    input_map_id = input(">> Enter map id: ")
    feature_type = input(">> Search for hashtags, urls, or retweets: ")
    # Anything other than an explicit 'n' keeps the search case sensitive.
    case_sensitive = input(">> Is this search case sensitive? (y/n) \n") != 'n'
    raw_terms = input (">> Comma separate search parameters, or hit enter for all: ")
    search_terms = [term.strip() for term in raw_terms.split(",") if term.strip()]

    known_features = ('hashtags', 'urls', 'retweets')
    if feature_type not in known_features:
        print("**Not a valid feature to search: {}**".format(feature_type))
        return None

    if input_map_id != map_id:
        print('...Fetching map data')
        map_id = input_map_id
        map_features = {name: pd.DataFrame() for name in known_features}

    # The chosen feature and the retweets are cached independently; dict.fromkeys
    # de-duplicates the pair when the chosen feature *is* retweets.
    for needed in dict.fromkeys((feature_type, 'retweets')):
        cached_hits = map_features.get(needed)
        if cached_hits is None or cached_hits.empty:
            print('...Fetching {} data'.format(needed))
            map_features[needed] = get_map_hits(needed, map_id)

    all_hits = map_features[feature_type].reset_index(drop=True)
    if all_hits.empty:
        print ("**No results for search term entered**")
        return None
    hits = _full_report_select_hits(all_hits, search_terms, case_sensitive, feature_type)
    if hits.empty:
        print ("**No results for search term entered**")
        return None

    choice = input(">> Please enter which to aggregate the data by – tag or group: ")
    choices_dict = {'group':'group_name','tag':'tag'}
    groupby_name = choices_dict.get(choice)
    if groupby_name is None or groupby_name not in hits.columns:
        print("**Not a valid level to aggregate by**")
        return None

    feature_count_tweets = _full_report_pivot_with_total(hits, groupby_name, "message_id", "count")
    feature_count_part = _full_report_pivot_with_total(hits, groupby_name, "node_id", "nunique")

    # Per-segment and per-account counts look at each message once. A copy is
    # de-duplicated so the hit table used for the retweet sheet stays complete.
    unique_messages = hits.drop_duplicates(["message_id"])
    tweet_counts = pd.DataFrame(unique_messages.groupby(groupby_name)["message_id"].count())
    participation_counts = pd.DataFrame(unique_messages.groupby(groupby_name)["screen_name"].nunique())
    users_counts = pd.DataFrame(
        unique_messages.groupby("screen_name")["message_id"].count().sort_values(ascending = False))

    sheets = [
        ("feature_count_tweets", feature_count_tweets),
        ("feature_count_partic", feature_count_part),
        ("tweet_counts", tweet_counts),
        ("partic_counts", participation_counts),
        ("users_counts", users_counts),
    ]
    if feature_type != "retweets":
        top_tweets_df = _full_report_top_retweets(hits, map_features['retweets'], groupby_name)
        if top_tweets_df is not None:
            sheets.append(("top_tweets", top_tweets_df))

    # The context manager saves and closes the workbook, also on error.
    with pd.ExcelWriter("./feature_report_{}.xlsx".format(map_id)) as writer:
        for sheet_name, table in sheets:
            table.to_excel(writer, sheet_name=sheet_name)