In [1]:
import matplotlib.pyplot as plt
import os,pickle,tqdm,json
import warnings,json,itertools
from collections import Counter
import datetime as dt
import networkx as nx
import pandas as pd
import numpy as np
from multiprocessing import Pool,cpu_count
warnings.simplefilter('ignore')

def convert_key_tag_top(df):
    
    cols_to_convert = ['keywords','tags','topics']
    
    keywords,tags,topics = [],[],[]
    for i,v in tqdm.tqdm(df.iterrows()):
        if v['keywords']==v['keywords']:
            keywords.append((i,list(json.loads(
                v['keywords']).keys())))
        else:keywords.append((i,[]))
        if v['tags']==v['tags']:
            tags.append((i,list(json.loads(v['tags']))))
        else:tags.append((i,[]))        
        if v['topics']==v['topics']:
            topics.append((i,list(json.loads(v['topics']))))
        else:topics.append((i,[]))

    to_join = []
    for ls_nm,ls_ls in zip(cols_to_convert,
                           [keywords,tags,topics]):
        to_join.append(pd.DataFrame(ls_ls).rename(
            columns={1:ls_nm}).set_index(0))

    df = df.drop(cols_to_convert,axis=1).join(
        pd.concat(to_join,axis=1))
    
    for col in cols_to_convert:
        df[f'n_{col}'] = df.apply(
            (lambda x:len(x[col])),axis=1)
        df[f'1_{col}'] = df.apply(
            (lambda x:int(len(x[col])<=1)),axis=1)
        
    return df

def convert_key_tag_top_novelty(df):
    
    dfs = []
    to_remake = ['keywords','topics','tags']
    for col in to_remake:
        ls = []
        for i,v in tqdm.tqdm(df.iterrows()):
            if v[col]==v[col]:
                ls.append((i,sorted(json.loads(
                    v[col].replace("\'",'\"')))))
            else:ls.append((i,[]))
        df_ = pd.DataFrame(ls).rename(columns={1:col}).set_index(0)
        dfs.append(df_)

    dfs = pd.concat(dfs,axis=1)
    df = df.drop(to_remake,axis=1).join(dfs)
    
    return df

def flatten_list_to_set(ls_of_ls):
    return set([it for sl in ls_of_ls for it in sl])

def plot_weighted_circular(G,weighted=True,
                           plot=True,community=None):
    
    if plot==True:
        plt.figure(figsize=(5,5))
        pos = nx.circular_layout(G)
        nx.draw_networkx_nodes(G,pos,node_size=20)
        
        if weighted:
            
            width = np.array(
                [info[2]['weight'] for\
                 info in G.edges(data=True)])
            
            if max(width)>10:
                width=width/10.0
                
            if community:
                
                edge_list = [
                    i for i,v in same_diff_comm.items() if v==True]
                nx.draw_networkx_edges(
                    G,pos,edge_list,width=width,alpha=0.25)
                
                edge_list = [
                    i for i,v in same_diff_comm.items() if v==False]
                nx.draw_networkx_edges(
                    G,pos,edge_list,edge_color='r',
                    width=width,alpha=0.25)  
                
            else:
                nx.draw_networkx_edges(G,pos,width=width,alpha=0.5)
                
        else:
            
            edge_list = [
                i for i,v in same_diff_comm.items() if v==True]
            nx.draw_networkx_edges(
                G,pos,edge_list,alpha=0.25)
            
            edge_list = [
                i for i,v in same_diff_comm.items() if v==False]
            nx.draw_networkx_edges(
                G,pos,edge_list,
                edge_color='r',alpha=0.25) 
            
        plt.show()
        
def get_pairwise_combo(ls,self_loop=True):
    
    if len(ls)>=1:
        combo = list(itertools.combinations(ls,2))
        combo = list(tuple(sorted(i)) for i in combo)
        
        if self_loop:
            for it in ls:
                combo.append((it,it))
            
    else:
        combo = []
    
    return combo
        
def get_observed_freq(df,col='topics',self_loop=False):
    
    nodes = []
    for sublist in df[col].values:
        for item in sublist:
            nodes.append(item)
    all_edges = get_pairwise_combo(nodes,self_loop)
    
    obs_edges = []
    for i,v in df.iterrows():
        
        if len(v[col])==1:
            combo = get_pairwise_combo(v[col],self_loop)
        else:
            combo = get_pairwise_combo(v[col],False)
            
        if combo:
            for pair in combo:
                pair = tuple(sorted(pair))
                obs_edges.append(pair)
    obs_edges = dict(Counter(obs_edges))

    final_counts = {edge:0 for edge in all_edges}
    for edge,weight in obs_edges.items():
        final_counts[edge] = weight
        
    return final_counts

def get_expected_freq(
    observed_freq,plot=True,self_loop=False):
    
    G = nx.Graph()
    for edge,weight in observed_freq.items():
        if weight>0:
            G.add_edge(edge[0],edge[1],weight=weight)

    if plot:
        print('number of edges:',len(G.edges(data=True)))
        plot_weighted_circular(G) 
        
    try:
        swaps = nx.connected_double_edge_swap(G,10)
    except:
        swaps = nx.double_edge_swap(G)
        
    if plot: 
        print('number of swaps:',swaps)
        print('randomly assign edge weights')
        
    weights = [w for w in list(
        observed_freq.values()) if w>0]
    
    for edge in G.edges(data=True):
        chosen = np.random.choice(weights)
        weights.remove(chosen)
        G[edge[0]][edge[1]]['weight'] = chosen
    
    if plot:
        print('number of edges:',len(G.edges(data=True)))
        plot_weighted_circular(G)

    expected_freq = {tuple(sorted(i)):0 for\
                           i,_ in observed_freq.items()}
    for u,v,w in G.edges(data=True):
        expected_freq[tuple(sorted((u,v)))]=w['weight']
        
    return expected_freq

def uzzi2013(posts,measure_col='topics',interval='m',datetime_col='datetime'):

    # first convert the datetime to the time interval we want
    # then we can look at the subgraph for each time interval
    try:
        posts[datetime_col] = pd.to_datetime(
            posts[datetime_col]).dt.to_period(interval)
    except: pass
    months = sorted(posts[datetime_col].value_counts().keys())
    n_posts_too_few_in_period = 0
    n_no_z_scores = 0
    
    print('\n\n')
    print(f'no. of cores found: {cpu_count()}\n')
    print(f'relying on 1 process')

    # we look at subgraphs by periods
    # here we use month but we can use days
    # depending on the dataset
    for count,month in enumerate(months):
        
        period = posts[posts[datetime_col]==month]
        observed_freq = get_observed_freq(
            period,measure_col,True)
        all_topics = set(
            [it for sl in period[measure_col] for it in sl])

        # if during the time period there is less than 2 post
        # of if during the time period there are less than 5 topics
        # which means that there are too few nodes
        if (len(all_topics)>=4) & (len(period)>=2):

            for idx,val in tqdm.tqdm(
                period.iterrows(),
                desc=f'{count+1}/{len(months)}'):

                z_score = {}
                z_score['observed'] = {}
                combos = get_pairwise_combo(val[measure_col])

                for combo in combos:
                    z_score['observed'][combo] = observed_freq[combo]

                for sim in range(20):
                    expected_freq = get_expected_freq(
                        observed_freq,False,True)

                    z_score[f'expected_{sim}'] = {}
                    for combo in get_pairwise_combo(val[measure_col]):                                            
                        z_score[f'expected_{sim}'][combo] = \
                        expected_freq[combo]

                z_score = pd.DataFrame.from_dict(z_score)   

                obs = z_score['observed']
                exp = z_score.drop('observed',axis=1).mean(axis=1)
                std = z_score.drop('observed',axis=1).std(axis=1)
                z = ((obs-exp)/(std)).values
                z = [val for val in z if val==val]

                if z:
                    posts.loc[idx,'novelty_median'] = np.nanmedian(z)
                    posts.loc[idx,'novelty_tenth'] = np.nanpercentile(z,10)
                else:
                    n_no_z_scores+=1
        else:
            n_posts_too_few_in_period+=1
            
    return posts

def foster2015(posts,measure_col='topics',
               interval='m',detection_method=None,
               datetime_col='datetime'):
    try:
        posts[datetime_col] = pd.to_datetime(
            posts[datetime_col]).dt.to_period(interval)
    except:pass
    months = sorted(posts[datetime_col].value_counts().keys())
    n_posts_too_few_in_period = 0
    n_no_z_scores = 0

    # we look at subgraphs by periods
    # here we use month but we can use days
    # depending on the dataset
    for count,month in enumerate(months):
        
        period = posts[posts[datetime_col]==month]
        observed_freq = get_observed_freq(
            period,measure_col,True)
        all_topics = set(
            [it for sl in period[measure_col] for it in sl])
        G = nx.Graph()

        for (u,v),w in observed_freq.items():
            G.add_edge(u,v,weight=w)

        if detection_method=='louvian':
            communities = nx.algorithms.community.louvain_communities(
                G,weight='weight')
        else:
            communities = nx.algorithms.community.greedy_modularity_communities(
                G,weight='weight')
        communities = {it:n for n,ls in enumerate(communities) for it in ls}
        same_diff_comm = {tuple(sorted((u,v))):\
                          communities[u]==communities[v] for u,v in G.edges()}

        if (len(all_topics)>=4) & (len(period)>=2):

            for idx,val in tqdm.tqdm(
                period.iterrows(),
                desc=f'{count+1}/{len(months)}'):
                combos = get_pairwise_combo(val[measure_col])
                counts = len(combos)
                # note that True means same community False means diff community
                scores = [0 if same_diff_comm[combo] else 1 for combo in combos]
                novelty = np.sum(scores)/counts
                posts.loc[idx,'novelty_foster'] = novelty
                
    return posts

def lee2015(posts,measure_col='topics',
            interval='m',
            datetime_col='datetime'):
    
    try:
        posts[datetime_col] = pd.to_datetime(
            posts[datetime_col]).dt.to_period(interval)
    except:pass
    
    months = sorted(posts[datetime_col].value_counts().keys())
    n_posts_too_few_in_period = 0
    n_no_z_scores = 0

    # we look at subgraphs by periods
    # here we use month but we can use days
    # depending on the dataset
    for count,month in enumerate(months):
        
        period = posts[posts[datetime_col]==month]
        observed_freq = get_observed_freq(
            period,measure_col,True); print('get observed freq...')
        all_topics = set(
            [it for sl in period[measure_col] for it in sl])
        Nt = len(observed_freq); print('get Nt...')
        Nit = len(all_topics)+1; print('get Nit...')
        Njt = len(all_topics)+1; print('get Njt...')
        denominator = Nit*Njt; print('get denominator...\n')

        for idx,val in tqdm.tqdm(period.iterrows(),
                                 desc=f'{count+1}/{len(months)}'):

            combos = get_pairwise_combo(val[measure_col])
            commonness = []
            for combo in combos:
                numerator = observed_freq[combo]*Nt
                commonness.append(numerator/denominator)  

            commonness = [c for c in commonness if c>0]
            commonness = np.percentile(commonness,10)
            posts.loc[idx,'commonness_lee'] = commonness

    return posts

def _uzzi2013_parallel_1_period(item):
    
    count,period,measure_col = item
    observed_freq = get_observed_freq(
        period,measure_col,True)
    all_topics = set(
        [it for sl in period[measure_col] for it in sl])

    # if during the time period there is less than 2 post
    # of if during the time period there are less than 5 topics
    # which means that there are too few nodes
    
    if (len(all_topics)>=4) & (len(period)>=2):

        for idx,val in tqdm.tqdm(period.iterrows()):

            z_score = {}
            z_score['observed'] = {}
            combos = get_pairwise_combo(val[measure_col])

            for combo in combos:
                z_score['observed'][combo] = observed_freq[combo]

            for sim in range(20):
                expected_freq = get_expected_freq(
                    observed_freq,False,True)

                z_score[f'expected_{sim}'] = {}
                for combo in get_pairwise_combo(val[measure_col]):                                            
                    z_score[f'expected_{sim}'][combo] = \
                    expected_freq[combo]

            z_score = pd.DataFrame.from_dict(z_score)   
            obs = z_score['observed']
            exp = z_score.drop('observed',axis=1).mean(axis=1)
            std = z_score.drop('observed',axis=1).std(axis=1)
            z = ((obs-exp)/(std)).values
            z = [val for val in z if val==val]

            if z:
                period.loc[idx,'novelty_median'] = np.nanmedian(z)
                period.loc[idx,'novelty_tenth'] = np.nanpercentile(z,10)
            else:
                period.loc[idx,'novelty_median'] = np.NaN
                period.loc[idx,'novelty_tenth'] = np.NaN
    
    else:
        period.loc[:,'novelty_median'] = np.NaN
        period.loc[:,'novelty_tenth'] = np.NaN
        
    return period

def uzzi2013_parallel_1(
    posts,measure_col='topics',interval='m',
    datetime_col='datetime'):
    
    # first convert the datetime to the time interval we want
    # then we can look at the subgraph for each time interval
    try:
        posts[datetime_col] = pd.to_datetime(
            posts[datetime_col]).dt.to_period(interval)
    except: pass

    months = sorted(posts[datetime_col].value_counts().keys())
    n_posts_too_few_in_period = 0
    n_no_z_scores = 0
    iteration = [(
        count,posts[posts[datetime_col]==month],
        measure_col) for count,month in enumerate(months)]
    
    print('\n\n')
    print(f'no. of cores found: {cpu_count()}\n')
    n_processes = min(len(iteration),cpu_count())
    print(f'relying on {n_processes} processes')
    pool = Pool(processes=n_processes)
    results = list(tqdm.tqdm(pool.map(
        _uzzi2013_parallel_1_period,iteration)))
    pool.close()
    pool.join()
    
    return pd.concat(results)

def _uzzi2013_parallel_2_row(item):
    
    observed_freq,measure_col,idx,val = item
    
    z_score = {}
    z_score['observed'] = {}
    combos = get_pairwise_combo(val[measure_col])

    for combo in combos:
        z_score['observed'][combo] = observed_freq[combo]

    for sim in range(20):
        expected_freq = get_expected_freq(
            observed_freq,False,True)

        z_score[f'expected_{sim}'] = {}
        for combo in get_pairwise_combo(val[measure_col]):                                            
            z_score[f'expected_{sim}'][combo] = \
            expected_freq[combo]

    z_score = pd.DataFrame.from_dict(z_score)   

    obs = z_score['observed']
    exp = z_score.drop('observed',axis=1).mean(axis=1)
    std = z_score.drop('observed',axis=1).std(axis=1)
    z = ((obs-exp)/(std)).values
    z = [val for val in z if val==val]

    if z:
        return (idx,np.nanmedian(z),np.nanpercentile(z,10))
    else:
        return (idx,np.NaN,np.NaN)
    
def uzzi2013_parallel_2(
    posts,measure_col='topics',interval='m',
    datetime_col='datetime'):

    # first convert the datetime to the time interval we want
    # then we can look at the subgraph for each time interval
    try:
        posts[datetime_col] = pd.to_datetime(
            posts[datetime_col]).dt.to_period(interval)
    except: pass
    
    months = sorted(posts[datetime_col].value_counts().keys())
    z_scores = []

    # we look at subgraphs by periods
    # here we use month but we can use days
    # depending on the dataset
    for count,month in enumerate(months):
        
        period = posts[posts[datetime_col]==month]
        observed_freq = get_observed_freq(
            period,measure_col,True)
        all_topics = set(
            [it for sl in period[measure_col] for it in sl])

        # if during the time period there is less than 2 post
        # of if during the time period there are less than 5 topics
        # which means that there are too few nodes
        if (len(all_topics)>=4) & (len(period)>=2):

            iteration = [(observed_freq,measure_col,idx,val) for\
                         idx,val in period.iterrows()]
            print('\n\n')
            print(f'no. of cores found: {cpu_count()}\n')
            n_processes = min(len(iteration),cpu_count())
            print(f'relying on {n_processes} processes')
            pool = Pool(processes=n_processes)
            results = list(tqdm.tqdm(pool.map(_uzzi2013_parallel_2_row,iteration,100)))
            pool.close()
            pool.join()
            z_scores+=results
            
    z_scores = pd.DataFrame(z_scores,columns=[
        'index','novelty_median','novelty_tenth'])
            
    return posts.join(z_scores.set_index('index'))

def get_novelty(df,measure_col='topics',interval='m',parallel=1):

    start_time = dt.datetime.now()
    
    if parallel==0:
        df = uzzi2013(
            df,
            measure_col=measure_col,
            interval=interval)
    elif parallel==1:
        df = uzzi2013_parallel_1(
            df,
            measure_col=measure_col,
            interval=interval)
    else:
        df = uzzi2013_parallel_2(
            df,
            measure_col=measure_col,
            interval=interval)
        
    print(f"total processing time:",
          dt.datetime.now() - start_time)
    return df

In [2]:
posts = convert_key_tag_top_novelty(pd.read_csv('./data/xhs/users_clean.csv',
                                                index_col='index'),'crawl_date')

posts_0 = posts.copy()
posts_1 = posts.copy()
posts_2 = posts.copy()

posts_0_res = get_novelty(posts_0,parallel=0)
posts_1_res = get_novelty(posts_1,parallel=1)
posts_2_res = get_novelty(posts_2,parallel=2)

2500it [00:00, 18793.33it/s]
2500it [00:00, 21996.05it/s]
2500it [00:00, 25107.77it/s]





no. of cores found: 52

relying on 1 process


1/6: 19it [00:00, 124.57it/s]
6/6: 2475it [03:03, 13.49it/s]

total processing time: 0:03:06.765137



no. of cores found: 52

relying on 6 processes



19it [00:00, 132.51it/s]
2475it [02:40, 15.41it/s]
100%|██████████| 6/6 [00:00<00:00, 32430.19it/s]

total processing time: 0:02:44.212025



no. of cores found: 52

relying on 19 processes



100%|██████████| 19/19 [00:00<00:00, 106539.81it/s]





no. of cores found: 52

relying on 52 processes


100%|██████████| 2475/2475 [00:00<00:00, 1311713.72it/s]

total processing time: 0:00:35.189069





In [3]:
novelty_cols = ['novelty_median','novelty_tenth']

In [4]:
pd.concat([posts_0_res[novelty_cols],posts_1_res[novelty_cols],posts_2_res[novelty_cols]],axis=1)

Unnamed: 0_level_0,novelty_median,novelty_tenth,novelty_median,novelty_tenth,novelty_median,novelty_tenth
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5aa6791a4eacab6fb05bc81f,4.239885,0.736751,1.019479,0.604474,1.019479,0.604474
5e5a296d0000000001001efa,0.531156,0.531156,0.810701,0.810701,0.810701,0.810701
5bf81a8adb2e604af5301e57,0.674944,0.674944,3.846498,3.846498,3.849591,3.849591
5ebeb442000000000101f543,5.913032,0.046471,3.337863,-0.197189,3.344355,-0.236372
5659bc677c5bb86279284626,1.919322,-0.140343,1.131582,0.421613,1.137764,0.380112
...,...,...,...,...,...,...
59d87fcd51783a781df21578,17.292169,17.292169,15.181288,15.181288,14.038235,14.038235
5ef0c8fc0000000001000fe7,11.133020,11.133020,3.055492,3.055492,21.569256,21.569256
5d2b5955000000001100c778,66.416464,66.416464,18.345776,18.345776,140.698783,140.698783
5b41fa9211be104a1dd6cf77,1.887051,1.887051,-0.002775,-0.002775,0.050157,0.050157


In [5]:
posts_0_res[novelty_cols]

Unnamed: 0_level_0,novelty_median,novelty_tenth
index,Unnamed: 1_level_1,Unnamed: 2_level_1
5aa6791a4eacab6fb05bc81f,4.239885,0.736751
5e5a296d0000000001001efa,0.531156,0.531156
5bf81a8adb2e604af5301e57,0.674944,0.674944
5ebeb442000000000101f543,5.913032,0.046471
5659bc677c5bb86279284626,1.919322,-0.140343
...,...,...
59d87fcd51783a781df21578,17.292169,17.292169
5ef0c8fc0000000001000fe7,11.133020,11.133020
5d2b5955000000001100c778,66.416464,66.416464
5b41fa9211be104a1dd6cf77,1.887051,1.887051


In [6]:
posts_1_res[novelty_cols]

TypeError: 'DataFrame' object is not callable

In [None]:
posts_2_res[novelty_cols]