In [1]:
import json
import os
import re
import warnings
from collections import Counter
from itertools import combinations

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from scipy import stats

from utils import convert_key_tag_top

# NOTE(review): later cells also reference `sns`, `nx` and `nx_comm`, which are
# not imported anywhere in the visible notebook - presumably seaborn, networkx
# and networkx.algorithms.community; confirm and import them here.
warnings.simplefilter("ignore")

ModuleNotFoundError: No module named 'scipy'

# Import and clean user dataframe

In [None]:
# Load the raw creator table, recode the tier labels as integers, keep the
# analysis columns under standardised names, and persist a cleaned CSV.
# The label mapping is applied frame-wide by DataFrame.replace; presumably it
# targets the `kol_level` column - confirm.
tier_codes = {'腰部':1, '超头部':4, '头部':3, '尾部':0, '肩部':2}
column_aliases = {'insert_time':'post_date',
                  'Creator_Type':'topics',
                  'collect':'Collect_Count',
                  'Creator_Like_Count':'Like_Count',
                  'Creator_Post_Counts':'Post_Count',
                  'Creator_Fan_Count':'Fan_Count'}
kept_columns = ['post_date','Crawl_Date',
                'Gender','Post_Count','Fan_Count',
                'Like_Count','Collect_Count',
                'topics','tags','keywords',
                'kol_level']

users = pd.read_excel('./data/xhs/users.xlsx',
                      index_col='Creator_ID',
                      parse_dates=['Crawl_Date','insert_time'])
users = users.replace(tier_codes).rename(columns=column_aliases)[kept_columns]
users = convert_key_tag_top(users)
users.index.name = 'index'
# Lower-case every column label.
users = users.rename(columns=str.lower)
users.to_csv('./data/xhs/users_clean.csv')
print('dumped to: ./data/xhs/users_clean.csv')

#### Plot some distributions 

In [None]:
# Histogram (log-scaled y axis) of every `users` column whose name contains
# 'count', one panel per column.
# The grid is sized from the matching columns instead of a hard-coded 4 rows,
# so a different number of count columns no longer raises IndexError or
# leaves empty panels. squeeze=False keeps `axes` 2-D even for one panel.
count_cols = [c for c in users.columns if 'count' in c]
f,axes = plt.subplots(max(1,len(count_cols)),1,figsize=(5,5),squeeze=False)
for ax,col in zip(axes[:,0],count_cols):
    ax.hist(users[col].values,bins=100,log=True)
    ax.set_title(col+' (log)',fontsize='small')
plt.tight_layout()
plt.show()

#### Plot more distributions

In [None]:
# Histogram of every `users` column whose name starts with 'n_', one panel
# per column.
# The grid is sized from the matching columns instead of a hard-coded 3 rows,
# so a different number of 'n_' columns no longer raises IndexError or leaves
# empty panels. squeeze=False keeps `axes` 2-D even for one panel.
print('Mean of creators:')
n_cols = [c for c in users.columns if c.startswith('n_')]
f,axes = plt.subplots(max(1,len(n_cols)),1,figsize=(5,5),squeeze=False)
for ax,col in zip(axes[:,0],n_cols):
    ax.hist(users[col].values,bins=10)
    ax.set_title(col,fontsize='small')
plt.tight_layout()
plt.show()

# Import and clean post dataframe

In [None]:
# Read every .xlsx export in the posts folder, stack them into one frame,
# derive the binary video / cooperation flags, and persist a cleaned CSV.
posts_dir = './data/xhs/posts/'
# sorted(): os.listdir returns names in arbitrary order, which made the row
# order of the concatenated frame vary between runs and machines.
posts = [pd.read_excel(os.path.join(posts_dir,name),index_col='post_id')
         for name in tqdm.tqdm(sorted(os.listdir(posts_dir)))
         if name.endswith('.xlsx')]

posts = pd.concat(posts)
# Vectorised flags (previously a Python-level row-wise apply).
posts['if_video'] = (posts['post_type']=='video').astype(int)
# 1 when `cooperate` holds a value, 0 when it is missing. The old test was
# `x == x`, which is False only for NaN; notna() also treats None as missing.
posts['if_cooperate'] = posts['cooperate'].notna().astype(int)
posts = posts.drop(['post_content','barrage','coin',
                    'barrages','barrage_keywords',
                    'cooperate'],axis=1)
posts = posts.rename(columns={
    'ai_pred_topics':'topics','tag_ids':'tags'})
posts = convert_key_tag_top(posts)
posts.index.name = 'index'
posts.columns = [col.lower() for col in posts.columns]
posts.to_csv('./data/xhs/posts_clean.csv')
print('dumped to: ./data/xhs/posts_clean.csv')

#### Plot post distribution by time

In [None]:
# Attach the creator attributes to each post (posts.user_id -> users index),
# then plot the number of posts per day for post_date and crawl_date.
#
# The cleaning cell renames the posts index to 'index', so restoring it via a
# hard-coded 'post_id' raised KeyError on a fresh top-to-bottom run. Restore
# it under whatever name it currently has instead.
# NOTE(review): DataFrame.join raises when both frames share column names;
# confirm what convert_key_tag_top leaves in `users` and `posts`.
post_key = posts.index.name or 'index'
posts = posts.reset_index().set_index('user_id').join(users).set_index(post_key)
posts['post_date'] = pd.to_datetime(posts['post_date']).dt.to_period('d')
posts['crawl_date'] = pd.to_datetime(posts['crawl_date']).dt.to_period('d')
f,axes = plt.subplots(2,1,figsize=(8,4))
for ax,col in zip(axes,['post_date','crawl_date']):
    per_day = posts[col].value_counts().sort_index()
    X = [str(day) for day in per_day.index]
    Y = per_day.tolist()
    # Roughly ten tick labels. The step must be at least 1: with fewer than
    # ten distinct days `len(X)//10` is 0 and arange/range raise.
    step = max(1,len(X)//10)

    ax.plot(X,Y)
    ax.set_title(col)
    ax.set_ylabel('No. of posts per day')
    ax.set_xticks(np.arange(0,len(X),step))
    ax.set_xticklabels(X[::step],
                       rotation=15,va='center',position=(0,-0.08))

plt.tight_layout()
plt.show()

In [2]:
# Print summary statistics for keyword / tag / topic usage per post.
# Means and percentages are truncated to integers with int(). The percentages
# are the share of rows flagged in the `1_*` columns (labelled '<= 1' in the
# output).
n_posts = len(posts)
mean_keywords = int(posts['n_keywords'].mean())
mean_tags = int(posts['n_tags'].mean())
mean_topics = int(posts['n_topics'].mean())
pct_keywords = int(100*(posts['1_keywords'].sum()/n_posts))
pct_tags = int(100*(posts['1_tags'].sum()/n_posts))
pct_topics = int(100*(posts['1_topics'].sum()/n_posts))

print('Average no. of keywords per post:',mean_keywords)
print('Average no. of tags per post:',mean_tags)
print('Average no. of topics per post:',mean_topics)
print()
print('Percentage of posts with <= 1 keyword:',pct_keywords,'%')
print('Percentage of posts with <= 1 tag:',pct_tags,'%')
print('Percentage of posts with <= 1 topic:',pct_topics,'%')

NameError: name 'posts' is not defined

In [None]:
# Compare mean likes / collects of single-topic posts against multi-topic
# posts.
# The multi-topic group previously used `n_topics >= 1`, which also included
# the single-topic posts it is compared against; it now matches its
# '>1 topic' label.
single_topic = posts[posts['n_topics']==1]
multi_topic = posts[posts['n_topics']>1]

print('Posts with =1 topic:',single_topic['post_like'].mean(),'Average likes')
print('Posts with >1 topic:',multi_topic['post_like'].mean(),'Average likes')
print()
print('Posts with =1 topic:',single_topic['collect'].mean(),'Average collects')
print('Posts with >1 topic:',multi_topic['collect'].mean(),'Average collects')

In [None]:
# Normalise likes and collects by the total over all posts sharing the same
# post_date, so each value is the post's share of that period's engagement.
#
# groupby/transform replaces a Python loop that re-summed the whole period
# for every single row and wrote results back one cell at a time.
# Rows whose post_date is missing get NaN, as before.
months = sorted(posts['post_date'].value_counts().keys())  # distinct post_date values, kept for later inspection
period_totals = posts.groupby('post_date')[['post_like','collect']].transform('sum')
posts['post_like_normed'] = posts['post_like']/period_totals['post_like']
posts['collect_normed'] = posts['collect']/period_totals['collect']

In [None]:
def corr_sig(df=None):
    """Return the matrix of Pearson-correlation p-values between columns.

    Parameters
    ----------
    df : DataFrame
        Columns are correlated pairwise with scipy.stats.pearsonr.

    Returns
    -------
    DataFrame
        Square frame indexed and labelled by ``df``'s column names; entry
        (a, b) is the p-value for columns a and b. The diagonal is left at 0.
    """
    names = df.columns.to_list()
    p_values = np.zeros(shape=(df.shape[1],df.shape[1]))
    for row_name in df.columns:
        row_pos = names.index(row_name)
        # Every other column; the column itself is skipped, so the diagonal
        # keeps its initial 0.
        for col_name in df.drop(row_name,axis=1).columns:
            result = stats.pearsonr(df[row_name],df[col_name])
            p_values[row_pos,names.index(col_name)] = result[1]

    labels = df.columns.values
    return pd.DataFrame(p_values,index=labels,columns=labels)

def plot_cor_matrix(corr,mask=None,labels=None):
    """Draw a correlation matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    corr : DataFrame or 2-D array
        Values to colour, mapped onto the fixed range [-1, 1].
    mask : array-like, optional
        Passed to ``sns.heatmap``; cells where the mask is true are hidden.
    labels : array-like, optional
        Text to write in each cell, same shape as ``corr``. When omitted the
        numeric values of ``corr`` are written instead.

    The figure is created here and left as the current figure; call
    ``plt.show()`` afterwards to render it.
    """
    f, ax = plt.subplots(figsize=(11,9))
    # `labels` used to be accepted but ignored (annot was always True).
    # Pre-formatted text needs an empty fmt; '.2g' is seaborn's default.
    if labels is None:
        annot, fmt = True, '.2g'
    else:
        annot, fmt = np.asarray(labels), ''
    sns.heatmap(corr,ax=ax,
                mask=mask,
                annot=annot,
                fmt=fmt,
                vmin=-1,vmax=1,
                center=0,
                cmap='coolwarm',
                linewidths=2,
                square=True,
                linecolor='black',
                cbar_kws={'orientation':'vertical'})
    
# Correlation heatmap of engagement measures against topic/tag/keyword counts.
df = posts[['post_like','collect',
            'post_like_normed','collect_normed',
            'n_topics','n_tags','n_keywords',
            '1_topics','1_tags','1_keywords',]]
# Keep only rows with a value in every selected column.
df = df.dropna(how='any').copy()
corr = df.corr()
pval = corr_sig(df)
# Build a same-shaped table of cell labels from each coefficient and its
# p-value via stars(coefficient, p_value, 2).
# NOTE(review): `stars` is not defined or imported anywhere in the visible
# notebook - presumably it formats significance stars; confirm its source.
# NOTE(review): corr_label starts as a float frame and receives whatever
# `stars` returns; if that is text, recent pandas may warn - verify.
corr_label = corr.copy()
for idx_row,row_val in corr.iterrows():
    for idx_col,col_val in row_val.items():
        pval_item = pval.loc[idx_row,idx_col]
        corr_label.loc[idx_row,idx_col] = stars(col_val,pval_item,2)
corr_label = corr_label.values.tolist()
# Upper triangle (diagonal included) of the coefficients, used as the mask.
mask = np.triu(corr)
plot_cor_matrix(corr.round(2),mask,corr_label)
plt.show()

In [None]:
# Cumulative histogram (log-scaled counts) of post_like_normed.
# A stray 'd' after plt.show() made this cell a syntax error; removed.
plt.hist(posts['post_like_normed'],bins=100,log=True,cumulative=True)
plt.show()

In [None]:
# Cumulative histogram (log-scaled counts) of collect_normed.
collect_fig,collect_ax = plt.subplots()
collect_ax.hist(posts['collect_normed'],bins=100,log=True,cumulative=True)
plt.show()

In [None]:
# Persist the current posts frame to CSV with its index named 'index'.
# NOTE(review): the filter below is commented out, so no tag/video filtering
# is applied before writing 'posts_images_tags.csv' - confirm this is intended.
# posts = posts[(posts['1_tags']==0) & (posts['if_video']==1)]
posts.index.name = 'index'
posts.to_csv('./data/xhs/posts_images_tags.csv')

In [None]:
# Reload the saved posts, keep only dates with at least 50 posts, then
# restrict to the 2022-03-20 .. 2022-09-20 window.
posts = pd.read_csv('./data/xhs/posts_images_tags.csv',index_col='index')
# Number of posts sharing each row's post_date. This replaces a
# value_counts -> rename -> join -> reset_index round trip whose column and
# index names depend on the pandas version (recent releases name the
# value_counts result 'count', so the rename to 'period_50' silently did
# nothing and the later 'level_0' rename no longer matched).
posts_on_day = posts['post_date'].map(posts['post_date'].value_counts())
# Rows with a missing post_date compare False and are dropped, as before.
posts = posts[posts_on_day>=50].copy()
posts['period_50'] = True  # kept: the old pipeline left this all-True column
posts['post_date'] = pd.to_datetime(posts['post_date']).dt.to_period('d')
posts['crawl_date'] = pd.to_datetime(posts['crawl_date']).dt.to_period('d')
posts = posts[(posts['post_date']>='2022-03-20') &
              (posts['post_date']<='2022-09-20')]

In [None]:
# Number of posts per day (post_date) in the current `posts` frame.
# plt.subplots(1,1) returns a single Axes rather than an array, so the old
# `axes[n]` indexing raised TypeError; squeeze=False always gives a 2-D array.
f,axes = plt.subplots(1,1,figsize=(4,4),squeeze=False)
for ax,col in zip(axes.ravel(),['post_date']):
    per_day = posts[col].value_counts().sort_index()
    X = [str(day) for day in per_day.index]
    Y = per_day.tolist()
    # Roughly ten tick labels. The step must be at least 1: with fewer than
    # ten distinct days `len(X)//10` is 0 and arange/range raise.
    step = max(1,len(X)//10)

    ax.plot(X,Y)
    ax.set_title(col)
    ax.set_ylabel('No. of posts per day')
    ax.set_xticks(np.arange(0,len(X),step))
    ax.set_xticklabels(X[::step],
                       rotation=15,va='center',position=(0,-0.08))

plt.tight_layout()
plt.show()

In [None]:
# Distribution of the number of posts per day, first by post_date and then
# by crawl_date.
# NOTE(review): the first plt.hist call had no data argument (syntax error);
# post_date is assumed by symmetry with the crawl_date plot - confirm.
for date_col in ['post_date','crawl_date']:
    plt.hist(posts[[date_col]].value_counts().sort_index(),bins=100)
    plt.show()

In [None]:
# Display the distinct post_date values present in `posts`.
posts['post_date'].unique()

In [None]:
# Display the distinct crawl_date values present in `posts`.
# NOTE(review): the expression before .unique() was missing (syntax error);
# crawl_date is assumed as the counterpart of the post_date check - confirm.
posts['crawl_date'].unique()

In [None]:
# Collect keyword co-occurrence statistics over all posts:
#   all_nodes        - every keyword occurrence (with repeats)
#   keywords_by_post - post index -> list of that post's keywords
#   all_edges        - sorted keyword pair -> number of posts containing both,
#                      ordered by descending count
all_nodes = []
keywords_by_post = {}
edge_counter = Counter()

for post_id,row in tqdm.tqdm(posts.iterrows()):
    raw_keywords = row['keywords']
    if raw_keywords!=raw_keywords:
        # NaN is the only value unequal to itself: no keywords for this post.
        continue
    # `keywords` holds a JSON object; only its keys are used.
    post_keywords = list(json.loads(raw_keywords).keys())
    all_nodes.extend(post_keywords)
    edge_counter.update(tuple(sorted(pair)) for pair in
                        combinations(sorted(post_keywords),2))
    keywords_by_post[post_id] = post_keywords

# most_common() orders by count, highest first, keeping first-seen order on ties.
all_edges = dict(edge_counter.most_common())

In [None]:
# Keep only keyword pairs that co-occur in at least 100 posts, then build two
# weighted graphs: G from every pair and H from the frequent pairs only.
all_big_edges = {pair:count for pair,count in all_edges.items() if count>=100}
print(len(all_edges))
print(len(all_big_edges))

def _graph_from_counts(edge_counts):
    """Build an undirected graph whose edge weights are the given pair counts."""
    graph = nx.Graph()
    for (left,right),count in tqdm.tqdm(edge_counts.items()):
        graph.add_edge(left,right,weight=count)
    return graph

G = _graph_from_counts(all_edges)

H = _graph_from_counts(all_big_edges)

In [None]:

# Detect keyword communities on the pruned graph H with three algorithms.
#
# comms_gm is used by the following cells but was never assigned, which is
# the NameError recorded in the notebook output.
# NOTE(review): 'gm' is assumed to mean greedy modularity maximisation -
# confirm this is the algorithm originally used.
#
# Each result is materialised as a list because the routines may return
# one-shot iterables, which would be exhausted after a single pass.
comms_gm = list(nx_comm.greedy_modularity_communities(
    H,weight='weight'))
comms_lp = list(nx_comm.label_propagation_communities(
    H))
# asyn_lpa_communities is randomised; a fixed seed makes re-runs reproducible
# (100 matches the seed used for the layouts elsewhere in this notebook).
comms_al = list(nx_comm.asyn_lpa_communities(
    H,weight='weight',seed=100))

In [None]:
# Report the modularity of each partition of H, in the order
# comms_gm, comms_lp, comms_al.
for partition in (comms_gm,comms_lp,comms_al):
    print('Modularity:',nx_comm.modularity(H,partition))

In [3]:
# Classify the edges of H by community membership and draw the graph with
# nodes and edges coloured per community.
# NOTE(review): comms_gm is not assigned anywhere in the visible notebook
# (see the NameError output) - confirm which algorithm produced it.
comms = comms_gm
comms = [list(i) for i in comms]
# comms_dict: node -> position of its community in `comms`.
comms_dict = []
for n,i in enumerate(comms):
    comms_dict+=list(zip(i,[n]*len(i)))
comms_dict = dict(comms_dict)    
# Number of communities with more than 10 members.
cmap_counts = len([len(i) for i in comms if len(i)>10])

# edges_same_comm: community position -> edges with both ends inside it
#                  (only positions 0..cmap_counts).
# edges_spec_comm: edges whose two ends lie in different communities.
# edges_diff_comm: edges inside one community whose position is above
#                  cmap_counts.
edges_same_comm = {i:[] for i in range(0,cmap_counts+1)}
edges_spec_comm = []
edges_diff_comm = []

for (j,k) in tqdm.tqdm(all_big_edges):
    
    j_c = comms_dict[j]
    k_c = comms_dict[k]
    
    if j_c==k_c:
        if j_c<=cmap_counts:
            edges_same_comm[j_c].append((j,k))
        else:
            edges_diff_comm.append((j,k))
    else:
        edges_spec_comm.append((j,k))

# One width per edge of H: integer log of its weight, shifted so the minimum is 1.
edgewidth = [int(np.log(i[2]['weight'])) for i in H.edges(data=True)]
edgewidth = [edge_-min(edgewidth)+1 for edge_ in edgewidth]
pos = nx.spring_layout(H,k=.75,seed=100)

fig, ax = plt.subplots(figsize=(9,9),dpi=200)
# NOTE(review): `cmap` is not defined anywhere in the visible notebook, and
# `matplotlib.colors` needs a plain `import matplotlib` - confirm both.
# NOTE(review): cmap(n/cmap_counts) divides by zero when no community has
# more than 10 members - verify.
for n,comm in enumerate(comms):
    if n<=cmap_counts:
        rgba = cmap(n/cmap_counts)
        color = matplotlib.colors.rgb2hex(rgba)
    else:
        color = 'grey'
    nx.draw_networkx_nodes(H,pos,nodelist=comm,node_size=10,
                           node_color=color,alpha=.75)

# NOTE(review): `edgewidth` has one entry per edge of H, while each call
# below draws only a subset via `edgelist`; the widths presumably do not line
# up with the edges being drawn - verify.
for n,comm in edges_same_comm.items():
    rgba = cmap(n/cmap_counts)
    color = matplotlib.colors.rgb2hex(rgba)
    nx.draw_networkx_edges(H,pos,edgelist=comm,width=edgewidth,
                           edge_color=color,alpha=.25)
nx.draw_networkx_edges(H,pos,edgelist=edges_spec_comm,width=edgewidth,
                       edge_color='green',alpha=.25)    
nx.draw_networkx_edges(H,pos,edgelist=edges_diff_comm,width=edgewidth,
                       edge_color='k',alpha=.25)
plt.show()

NameError: name 'comms_gm' is not defined

In [None]:
# Novelty of a post = number of its keyword pairs that appear in
# edges_spec_comm. Posts without an entry in keywords_by_post get NaN.
#
# Membership is tested against a set: the previous `pair in <list>` scan was
# linear in the number of edges for every pair of every post. Results are
# collected first and written in one assignment instead of one .loc write
# per post.
spec_edge_set = set(edges_spec_comm)
novelty_by_post = {}
for i,v in tqdm.tqdm(keywords_by_post.items()):
    novelty_by_post[i] = sum(
        tuple(sorted(pair)) in spec_edge_set
        for pair in combinations(sorted(v),2))
# float dtype so missing posts can hold NaN; assignment aligns on posts.index.
posts['novelty'] = pd.Series(novelty_by_post,dtype=float)

In [None]:
# Persist the posts frame, including the novelty column, to CSV.
posts.to_csv('./data/xhs/posts_novelty.csv')

In [None]:


# Add node labels, a title and x limits, reusing `H`, `pos` and `ax` from an
# earlier cell.
# NOTE(review): the figure that owns `ax` was already rendered by plt.show()
# in that earlier cell; these calls presumably no longer affect the displayed
# plot - verify, and consider moving them into the plotting cell.
# NOTE(review): the title text is identical to the one used for the topic
# graph later in the notebook, while H is built from keyword pairs - confirm.
label_options = {"ec": "k", "fc": "white", "alpha":.70}
nx.draw_networkx_labels(H, pos, font_size=10, bbox=label_options)
font = {"color": "k", "fontweight": "bold", "fontsize": 14}
ax.set_title("Topics as Nodes, Posts as Edges", font)
ax.set_xlim(-1.05,1.05)
# ax.set_ylim(-1.15, 0.85)
# ax.text(
#     0.55,
#     0.14,
#     "Edge width = log number \nof posts w/o self-loops)",
#     horizontalalignment="left",
#     transform=ax.transAxes,
#     fontdict=font)

# ax.text(
#     0.55,
#     0.06,
#     "Node size = number of \nposts w/ self-loops)",
#     horizontalalignment="left",
#     transform=ax.transAxes,
#     fontdict=font)

plt.show()

In [None]:
# Count how often each topic id, and each unordered pair of topic ids,
# occurs across posts.
#   all_combi   - (id_a, id_b) with id_a <= id_b -> number of posts; a post
#                 with a single topic counts towards (id, id)
#   topic_count - one-column frame: topic name (name_en) -> number of posts
#   na          - posts skipped because the topic field is missing/too short
# NOTE(review): `topics` is not created anywhere in the visible notebook; it
# is used as a table indexed by topic id with a 'name_en' column - confirm.
# NOTE(review): the cleaning cell renames 'ai_pred_topics' to 'topics';
# confirm the column still exists under this name in `posts`.
iteration = range(1,len(topics)+1)
all_combi = {i:0 for i in combinations(iteration,2)}
all_combi.update({(i,i):0 for i in iteration})
topic_count = {i:0 for i in iteration}

na = 0
for _,post in tqdm.tqdm(posts.iterrows()):
    raw_topics = post['ai_pred_topics']

    # NaN is unequal to itself; values shorter than 5 characters are also
    # treated as missing.
    if raw_topics!=raw_topics or len(raw_topics)<5:
        na += 1
        continue

    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    topic_ids = [int(re.sub(r'[\W_]+','',s))
                 for s in raw_topics.split(',')]

    # A distinct loop name: the old code reused `i`, shadowing the row index.
    for topic_id in topic_ids:
        topic_count[topic_id] += 1

    if len(topic_ids)==1:
        pairs = [(topic_ids[0],topic_ids[0])]
    else:
        pairs = list(combinations(sorted(topic_ids),2))

    for pair in pairs:
        all_combi[pair] += 1

topic_count = {topics.loc[i]['name_en']:v for i,v in
               topic_count.items() if v>0}
topic_count = pd.DataFrame.from_dict(topic_count,orient='index')

In [None]:
# Display the per-topic post counts.
topic_count

In [None]:
# Bar chart of the number of posts per topic, one bar per row of topic_count,
# labelled with the topic names.
counts = topic_count[0].values
bar_positions = np.arange(len(counts))

f,ax = plt.subplots(1,1,figsize=(7,4))
ax.bar(bar_positions,counts)
ax.set_xticks(bar_positions)
ax.set_xticklabels(topic_count.index.values,rotation=90)
plt.show()

In [None]:
# total: number of counted topic pairs over all posts.
total = sum(all_combi.values())
print(total)
# Integer log of each non-zero pair count, ordered from heaviest to lightest.
log_weights = ((pair,int(np.log(count)))
               for pair,count in all_combi.items() if count>0)
all_combi_log = dict(sorted(log_weights,
                            key=lambda item:item[1],reverse=True))
# Drop the self-pair of topic 163; the removed weight is shown as the cell
# output. NOTE(review): the id is hard-coded and this raises KeyError when
# the pair is absent - confirm why this topic is excluded.
all_combi_log.pop((163,163))

In [None]:
# Display the per-topic post counts again.
topic_count

In [None]:
# G = nx.MultiGraph()
G = nx.Graph()
for edge,weight in all_combi_log.items():
    G.add_edge(edge[0], edge[1], weight=weight)
labels = topics[['name_en']].to_dict()['name_en']
G = nx.relabel_nodes(G,labels)

pos = nx.spring_layout(G,k=.5,seed=100)
edgewidth = [i[2]['weight'] for i in G.edges(data=True)]
nodesize = [int(topic_count.loc[i,0]/30) for i in G.nodes()]

fig, ax = plt.subplots(figsize=(9,9),dpi=200)
nx.draw_networkx_edges(G, pos, alpha=.15, width=edgewidth, edge_color="k")
nx.draw_networkx_nodes(G, pos, node_size=nodesize, node_color="k", alpha=.70)
label_options = {"ec": "k", "fc": "white", "alpha":.70}
nx.draw_networkx_labels(G, pos, font_size=10, bbox=label_options)
font = {"color": "k", "fontweight": "bold", "fontsize": 14}
ax.set_title("Topics as Nodes, Posts as Edges", font)
ax.set_xlim(-1.05,1.05)
# ax.set_ylim(-1.15, 0.85)
# ax.text(
#     0.55,
#     0.14,
#     "Edge width = log number \nof posts w/o self-loops)",
#     horizontalalignment="left",
#     transform=ax.transAxes,
#     fontdict=font)

# ax.text(
#     0.55,
#     0.06,
#     "Node size = number of \nposts w/ self-loops)",
#     horizontalalignment="left",
#     transform=ax.transAxes,
#     fontdict=font)

plt.show()
# nx.write_gexf(G,'./topics.gexf')

In [None]:
# Creativity of a post = mean rarity of its topic pairs, where the rarity of
# a pair is (total - count) / total. Posts whose topic field is missing or
# shorter than 5 characters get NaN.
all_combi_weighted = {i:(total-v)/total for i,v in all_combi.items() if v>0}

creativity = {}
for post_id,post in tqdm.tqdm(posts.iterrows()):
    raw_topics = post['ai_pred_topics']

    # NaN is unequal to itself.
    if raw_topics!=raw_topics or len(raw_topics)<5:
        # np.nan: the np.NaN alias was removed in NumPy 2.0.
        creativity[post_id] = np.nan
        continue

    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    topic_ids = [int(re.sub(r'[\W_]+','',s)) for s in raw_topics.split(',')]
    if len(topic_ids)==1:
        pairs = [(topic_ids[0],topic_ids[0])]
    else:
        pairs = list(combinations(sorted(topic_ids),2))
    creativity[post_id] = np.mean([all_combi_weighted[pair] for pair in pairs])

# Single aligned assignment instead of one .loc write per row.
posts['creativity'] = pd.Series(creativity,dtype=float)

In [None]:
# creativity_1 of a post = number of topic pairs it contributes (1 for a
# single-topic post). Posts whose topic field is missing or shorter than 5
# characters get NaN.
# Recomputed as in the previous cell; not used in this cell.
all_combi_weighted = {i:(total-v)/total for i,v in all_combi.items() if v>0}

creativity_1 = {}
for post_id,post in tqdm.tqdm(posts.iterrows()):
    raw_topics = post['ai_pred_topics']

    # NaN is unequal to itself.
    if raw_topics!=raw_topics or len(raw_topics)<5:
        # np.nan: the np.NaN alias was removed in NumPy 2.0.
        creativity_1[post_id] = np.nan
        continue

    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    topic_ids = [int(re.sub(r'[\W_]+','',s)) for s in raw_topics.split(',')]
    if len(topic_ids)==1:
        pairs = [(topic_ids[0],topic_ids[0])]
    else:
        pairs = list(combinations(sorted(topic_ids),2))
    creativity_1[post_id] = len(pairs)

# Single aligned assignment instead of one .loc write per row.
posts['creativity_1'] = pd.Series(creativity_1,dtype=float)

In [None]:
# Display how many posts have each creativity_1 value.
posts['creativity_1'].value_counts()