In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Absolute path to the input CSV. Despite the `_dir` suffix this is a single
# file, not a directory; it is passed straight to `pd.read_csv` further down.
data_dir = '/nas/eclairnas01/users/siyiguo/eating_disorder_data/df_emo_RT_toxicity_scores.csv'

In [5]:
# Read only the first 1000 rows (quick sample) and convert `created_at`
# from text to pandas datetimes in one chained expression.
df = (
    pd.read_csv(data_dir, nrows=1000, lineterminator='\n')
      .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)
# Show the available columns.
df.columns

Index(['tweet_id', 'author_id', 'created_at', 'text', 'retweet',
       'parent_author_id', 'parent_created_at', 'community_id', 'Anger',
       'Anticipation', 'Disgust', 'Fear', 'Joy', 'Love', 'Optimism',
       'Pessimism', 'Sadness', 'Surprise', 'Trust', 'No emotion detected',
       'max_emo', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult',
       'identity_attack'],
      dtype='object')

In [20]:
# Earliest timestamp among the loaded rows.
df['created_at'].min()

Timestamp('2023-03-10 22:18:50+0000', tz='UTC')

In [30]:
# Fixed 12-hour UTC grid that every author's series can be aligned to.
# The start/end dates are hard-coded for the sampled rows loaded above.
# '12h' (lowercase) replaces the '12H' alias, which is deprecated in
# pandas >= 2.2; both spell the same 12-hour offset.
entire_time_range = pd.date_range(start='2023-03-10',end='2023-03-11',freq='12h',tz='utc')

# Number of tweets per author per 12-hour bin. Only (author, bin) pairs with
# at least one tweet appear in the result; reindexing onto
# author x entire_time_range with fill_value=0 adds the empty bins.
user_ts_count = df.groupby(['author_id',pd.Grouper(freq='12h',key='created_at')])['tweet_id'].count()


In [28]:
# Display the per-author, per-bin tweet counts. The trailing commented-out
# call would instead average each author's counts over their bins.
user_ts_count#.groupby(level=0).mean()

author_id            created_at               
764870               2023-03-10 12:00:00+00:00    1
6633782              2023-03-10 12:00:00+00:00    1
11891512             2023-03-10 12:00:00+00:00    1
14106569             2023-03-10 12:00:00+00:00    1
14285025             2023-03-10 12:00:00+00:00    1
                                                 ..
1634328743798874114  2023-03-10 12:00:00+00:00    2
1634329930405236738  2023-03-10 12:00:00+00:00    1
1634330007853228033  2023-03-10 12:00:00+00:00    1
1634330325298868224  2023-03-10 12:00:00+00:00    1
1634331406833795073  2023-03-10 12:00:00+00:00    1
Name: tweet_id, Length: 818, dtype: int64

In [31]:
# Expand the counts onto the full (author, time bin) grid; pairs that had
# no tweets are filled with 0. The result is displayed, not stored.
user_ts_count.reindex(
    pd.MultiIndex.from_product(
        [user_ts_count.index.levels[0], entire_time_range],
        names=['author_id', 'created_at'],
    ),
    fill_value=0,
)

author_id            created_at               
764870               2023-03-10 00:00:00+00:00    0
                     2023-03-10 12:00:00+00:00    1
                     2023-03-11 00:00:00+00:00    0
6633782              2023-03-10 00:00:00+00:00    0
                     2023-03-10 12:00:00+00:00    1
                                                 ..
1634330325298868224  2023-03-10 12:00:00+00:00    1
                     2023-03-11 00:00:00+00:00    0
1634331406833795073  2023-03-10 00:00:00+00:00    0
                     2023-03-10 12:00:00+00:00    1
                     2023-03-11 00:00:00+00:00    0
Name: tweet_id, Length: 2454, dtype: int64

In [None]:
from collections import Counter
import pandas as pd
import re
import pickle
import numpy as np
from ast import literal_eval
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix

### Plot TSNE

In [None]:
def plot_tsne(tsne_results,labels,target_set,title,legend=None):
    """Scatter-plot 2-D t-SNE coordinates, one colour and marker per target label.

    Parameters
    ----------
    tsne_results : array
        Point coordinates; column 0 is drawn on the x axis and column 1 on
        the y axis. Must support boolean-mask indexing.
    labels : array-like
        One label per point, matched against each target with ``np.isin``.
    target_set : sequence
        Label values to draw. Groups are drawn in reverse order of this
        sequence, each with its own colour from the "hls" palette.
    title : str
        Figure title.
    legend : sequence, optional
        Legend text for each entry of ``target_set`` (same order). Defaults
        to ``target_set`` itself.

    Returns
    -------
    The object returned by the last ``plt.scatter`` call, or ``None`` when
    ``target_set`` is empty.
    """
    plt.figure(figsize=(6, 6))

    if legend is None:
        legend = target_set

    markers = ['.','1','v','s','p','*','+','X','d','^']
    palette = sns.color_palette("hls", len(target_set))

    # Stays None if there is nothing to draw, so the return below is always defined.
    scatter = None
    groups = zip(reversed(target_set), palette, reversed(legend))
    for idx, (target, color, leg) in enumerate(groups):
        # Select this group's points once instead of once per axis.
        points = tsne_results[np.isin(labels, target)]
        scatter = plt.scatter(points[:, 0],
                              points[:, 1],
                              color=color, alpha=0.2,
                              # Cycle through the marker list so groups beyond
                              # the tenth are still drawn rather than dropped.
                              marker=markers[idx % len(markers)],
                              label=leg)
    plt.legend()
    plt.title(title)
    return scatter

def cluster_analysis(embeddings_dir,gt_dir,preds_dir,embedding_type,tsne_results=None):
    """Draw a series of t-SNE scatter plots comparing ground truth with predictions.

    Parameters
    ----------
    embeddings_dir : str
        Path to a pickle file holding the model embeddings. Only read when
        ``tsne_results`` is not supplied.
    gt_dir : str
        Path to a CSV with at least the columns ``label`` and
        ``cluster0_label``.
    preds_dir : str
        Path to a pickle file holding the predicted cluster of each row.
    embedding_type : str
        Text used as the prefix of every plot title.
    tsne_results : array, optional
        Precomputed 2-D projection. When ``None`` a new t-SNE is fitted on
        the embeddings.

    Returns
    -------
    The 2-D t-SNE coordinates used for the plots (either the ones passed in
    or the freshly fitted ones).

    Notes
    -----
    ``pickle.load`` can execute arbitrary code contained in the file; only
    point this function at files from a trusted source.
    """
    with open(preds_dir,'rb') as fh:
        # Ensure boolean-mask indexing works even if a plain list was pickled.
        preds = np.asarray(pickle.load(fh))
    test_gt = pd.read_csv(gt_dir)
    labels = test_gt['label'].values
    cluster0_labels = test_gt['cluster0_label'].values

    if tsne_results is None:
        # Embeddings are only needed to fit a new projection.
        with open(embeddings_dir,'rb') as fh:
            embeddings = pickle.load(fh)
        tsne = TSNE(n_components=2, verbose=0) #, perplexity=40, n_iter=300
        tsne_results = tsne.fit_transform(embeddings)

    # Per the original notes, label 0 is the biggest ground-truth cluster and
    # the largest label value marks the unknown/noise group.
    noise_label = labels.max()
    in_cluster0 = np.isin(labels,[0])

    # plot biggest cluster 0 together with the unknown/noise cluster
    plot_tsne(tsne_results,labels,[0,noise_label],f'{embedding_type} - biggest cluster & noise in GT')
    plt.show()

    # plot biggest cluster 0
    plot_tsne(tsne_results,labels,[0],f'{embedding_type} - biggest cluster in GT')
    plt.show()

    # # plot top 7 clusters other than the biggest cluster 0
    # plot_tsne(tsne_results,labels,[33, 43, 49, 22, 54, 56, 6],f'{embedding_type} - other top 7 cluster in GT')
    # plt.show()

    # plot biggest cluster 0 by different hashtag usage
    plot_tsne(tsne_results,cluster0_labels,[0.1,0.2,0.3,0.4,0.5,0.6],f'{embedding_type} - biggest cluster in GT by different hashtags',legend=['#Marine2017','#JLM2017','#Fillon2017','#Hamon2017','#Gabon','#RPFavecFF'])
    plt.show()

    # plot biggest cluster 0 by different hashtag usage plus the unknown/noise label
    plot_tsne(tsne_results,cluster0_labels,[0.0,0.1,0.2,0.3,0.4,0.5,0.6,noise_label],f'{embedding_type} - biggest cluster in GT by different hashtags and noise',legend=['unknown in cluster 0','#Marine2017','#JLM2017','#Fillon2017','#Hamon2017','#Gabon','#RPFavecFF', f"unknwon {noise_label}"])
    plt.show()

    # plot model predicted clusters, restricted to rows of the biggest cluster
    preds_cluster0 = preds[in_cluster0]
    plot_tsne(tsne_results[in_cluster0],preds_cluster0,np.unique(preds_cluster0),f'{embedding_type} - Model Predicted Clusters')
    # Shown explicitly, like every other figure in this function.
    plt.show()

    # compare predicted cluster with gt labels in the biggest cluster
    # conf_mtx = confusion_matrix(cluster0_labels[np.isin(labels,[0])]-1, preds[np.isin(labels,[0])])
    # print('confusion matrix:')
    # print(conf_mtx)

    return tsne_results #, conf_mtx

In [None]:
embeddings_dir = '../test_phase1a_bert_pca_lownoise/test_data_embeddings.pkl'
preds_dir = '../test_phase1a_bert_pca_lownoise/test_data_preds.pkl'
gt_dir = '/nas/eclairnas01/users/siyiguo/incas_data/phase1a_time_coord_gt_data_df.csv'

# NOTE(review): pickle.load can execute arbitrary code from the file; these
# paths are assumed to be trusted project artifacts.
# Context managers make sure each file handle is closed (and the dump flushed)
# as soon as the read/write is done.
with open(embeddings_dir,'rb') as fh:
    embeddings = pickle.load(fh)
tsne = TSNE(n_components=2, verbose=0) #, perplexity=40, n_iter=300
tsne_results_bertpca_lownoise = tsne.fit_transform(embeddings)

# Cache the projection next to the embeddings so it can be reloaded later.
with open('../test_phase1a_bert_pca_lownoise/test_data_tsne_bertpca.pkl','wb') as fh:
    pickle.dump(tsne_results_bertpca_lownoise,fh)

# To skip the t-SNE fit on a re-run, load the cached projection from the
# pickle written above instead of recomputing it.

tsne_results_bertpca_lownoise = cluster_analysis(embeddings_dir,gt_dir,preds_dir,'Feature: BERT PCA-5',tsne_results=tsne_results_bertpca_lownoise)