In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.tokenize import word_tokenize
from datetime import date
import seaborn as sns

In [3]:
with open("raw_tables_2023_03_05/raw_tweets", "rb") as raw_timeline_file:
    tweet_df = pd.read_pickle(raw_timeline_file)
    print(tweet_df["content"].head(5))

0    I applaud North Carolina’s announcement to exp...
1    I’m so proud to have the Young Men’s Leadershi...
2    Happy Women’s History Month! When women succee...
3                    Stay safe out there, North Texas!
4    Wishing my fellow Texans everywhere a happy Te...
Name: content, dtype: object


In [4]:
tweet_df.set_index("id", inplace=True)

In [5]:
user_tweet_df = tweet_df.groupby("user_name")["like_count"].nlargest(2)
user_tweet_df = pd.DataFrame(user_tweet_df)
user_tweet_df = user_tweet_df.merge(tweet_df, on="id")
print(user_tweet_df.columns)
print(user_tweet_df)

Index(['like_count_x', 'url', 'user_name', 'content', 'creation_date',
       'reply_count', 'retweet_count', 'like_count_y', 'quote_count',
       'view_count', 'conversation_id', 'language', 'source', 'source_url',
       'source_label', 'links', 'media_types', 'reply_tweet_id',
       'reply_user_name', 'hashtags', 'cashtags', 'vibe_text', 'context_id',
       'context_url', 'context_user_name', 'context_content',
       'context_creation_date', 'context_reply_count', 'context_retweet_count',
       'context_like_count', 'context_quote_count', 'context_view_count',
       'context_conversation_id', 'context_language', 'context_source',
       'context_source_url', 'context_source_label', 'context_links',
       'context_media_types', 'context_reply_tweet_id',
       'context_reply_user_name', 'context_hashtags', 'context_cashtags',
       'context_vibe_text', 'context_types'],
      dtype='object')
                     like_count_x  \
id                                  
15836301790

In [6]:
content_df = user_tweet_df[["user_name", "content"]]
concatenated_content = content_df.groupby("user_name")["content"].apply(lambda x: " ".join(x)).reset_index(name="concatenated_content")
print(concatenated_content)


           user_name                               concatenated_content
0         AlLawsonJr  Great night supporting my good friend and our ...
1       AlbioSiresNJ  It's an honor to earn the support of NJ's lead...
2     AndrewJBates46  Dark Brandon is crushing it https://t.co/w0L8x...
3             Atrios  lol https://t.co/AAAPmgVCAG my guy you just ha...
4    AustinScottGA08  Guilty until proven innocent. That, my friends...
..               ...                                                ...
578      sethmoulton  As the Representative of Salem, MA, I can conf...
579       stevebenen  Mulvaney said Dems are only seeking Trump's ta...
580       thegarance  Washington Postal Workers Defy USPS Orders And...
581         timkaine  I wanted to be first at my polling place, but ...
582     virginiafoxx  North Carolinians made their voice heard in vo...

[583 rows x 2 columns]


In [7]:
l = concatenated_content.concatenated_content.map(len)

In [8]:
sorted(l)

[33,
 63,
 71,
 78,
 79,
 95,
 97,
 97,
 99,
 103,
 105,
 107,
 108,
 110,
 115,
 116,
 116,
 117,
 118,
 119,
 119,
 120,
 120,
 122,
 125,
 130,
 130,
 138,
 138,
 145,
 147,
 149,
 149,
 151,
 151,
 154,
 161,
 161,
 166,
 168,
 169,
 170,
 171,
 171,
 172,
 172,
 173,
 174,
 175,
 179,
 185,
 186,
 189,
 196,
 196,
 196,
 197,
 198,
 202,
 203,
 205,
 206,
 206,
 207,
 208,
 208,
 210,
 211,
 212,
 213,
 214,
 215,
 215,
 215,
 216,
 217,
 217,
 218,
 218,
 218,
 220,
 222,
 224,
 226,
 228,
 229,
 229,
 230,
 232,
 232,
 233,
 233,
 235,
 235,
 239,
 241,
 241,
 242,
 242,
 243,
 244,
 245,
 246,
 247,
 249,
 249,
 250,
 250,
 252,
 252,
 253,
 253,
 256,
 257,
 257,
 258,
 258,
 260,
 261,
 264,
 265,
 266,
 268,
 269,
 270,
 271,
 272,
 272,
 275,
 275,
 277,
 277,
 279,
 281,
 282,
 282,
 282,
 283,
 284,
 285,
 286,
 286,
 286,
 286,
 287,
 288,
 289,
 291,
 292,
 293,
 293,
 294,
 295,
 295,
 296,
 297,
 298,
 301,
 302,
 302,
 303,
 304,
 304,
 304,
 304,
 305,
 305,
 308,
 

In [9]:
concatenated_content_sample = concatenated_content.sample(n = 10)

In [10]:
from transformers import XLNetTokenizer, XLNetModel
import numpy as np
import torch

# Load XLNet tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

# Generate the embeddings
embeddings = []
for i, row in concatenated_content.iterrows():
    with torch.no_grad():
        user = row["user_name"]
        text = row["concatenated_content"]
        print(i, user, text)
        encoded_dict = tokenizer.encode_plus(text, padding='max_length', max_length=512, truncation=True, return_attention_mask=True, return_tensors='pt')
        input_ids = encoded_dict['input_ids']
        attention_mask = encoded_dict['attention_mask']
        model_output = model(input_ids, attention_mask=attention_mask)
        embedding = model_output[0][:, 0, :].numpy()  # Extract the first token ([CLS]) of the last hidden state for each input
        embeddings.append(embedding)

concatenated_content["embedding"] = embeddings

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 AlLawsonJr Great night supporting my good friend and our next governor,  @CharlieCrist. #FL02 https://t.co/Y88Zzk26qH The agenda for Black America put forth by @JoeBiden is what our nation needs. We must work together to address health disparities, inequities in our justice system, &amp; work to advance economic mobility in the Black community. #LiftEveryVoice #TeamJoe #FL05
joebiden.com/blackamerica/
1 AlbioSiresNJ It's an honor to earn the support of NJ's leading Democrats @PhilMurphyNJ @LtGovOliver @BobMenendezNJ @CoryBooker who have seen my record of standing up for our values and delivering for the people I represent up close. newjerseyglobe.com/fr/new-jerseys… Thank you to @RaviBhalla and the entire Hoboken City Council for their support! @mike_defusco @RubenRamosJr @Tiffanie_Fisher @jenforhoboken @HobokenEmily @MichaelRussoNJ @PhilipHCohen + Council members Doyle and Falco newjerseyglobe.com/congress/slew-…
2 AndrewJBates46 Dark Brandon is crushing it https://t.co/w0L8xCzIW8 .

In [13]:
concatenated_content.to_pickle("user_embeddings")
concatenated_content.to_csv("user_embeddings.csv")