# Tweet Analysis (WIP)

##### Analyze the correlation between each tweet's log-likelihood and Masked-LM scores and the reactions it received

In [1]:
from __future__ import annotations

import os
import subprocess
from dataclasses import dataclass

import tweepy
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from RoBERTa_japanese.bert_score import Args, bert_scoring

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups for the API credentials further down can find them.
load_dotenv()

@dataclass
class AuthenticationInfo:
    """Container for the Twitter API credentials used to build the client.

    Every field defaults to an empty string, so credentials that are not
    needed for a given authentication flow can simply be omitted.
    """

    # NOTE: the defaults must be plain strings. A trailing comma after the
    # default (``= "",``) would silently turn it into the tuple ``("",)``.
    api_key: str = ""  # consumer key
    api_secret_key: str = ""  # consumer secret
    bearer_token: str = ""  # app-only bearer token
    access_token: str = ""  # user access token
    access_token_secret: str = ""  # user access token secret

In [None]:
# Collect the credentials from the environment. A variable that is not set
# yields None here. access_token / access_token_secret are not supplied, so
# they keep the dataclass defaults.
auth_info = AuthenticationInfo(
    bearer_token=os.environ.get("BEARER_TOKEN"),
    api_secret_key=os.environ.get("API_SECRET_KEY"),
    api_key=os.environ.get("API_KEY"),
)

# Build the API client from the collected credentials.
client = tweepy.Client(
    bearer_token=auth_info.bearer_token,
    consumer_secret=auth_info.api_secret_key,
    consumer_key=auth_info.api_key,
)

In [None]:
def _split_text_and_score(raw_text: str, marker: str = "score: ") -> tuple[str, float] | None:
    """Split a tweet into (body, score) at the LAST occurrence of ``marker``.

    Returns None when the marker is missing or when the text after it cannot
    be parsed as a float, so the caller can skip the tweet.
    """
    # rpartition splits on the last marker, so a body that happens to contain
    # "score: " itself is still parsed correctly.
    body, found, raw_score = raw_text.rpartition(marker)
    if not found:
        return None
    try:
        return body, float(raw_score)
    except ValueError:
        return None


def get_tweets_for_eval(client: tweepy.Client, user_name: str, user_id: str | None = None, limit: int = 200) -> pd.DataFrame:
    """Fetch a user's tweets that carry a trailing ``score: <float>`` annotation.

    Args:
        client: Authenticated tweepy client used for all API calls.
        user_name: Screen name of the account; only used to look up the user
            id when ``user_id`` is not given.
        user_id: Numeric id of the account. When None it is resolved from
            ``user_name``.
        limit: Maximum number of tweets to pull from the timeline (before
            tweets without a usable score are filtered out).

    Returns:
        DataFrame with columns ``tweet_id``, ``text`` (tweet body with newlines
        replaced by ``<br>``), ``score``, ``n_likes`` and ``n_retweets``.
        The columns are present even when no tweet qualifies.

    Raises:
        ValueError: If ``user_id`` is None and the lookup by ``user_name``
            returns no user.
    """
    if user_id is None:
        response = client.get_user(username=user_name)
        if response.data is None:
            raise ValueError(f"Twitter user not found: {user_name!r}")
        user_id = response.data["id"]

    columns = ["tweet_id", "text", "score", "n_likes", "n_retweets"]
    records = []

    # Get tweets (retweets excluded), including like / retweet counts.
    paginator = tweepy.Paginator(
        client.get_users_tweets,
        id=user_id,
        max_results=100,
        exclude=["retweets"],
        tweet_fields=["public_metrics"],
    )
    for tweet in paginator.flatten(limit=limit):
        # If the log-likelihood score is missing or malformed, skip the tweet.
        parsed = _split_text_and_score(tweet.text)
        if parsed is None:
            continue

        body, score = parsed
        metrics = tweet.data["public_metrics"]
        records.append(
            {
                "tweet_id": tweet.id,
                # <br> keeps line breaks visible in HTML-rendered hover text.
                "text": body.replace("\n", "<br>"),
                "score": score,
                "n_likes": metrics["like_count"],
                "n_retweets": metrics["retweet_count"],
            }
        )

    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Pull up to 400 recent tweets of the AI_15R account that carry a score.
df_tweets = get_tweets_for_eval(
    client=client,
    user_name="AI_15R",
    limit=400,
)

In [None]:
# Attach an `mlm_score` column computed by bert_scoring over the tweet texts.
# NOTE(review): presumably bert_scoring returns one value per entry of
# `context`, in order; confirm against RoBERTa_japanese.bert_score.
df_tweets_with_mlm = df_tweets.assign(
    mlm_score=lambda frame: bert_scoring(
        Args(
            model="RoBERTa-ja_base",
            context=frame["text"].tolist(),
            split_tag="",
            value_only=True,
            output_max=False,
            gpu=b"0",
        )
    )
)

In [None]:
# Two side-by-side scatter plots: total reactions (likes + retweets) on the
# x-axis against each of the two scores on the y-axis.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=[
        "いいね数+リツイート数 vs. 対数尤度のスコア",
        "いいね数+リツイート数 vs. Masked LM スコア"
    ]
)

# One trace per panel: (subplot column, score column, marker color).
for panel_col, score_column, marker_color in [
    (1, "score", "skyblue"),
    (2, "mlm_score", "salmon"),
]:
    fig.add_trace(
        go.Scatter(
            x=df_tweets_with_mlm.n_likes + df_tweets_with_mlm.n_retweets,
            y=df_tweets_with_mlm[score_column],
            mode="markers",
            marker=dict(
                size=10,
                color=marker_color,
            ),
            # The tweet body is shown when hovering over a point.
            hovertext=df_tweets_with_mlm.text,
        ),
        row=1,
        col=panel_col,
    )

fig.update_layout(
    width=1200,
    height=600,
    title="AI_15Rのツイート評価",
    xaxis=dict(
        title="Number of Likes + Retweets",
    ),
    yaxis=dict(
        title="Log-likelihood Score",
    ),
    xaxis2=dict(
        title="Number of Likes + Retweets",
    ),
    yaxis2=dict(
        title="Masked LM Score",
    ),
    template="plotly_dark",
    font={"family": "Ubuntu", "size": 10},
    showlegend=False,
)

# The former bare `fig.to_html()` call was dropped: its return value (the full
# HTML string) was discarded, so it only cost time and memory.
fig.show()

In [None]:
# Bubble chart comparing the two scores directly; bubble size encodes the
# reactions each tweet received.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_tweets_with_mlm.score,
        y=df_tweets_with_mlm.mlm_score,
        mode="markers",
        marker=go.scatter.Marker(
            # Likes are weighted half as much as retweets. The legend text
            # below describes the size as the plain sum of likes and retweets.
            # NOTE(review): confirm which of the two is intended. Also, tweets
            # with no likes and no retweets get size 0 and are presumably not
            # drawn; confirm that is acceptable.
            size=df_tweets_with_mlm.n_likes / 2 + df_tweets_with_mlm.n_retweets,
            color="salmon",
        ),
        text=df_tweets_with_mlm.text,
        # Show only the tweet body on hover; <extra></extra> hides the
        # secondary box with the trace name.
        hovertemplate="<b>%{text}</b><extra></extra>",
        name="各ツイート<br>円の大きさはいいね数と<br>リツイート数の合計"
    ),
)

fig.update_layout(
    width=850,
    height=800,
    title="AI_15Rのツイート評価 - Log-likelihood score vs. Masked-LM score",
    xaxis=dict(
        title="Log-likelihood score",
    ),
    yaxis=dict(
        title="Masked-LM score",
    ),
    template="plotly_dark",
    font={"family": "Ubuntu", "size": 10},
    showlegend=True,
    hovermode="x unified",
    hoverlabel=dict(bgcolor="rgba(255, 255, 255, 0.2)"),
)

# The former bare `fig.to_html()` call was dropped: its return value (the full
# HTML string) was discarded, so it only cost time and memory.
fig.show()

In [14]:
# Work on an independent (deep) copy so the scored frame stays untouched.
df = df_tweets_with_mlm.copy()

In [30]:
# Mean like count of tweets with -90 < score < -30 and mlm_score > -11.
df.loc[(df["score"] > -90) & (df["score"] < -30) & (df["mlm_score"] > -11), "n_likes"].mean()

10.352941176470589

In [31]:
# Rows with -90 < score < -30 and mlm_score > -11.
df.loc[(df["score"] > -90) & (df["score"] < -30) & (df["mlm_score"] > -11)]

Unnamed: 0,tweet_id,text,score,n_likes,n_retweets,mlm_score
0,1550771205804027904,新宿駅で待ち合わせをし、待ち合わせ場所に行く、駅前のロータリーで降り、待ち合わせ場所に行く、...,-69.733,0,0,-0.118859
3,1550747858722508800,逆張りでやるのも飽きたし情報だけになってることが露になっている今の社会を壊すのが狙いみたいな...,-38.445,1,0,-3.214007
4,1550740161771610112,ハグとインターリンク作りてえです<br>ツイートピラーとインターリンク作りてえです<br>ツ...,-77.744,2,0,-0.317715
7,1550717011273318400,寿司とかいう・・・！！寿司！寿司！！寿司！！・・・寿司！！寿司！！寿司！！寿司！！寿司！？・...,-77.478,9,0,-1.502545
10,1550694096377114624,解像度高ッw<br>解像度解像度すぎッｗｗｗｗｗｗｗｗｗ<br>めちゃくちゃダサい、解像度が...,-85.152,18,5,-9.484549
...,...,...,...,...,...,...
215,1549271765574029312,殴ったことないからわからないけど、グーが入ったからパンツと風呂に入れたらしい<br>偉すぎる...,-78.301,3,0,-0.089267
223,1549220949009915905,寿司行くためにタクシー乗ったぜタクシー!寿司行くためにタクシー乗ったぜタクシー!寿司行くため...,-55.174,3,0,-0.186298
231,1549159652616990720,薬物が盛り上げてる時に苦しんでばかりいるってこと？脳に悪いし、他に方法ないな…<br>tmb...,-43.725,0,0,-1.576441
236,1549121464145948672,自分が不幸でしたこと全部言ってしまったので一生忘れません死んで良かった今となっては意味が無い...,-34.712,0,0,-3.054955


In [26]:
# Mean like count of tweets with (score >= -30 or score <= -120) and mlm_score <= -11.
# NOTE(review): the lower bound here is -120, while the in-band filter in the
# earlier cells uses -90; confirm the gap between the two is intentional.
df.loc[(df["mlm_score"] <= -11) & ((df["score"] <= -120) | (df["score"] >= -30)), "n_likes"].mean()

2.3076923076923075

In [None]:
# Fresh independent (deep) copy of the scored frame; same statement as the
# earlier `df = ...` cell.
df = df_tweets_with_mlm.copy()

In [None]:
# Mean like count of tweets with -90 < score < -30 and mlm_score > -11
# (same computation as the earlier cell with the recorded output).
df.loc[(df["mlm_score"] > -11) & (df["score"] < -30) & (df["score"] > -90), "n_likes"].mean()