# SPI Pipeline

1. Get info about users in Instagram and Twitter
2. Compute sentiment
3. Get percentiles


## Import libraries

In [1]:
from instagrapi                  import Client
from datetime                    import date
from pysentimiento.preprocessing import preprocess_tweet
from transformers                import AutoTokenizer, AutoModelForSequenceClassification
from pysentimiento               import create_analyzer
from datetime                    import datetime as dt

import pandas   as  pd
import numpy    as  np

import torch
import tweepy
import time




## Import models

In [None]:
tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
language_detector = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
analyzer_en = create_analyzer(task="sentiment", lang="en")
analyzer_es = create_analyzer(task="sentiment", lang="es")

In [3]:
def rescale_probs(proba_dict):
    keys, values = list(proba_dict.keys()), list(proba_dict.values())
    pred_key = keys[np.argmax(values)]
    pred_value = 0
    if pred_key == "NEG":
        pred_value = 1-np.max(values)
    elif pred_key == "POS":
        pred_value = np.max(values)
    else:
        neg_value = values[0]
        pos_value = values[-1]

        add_val = pos_value if pos_value > neg_value else -neg_value

        pred_value = 0.5 + (1-np.max(values))*add_val/2
    return pred_value
    
def preprocess_comment_adv(string_, max_word_count=50):
    substrs_to_remove = ["cara emoji", "emoji", "   ", "\n"]
    procs_str = preprocess_tweet(string_)
    for substr in substrs_to_remove:
        procs_str = procs_str.replace(substr, "")
    procs_str = procs_str.replace("Jjaja", "Jajaja")
    
    return " ".join(procs_str.split(" ")[:max_word_count])

def get_sentiment(comments, lang_detector, en_analyzer, sp_analyzer, tokenizer):
    # Quitar esto
    if len(comments) == 0: # Sentimiento del usuario cuando no tiene comentarios = 0.5
        return 0.5
    else:
        preprocessed_comments = [preprocess_comment_adv(comment) for comment in comments]
        languages_detected = []
        for comment in preprocessed_comments:
            inputs = tokenizer(comment, return_tensors="pt")
            with torch.no_grad():
                logits = lang_detector(**inputs).logits
            predicted_class_id = logits.argmax().item()
            languages_detected.append(lang_detector.config.id2label[predicted_class_id])

        sentiment_probas = []
        for ix in range(len(preprocessed_comments)):
            if languages_detected[ix] == "en":
                estimation = en_analyzer.predict(preprocessed_comments[ix]).probas
            elif languages_detected[ix] == "es" or languages_detected[ix] == "pt":
                estimation = sp_analyzer.predict(preprocessed_comments[ix]).probas
            else:
                estimation = {'NEG': 0, 'NEU': 1, 'POS': 0}
            sentiment_probas.append(estimation)

        sentiment = [rescale_probs(probs) for probs in sentiment_probas]
        df = pd.DataFrame({"Comment": comments, "Language": languages_detected, "Sentiment": sentiment})

        return df["Sentiment"].mean()

## Instantiate tokens and declare methods for Instagram and Twitter retrieval of data

In [62]:
sum([8,5,6,8,45])

72

In [88]:
# --- INSTAGRAM ---

cl = Client()
# cl.login("clyde_donovan_2022", "clyde_donovan_2022_")
# cl.login("chrismiller202212", "1OE0t5@Po9*z_")
cl.login("samanthaoakley202212", "1OE0t5@Po9*z")

def extract_text(comment):
    comments_text_ls = []
    if len(comment[0]) != 0:
        comments_text_ls = [comment.text for comment in comment[0]]
    return comments_text_ls

def get_ig_data(cl, username, max_posts=5, max_comments=10): 

    user_id = cl.user_id_from_username(username)
    posts = cl.user_medias(user_id)
    user_information = cl.user_info(user_id)

    n_followers = user_information.follower_count
    n_following = user_information.following_count
    n_posts = user_information.media_count

    # time.sleep(60)
    
    posts_info = {}

    for i, post in enumerate(posts):
        if i >= max_posts:
            posts_info[post.id] = {
                                "n_comments": post.comment_count, 
                                "n_likes": post.like_count, 
                                "caption": post.caption_text, 
                                "comments_text": []
                                }    
        else: 
            posts_info[post.id] = {
                                    "n_comments": post.comment_count, 
                                    "n_likes": post.like_count, 
                                    "caption": post.caption_text, 
                                    "comments_text": extract_text(cl.media_comments_chunk(post.id, max_amount=max_comments))
                                    }                              
    # time.sleep(60)
    user_data = {
        "user_name": username,
        "user_id": user_id,
        "n_followers": n_followers,
        "n_following": n_following,
        "n_posts_total": n_posts,
        "n_posts_retrieved": len(posts_info),
        "n_likes_total": sum([posts_info[key]["n_comments"] for key in posts_info.keys()]),
        "n_likes_retrieved": sum([posts_info[key]["n_likes"] for key in list(posts_info.keys())[:max_posts]]),
        "n_comments_total": sum([posts_info[key]["n_comments"] for key in posts_info.keys()]),
        "n_comments_retrieved": sum([len(posts_info[key]["comments_text"]) for key in posts_info.keys()]),
        "created_at": dt.now(),
        "posts_info": posts_info
    }
    return user_data

# --- TWITTER ---

CONSUMER_KEY = 'P8uR3oU2eTF1hZw2SX1lma8Zw'
CONSUMER_SECRET = 'Kg85hTJdliuTdFroydQvoRsg7cpr5WZm2MdEQtCz8EjcjG04dp'
ACCESS_TOKEN = '386339280-5sy1Smvnkw91cBObIwu3ju6aMQJU4B8X8HZNsDSo'
ACCESS_TOKEN_SECRET = 'WuZNLBRfzZgT6DIDnZOPqaVlt8uMtrvOUJ9aSbUwkmocD'

auth = tweepy.OAuth1UserHandler(
    CONSUMER_KEY, 
    CONSUMER_SECRET, 
    ACCESS_TOKEN, 
    ACCESS_TOKEN_SECRET
)

api = tweepy.API(auth, wait_on_rate_limit=True)

twitter_username = ""

def get_replies(api, username, tweet_id, max_replies=10, max_attempts=20):
    replies = tweepy.Cursor(api.search_tweets, q='to:{}'.format(username),
                                    since_id=tweet_id, tweet_mode='extended').items()

    replies_ls = []

    counter_fetched_rep = 0
    counter_attempts = 0
    while counter_fetched_rep < max_replies and counter_attempts < max_attempts:
        try:
            reply = replies.next()
            if not hasattr(reply, 'in_reply_to_status_id_str'):
                continue
            if reply.in_reply_to_status_id == tweet_id:
                replies_ls.append(reply.full_text)
                counter_fetched_rep = counter_fetched_rep + 1 
            counter_attempts = counter_attempts + 1 

        except StopIteration:
            break

        except Exception as e:
            print("Failed while fetching replies {}".format(e))
            break
    return replies_ls

def get_twitter_data(api, username, max_tweets=5, max_replies=10):
    
    n_followers = api.search_users(username)[0].followers_count
    extracted_tweets = []

    for status in tweepy.Cursor(api.search_tweets, 
                                f"from:{username}",
                                count=max_tweets).items(max_tweets):
        extracted_tweets.append(status)
    retweets = [tweet for tweet in extracted_tweets if "RT @" in tweet.text]
    tweets = set(extracted_tweets) - set(retweets)
    n_retweets = len(retweets)
    n_tweets = len(tweets)
    n_id_ls = []
    n_retweets_ls = []
    n_favorites_ls = []
    tweet_text_ls = []
    for tweet in tweets:
        n_id_ls.append(tweet.id)
        n_retweets_ls.append(tweet.retweet_count)
        n_favorites_ls.append(tweet.favorite_count)
        tweet_text_ls.append(tweet.text)
    replies_ls = [get_replies(api, username, tweet_id, max_replies=max_replies) for tweet_id in n_id_ls]

    user_data = {
        "user_name": username,
        "n_followers": n_followers,
        "n_retweets": n_retweets,
        "n_tweets": n_tweets,
        "n_retweets_to_user": sum(n_retweets_ls),
        "n_favorites_to_user": sum(n_favorites_ls),
        "n_replies_to_user": sum([len(replies_post) for replies_post in replies_ls]),
        "tweets_text": tweet_text_ls,
        "tweets_replies": replies_ls,
        "created_at": dt.now()
    }

    return user_data

def get_comments_ls(data_dict, mode="ig"):
    all_comments = []
    if mode=="ig":
        for post_id in data_dict['posts_info'].keys():
            all_comments.extend(data_dict['posts_info'][post_id]["comments_text"])
    else:
        for tweet_replies in data_dict["tweets_replies"]:
            for comment in tweet_replies:
                all_comments.append(comment)
    return all_comments

Status 429: Too many requests
Ignore 429: Continue login


## Retrieve data from social media

In [9]:
user_data_twitter = get_twitter_data(api, "JFCadavid", max_tweets=10, max_replies=10)

In [10]:
user_data_ig = get_ig_data(cl, "jfcadavid", max_posts=5, max_comments=10)

In [11]:
def get_comments_ls(data_dict, mode="ig"):
    all_comments = []
    if mode=="ig":
        for post_id in data_dict['posts_info'].keys():
            all_comments.extend(data_dict['posts_info'][post_id]["comments_text"])
    else:
        for tweet_replies in user_data_twitter["tweets_replies"]:
            for comment in tweet_replies:
                all_comments.append(comment)
    return all_comments

## Compute the sentiment for user accounts (IG and Twitter) 

In [12]:
# Instagram
# posts_comments_ig = [user_data_ig['posts_info'][post_id]['comments_text'] for post_id in user_data_ig['posts_info'].keys()]
posts_comments_ig = get_comments_ls(user_data_ig, mode="ig")
sentiment_instagram = get_sentiment(posts_comments_ig, language_detector, analyzer_en, analyzer_es, tokenizer)

In [15]:
# Twitter
posts_comments_tw = get_comments_ls(user_data_twitter, mode="tw")
sentiment_twitter = get_sentiment(posts_comments_ig, language_detector, analyzer_en, analyzer_es, tokenizer)

In [16]:
posts_comments_tw

['@JFCadavid Uruguay es mucho más equipo, solo que los técnicos son miedosos',
 '@JFCadavid Solo Brazil esta por encima.',
 '@JFCadavid Compara a Pasto tercero con 4 puntos con Santafe LÍDER del otro grupo…..un poco parcializado',
 '@JFCadavid Señor inflador de millos... Porque no hablamos de tu equipo inflado....ya vez llegar la eliminación y lo que nos vamos a reír en diciembre por cuanta de millos city ......',
 '@JFCadavid Pues si no le alcanza a Santa Fe siendo líder, imagínese al resto... qué titular tan tendencioso...',
 '@JFCadavid Le alcanzará a Santa fe ? Si Santa fe es el líder .. más bien le alcanzará a millonarios???',
 '@JFCadavid Los del yuyu xq tenían la mejor nómina del país y clasificaron raspando podríamos decir q el yuyu y nacional amb9s fracasaron en el semestre salvo q nacional quedo campeón en la liga anterior',
 '@JFCadavid Ojalá les alcance la trampa Juan Felipe, Viera demostró cual era el objetivo de Junior y no me vengan a salir con que no hubo nada raro, com

In [52]:
sentiment_twitter

[0.41033692056399884,
 0.5,
 0.3339894809149695,
 0.27873771263383934,
 0.27252542562257887]

In [53]:
print(f"Instagram sentiment: {np.mean(sentiment_instagram)}")
print(f"Twitter sentiment: {np.mean(sentiment_twitter)}")

Instagram sentiment: 0.4369447707948009
Twitter sentiment: 0.3591179079470773


In [54]:
user_data_ig

{'user_name': 'jfcadavid',
 'user_id': '192185562',
 'n_followers': 130317,
 'n_following': 756,
 'n_posts_total': 2834,
 'n_posts_retrieved': 5,
 'n_likes_total': 5049,
 'n_comments_total': 70,
 'date': '21/11/2022',
 'posts_info': {'2975297232093322943_192185562': {'n_comments': 56,
   'n_likes': 1230,
   'caption': '💔 ¿quién más así?',
   'comments_text': ['Yo 😭',
    '😢',
    '🙋🏽\u200d♂️ 😥',
    '😢😢😢😢',
    '☝🏻',
    '😢',
    '\U0001fae4🙋🏼\u200d♀️',
    'Mala noticia el mejor del mundo',
    '💔',
    'Valbuena debe estar igual y las chicas que conocieron con Roberi también... es un crack, es un animal en la cancha, pero la vida se encarga de cobrarte todo, algunos lo llaman Karma 🙌',
    '😢😢😢',
    '☝️',
    '@john_alex86 si',
    '😢🙋🏼\u200d♀️😢',
    'BENZEMA ES MAS SALADO QUE ZIPAQUIRA.  UN NGRAN GOLEADOR SE VA DEL MUNDIAL.  COSAS DE LA VIDA.  Y COLOMBIA EXTRAÑANDO A  LOS PECHIFRIOS QUE DE PREMIO POR SER ELIMINADOS LOS LLEVAN DE PASEO A ESTADOS.  COSAS DEL FÚTBOL',
    'Probableme

## Create a dummy dict with account users

In [5]:
users_dict = {
    "cristiano": {
        "instagram": "cristiano",
        "twitter": "Cristiano"
    },
    "messi": {
        "instagram": "leomessi",
        "twitter": "Ieomessiok"
    },
    "lebron": {
        "instagram": "lebron",
        "twitter": "KingJames"
    },
    "nadal": {
        "instagram": "rafaelnadal",
        "twitter": "RafaelNadal"
    },
    "mayweather": {
        "instagram": "floydmayweather",
        "twitter": "FloydMayweather"
    }
}

## Create a gigamethod to process everything

In [90]:
def get_spi(user_dict):
# Twitter

    start_time = time.time()
    print("Getting twitter profile data")
    twitter_username = user_dict["twitter"]
    user_data_twitter = get_twitter_data(api, twitter_username, max_tweets=10, max_replies=10)

    posts_comments_tw = get_comments_ls(user_data_twitter, mode="tw")
    sentiment_twitter = get_sentiment(posts_comments_tw, language_detector, analyzer_en, analyzer_es, tokenizer)
    user_data_twitter.pop('tweets_text', None)
    user_data_twitter.pop('tweets_replies', None)
    user_data_twitter["sentiment_twitter"] = sentiment_twitter
    print(f"Elapsed time: {(time.time() - start_time)} secs")
    
    
# Instagram
    start_time = time.time()
    print("Getting instagram profile data")
    ig_username = user_dict["instagram"]
    user_data_ig = get_ig_data(cl,ig_username, max_posts=5, max_comments=10)

    posts_comments_ig = get_comments_ls(user_data_ig, mode="ig")
    sentiment_instagram = get_sentiment(posts_comments_ig, language_detector, analyzer_en, analyzer_es, tokenizer)
    user_data_ig.pop('posts_info', None)
    user_data_ig["sentiment_instagram"] = sentiment_instagram
    print(f"Elapsed time: {(time.time() - start_time)} secs")

    return user_data_twitter, user_data_ig

In [None]:
user_data_twitter, user_data_ig = get_spi(users_dict["cristiano"])

In [92]:
start_time_allp = time.time()
users_data = {}

for user in users_dict.keys():
    print(f"Working on {user}")
    user_data_twitter, user_data_ig = get_spi(users_dict[user])
    users_data[user] = {}
    users_data[user]["twitter"] = user_data_twitter
    users_data[user]["instagram"] = user_data_ig
    print(f"Done with {user}")
    print("-----------------")
    time.sleep(60)
print(f"Total elapsed time: {(time.time()-start_time_allp)} secs")

Working on cristiano
Getting twitter profile data
Elapsed time: 87.92663049697876 secs
Getting instagram profile data
Elapsed time: 256.2639436721802 secs
Done with cristiano
-----------------
Working on messi
Getting twitter profile data
Elapsed time: 14.084624290466309 secs
Getting instagram profile data
Elapsed time: 155.21371245384216 secs
Done with messi
-----------------
Working on lebron
Getting twitter profile data
Elapsed time: 8.368600368499756 secs
Getting instagram profile data
Elapsed time: 228.44292306900024 secs
Done with lebron
-----------------
Working on nadal
Getting twitter profile data
Elapsed time: 3.6094443798065186 secs
Getting instagram profile data
Elapsed time: 137.0606071949005 secs
Done with nadal
-----------------
Working on mayweather
Getting twitter profile data
Elapsed time: 1.1450352668762207 secs
Getting instagram profile data
Elapsed time: 121.41418385505676 secs
Done with mayweather
-----------------
Total elapsed time: 1315.1272485256195 secs


In [95]:
users_data_instagram = [users_data[username]["instagram"] for username in users_data.keys()]
df_ig = pd.DataFrame(users_data_instagram)
df_ig["n_likes_retrieved"] = abs(df_ig["n_likes_retrieved"])
df_ig["engagement"] = (df_ig["n_likes_total"] + df_ig["n_comments_total"])/(df_ig["n_followers"])*100
df_ig


Unnamed: 0,user_name,user_id,n_followers,n_following,n_posts_total,n_posts_retrieved,n_likes_total,n_likes_retrieved,n_comments_total,n_comments_retrieved,created_at,sentiment_instagram,engagement
0,cristiano,173560420,502741773,523,3402,3398,85695526,54750995,85695526,80,2022-11-24 16:37:07.291570,0.69624,34.091269
1,leomessi,427553890,378390987,286,949,949,33419326,56111341,33419326,191,2022-11-24 16:40:27.454016,0.583508,17.663912
2,lebron,7855453810,661215,34,2956,2955,142086,33001,142086,160,2022-11-24 16:46:01.935118,0.638398,42.977246
3,rafaelnadal,1938502255,17074848,207,1244,1243,2245446,926669,2245446,170,2022-11-24 16:49:10.334872,0.707313,26.301212
4,floydmayweather,16264572,28522578,317,1044,1043,4564730,1013098,4564730,193,2022-11-24 16:52:16.778585,0.571865,32.007836


In [73]:
df_ig["n_followers"].quantile(0.8)

403261144.20000005

In [77]:
from scipy import stats
stats.percentileofscore(df_ig["n_followers"], 17074848)

40.0

In [86]:
# users_data_twitter = [users_data[username]["twitter"] for username in users_data.keys()]
df_tw = pd.DataFrame(users_data_twitter)
df_tw["relative_engagement_per_post"] = (df_tw["n_favorites_to_user"] + df_tw["n_retweets_to_user"] + + df_tw["n_replies_to_user"])/(df_tw["n_followers"]*df_tw["n_tweets"])*100
df_tw


Unnamed: 0,user_name,n_followers,n_retweets,n_tweets,n_retweets_to_user,n_favorites_to_user,n_replies_to_user,created_at,sentiment_twitter,relative_engagement_per_post
0,Cristiano,105308453,0,10,392256,3884065,6,2022-11-24 15:31:16.871960,0.496363,0.406076
1,Ieomessiok,559544,0,10,16945,383509,10,2022-11-24 15:38:13.685011,0.634433,7.156971
2,KingJames,52476180,2,7,4034,55621,3,2022-11-24 15:41:27.702786,0.663781,0.016241
3,RafaelNadal,15829515,0,1,165,1624,2,2022-11-24 15:46:14.343182,0.430537,0.011314
4,FloydMayweather,7831358,0,0,0,0,0,2022-11-24 15:49:20.490307,0.5,


In [87]:
users_data_twitter

[{'user_name': 'Cristiano',
  'n_followers': 105308453,
  'n_retweets': 0,
  'n_tweets': 10,
  'n_retweets_to_user': 392256,
  'n_favorites_to_user': 3884065,
  'n_replies_to_user': 6,
  'created_at': datetime.datetime(2022, 11, 24, 15, 31, 16, 871960),
  'sentiment_twitter': 0.49636333840584906},
 {'user_name': 'Ieomessiok',
  'n_followers': 559544,
  'n_retweets': 0,
  'n_tweets': 10,
  'n_retweets_to_user': 16945,
  'n_favorites_to_user': 383509,
  'n_replies_to_user': 10,
  'created_at': datetime.datetime(2022, 11, 24, 15, 38, 13, 685011),
  'sentiment_twitter': 0.6344329414317261},
 {'user_name': 'KingJames',
  'n_followers': 52476180,
  'n_retweets': 2,
  'n_tweets': 7,
  'n_retweets_to_user': 4034,
  'n_favorites_to_user': 55621,
  'n_replies_to_user': 3,
  'created_at': datetime.datetime(2022, 11, 24, 15, 41, 27, 702786),
  'sentiment_twitter': 0.6637814821552505},
 {'user_name': 'RafaelNadal',
  'n_followers': 15829515,
  'n_retweets': 0,
  'n_tweets': 1,
  'n_retweets_to_user

In [23]:
np.clip([10,2,3], a_min=1, a_max=5)

array([5, 2, 3])

engagement = (n_likes+n_comments)/(n_followers)*100

In [16]:
(48225587+81)/(502741773/5)*100

47.96266253371391

In [17]:
(56006302+191)/(378390987/5)*100

74.00611394583771

---

In [None]:
user_data_ig = get_ig_data(cl, "jfcadavid", max_posts=5, max_comments=10)
user_data_twitter = get_twitter_data(api, "JFCadavid", max_tweets=10, max_replies=10)

In [None]:
# Instagram
posts_comments_ig = [user_data_ig['posts_info'][post_id]['comments_text'] for post_id in user_data_ig['posts_info'].keys()]
sentiment_instagram = [get_sentiment(comments, language_detector, analyzer_en, analyzer_es, tokenizer) for comments in posts_comments_ig]

# Twitter
posts_comments_tw = user_data_twitter['tweets_replies']
sentiment_twitter= [get_sentiment(comments, language_detector, analyzer_en, analyzer_es, tokenizer) for comments in posts_comments_tw]

In [65]:
user_data_ig

{'user_name': 'jfcadavid',
 'user_id': '192185562',
 'n_followers': 130317,
 'n_following': 756,
 'n_posts_total': 2834,
 'n_posts_retrieved': 5,
 'n_likes_total': 5049,
 'n_comments_total': 70,
 'date': '21/11/2022',
 'posts_info': {'2975297232093322943_192185562': {'n_comments': 56,
   'n_likes': 1230,
   'caption': '💔 ¿quién más así?',
   'comments_text': ['Yo 😭',
    '😢',
    '🙋🏽\u200d♂️ 😥',
    '😢😢😢😢',
    '☝🏻',
    '😢',
    '\U0001fae4🙋🏼\u200d♀️',
    'Mala noticia el mejor del mundo',
    '💔',
    'Valbuena debe estar igual y las chicas que conocieron con Roberi también... es un crack, es un animal en la cancha, pero la vida se encarga de cobrarte todo, algunos lo llaman Karma 🙌',
    '😢😢😢',
    '☝️',
    '@john_alex86 si',
    '😢🙋🏼\u200d♀️😢',
    'BENZEMA ES MAS SALADO QUE ZIPAQUIRA.  UN NGRAN GOLEADOR SE VA DEL MUNDIAL.  COSAS DE LA VIDA.  Y COLOMBIA EXTRAÑANDO A  LOS PECHIFRIOS QUE DE PREMIO POR SER ELIMINADOS LOS LLEVAN DE PASEO A ESTADOS.  COSAS DEL FÚTBOL',
    'Probableme

In [57]:
user_data_ig_ls = user_data_ig.copy()

In [60]:
user_data_ig_ls

{'user_name': 'jfcadavid',
 'user_id': '192185562',
 'n_followers': 130317,
 'n_following': 756,
 'n_posts_total': 2834,
 'n_posts_retrieved': 5,
 'n_likes_total': 5049,
 'n_comments_total': 70,
 'date': '21/11/2022'}