# Sentiment Model for Tweets

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Directory that holds the input workbook (tweets.xlsx). The path is relative,
# so it is resolved against the notebook's current working directory.
DATA_BASE_PATH = "./data"

In [2]:
# Read both sheets in one call so the workbook is opened and parsed only once.
# Passing a list to `sheet_name` returns a dict of DataFrames keyed by sheet name.
raw_sheets = pd.read_excel(
    f"{DATA_BASE_PATH}/tweets.xlsx", sheet_name=["users", "tweets"]
)

# Select the tweet-level columns and rename them to snake_case.
tweets = raw_sheets["tweets"][
    ["Tweet Id", "Name", "UTC", "Favorites", "Retweets", "Text"]
].rename(
    columns={
        "Tweet Id": "tweet_id",
        "Name": "user",
        "UTC": "tweet_date",
        "Favorites": "count_favorites",
        "Retweets": "count_retweets",
        "Text": "text",
    }
)

# Select the author-level columns and rename them to snake_case.
users = raw_sheets["users"][["Followers", "Tweets", "Verified", "Location"]].rename(
    columns={
        "Followers": "count_followers",
        "Tweets": "count_all_tweets",
        "Verified": "is_verified",
        "Location": "location",
    }
)

# Combine the two sheets side by side. `concat(axis=1)` aligns on the row index,
# not on a key column.
# NOTE(review): this presumes row i of "users" describes the author of row i of
# "tweets" -- confirm against the workbook; a length mismatch would silently
# produce NaN-padded rows.
final = pd.concat([tweets, users], axis=1).assign(
    # Parse the timestamp first; the day/month/year columns below read the
    # already-converted column because `assign` evaluates keywords in order.
    tweet_date=lambda df: pd.to_datetime(df["tweet_date"]),
    tweet_day=lambda df: df["tweet_date"].dt.day,
    tweet_month=lambda df: df["tweet_date"].dt.month,
    tweet_year=lambda df: df["tweet_date"].dt.year,
    # City is everything before the first comma of "location"
    # (e.g. "Detroit, MI" -> "Detroit"); missing locations stay missing.
    city=lambda df: df["location"].str.split(",", expand=True, n=1)[0],
)

In [3]:
# Load the fine-tuned classification model and its tokenizer from the local
# "model" directory, then wrap both in a text-classification pipeline.
model = AutoModelForSequenceClassification.from_pretrained("model")
tokenizer = AutoTokenizer.from_pretrained("model")

# return_all_scores=True makes the pipeline report a score for every label
# rather than only the top one.
# NOTE(review): newer transformers releases flag `return_all_scores` as
# deprecated in favour of `top_k=None`; that switch may reorder the returned
# scores -- verify the output column order before migrating.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)



In [4]:
# Index the tweets by tweet_id so the scores can be joined back on that key.
# This is guarded and not done in place, so re-running the cell does not raise
# a KeyError once "tweet_id" has already been moved into the index.
if final.index.name != "tweet_id":
    final = final.set_index("tweet_id")

# Score every tweet. `classifier(tweet)[0]` is a list of {"label", "score"}
# dicts (one per label); each is flattened to a {label: score} mapping and the
# frame is built once from all rows instead of creating one Series per row.
label_scores = [
    {entry["label"]: entry["score"] for entry in classifier(tweet)[0]}
    for tweet in final["text"]
]
text = pd.DataFrame(label_scores, index=final.index)

# The predicted label is the highest-scoring column of each row. It must be
# computed before the prefix is added so the values stay bare label names.
text["label"] = text.idxmax(axis="columns")
text.columns = [f"sentiment_{y}" for y in text.columns]

# Put it back together: tweet/user columns plus sentiment_* columns.
result = final.join(text)
result

Unnamed: 0_level_0,user,tweet_date,count_favorites,count_retweets,text,count_followers,count_all_tweets,is_verified,location,tweet_day,tweet_month,tweet_year,city,sentiment_sadness,sentiment_joy,sentiment_love,sentiment_anger,sentiment_fear,sentiment_surprise,sentiment_label
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1573293537546084355,Aaron Baughman,2022-09-23 12:49:44+00:00,0,0,RT @MMPatriotPride : 8th grade learning from @...,1196,3876,False,,23,9,2022,,0.000380,0.998569,0.000455,0.000232,0.000128,0.000237,joy
1573292617731051523,MeadsMillPatriots,2022-09-23 12:46:05+00:00,5,1,8th grade learning from @DTE_Energy and @Consu...,247,206,False,"Northville, MI",23,9,2022,Northville,0.000416,0.998236,0.000723,0.000255,0.000121,0.000248,joy
1573284344461987841,Jessica Kelsey,2022-09-23 12:13:12+00:00,0,0,@DTE_Energy question I am wondering why they t...,201,2720,False,"Lincoln Park, MI",23,9,2022,Lincoln Park,0.001939,0.002205,0.000395,0.060334,0.932273,0.002855,fear
1573275442349740033,Laura Dodd,2022-09-23 11:37:50+00:00,0,0,RT @MarkCavitt : NOW: SMART unveils first of f...,496,834,False,"Milford, MI",23,9,2022,Milford,0.005076,0.969966,0.001589,0.014690,0.007900,0.000779,joy
1573274884439941120,John Wallace,2022-09-23 11:35:37+00:00,0,0,"@b_hockey25 @DetroitRedWings @DTE_Energy Hey, ...",497,8477,False,"Detroit, MI",23,9,2022,Detroit,0.006556,0.059888,0.002747,0.923784,0.005949,0.001076,anger
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1570352475265470464,BANKOLE THOMPSON,2022-09-15 10:03:00+00:00,0,0,RT @InstitutePu : The nationally renowned Blac...,2443,8551,True,"Detroit, MI",15,9,2022,Detroit,0.003023,0.981682,0.001734,0.009837,0.002241,0.001482,joy
1570344087542439942,The PuLSE Institute,2022-09-15 09:29:40+00:00,4,4,The nationally renowned Black journalist and c...,406,2091,False,"Detroit, MI",15,9,2022,Detroit,0.003104,0.978111,0.001661,0.012501,0.002780,0.001844,joy
1570341239538741248,The PuLSE Institute,2022-09-15 09:18:21+00:00,6,4,"Our virtual Conference on the Economy, Equity ...",406,2091,False,"Detroit, MI",15,9,2022,Detroit,0.002491,0.988219,0.001471,0.005610,0.001566,0.000644,joy
1570204465441153024,Daniel Gilbert,2022-09-15 00:14:52+00:00,0,0,"RT @DTE_Energy : Know how to smell, see and he...",23,2081,False,,15,9,2022,,0.023865,0.459526,0.002449,0.093466,0.416697,0.003997,joy


In [5]:
# Persist the scored tweets (the tweet_id index is written as the first column)
# to the same data directory the inputs are read from, using the shared
# DATA_BASE_PATH constant instead of a second hard-coded "data/" path.
result.to_csv(f"{DATA_BASE_PATH}/scored_data.csv")