In [160]:
import os

# Credentials are read from the environment so real secrets never have to be
# written into the notebook. The literal placeholders are kept only as
# fallbacks: export the variables (or replace the placeholders) before running.
endpoint_url = os.environ.get("AZURE_OPENAI_ENDPOINT", 'xxx')
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "xxx")

## Classification with GPT-4

In [46]:
import json
import pandas as pd

In [161]:
from openai import AzureOpenAI


# Passed as `model=` to the chat-completion calls.
deployment_name = 'xxx'

# Client used by the chat-completion calls in this section.
client = AzureOpenAI(azure_endpoint=endpoint_url, api_key=api_key, api_version="2024-06-01")
    

In [162]:
# helper that extracts the reply text from an API response
def return_text(response):
    """Return the message content of the first choice in ``response``.

    The response is serialised with ``model_dump_json()`` and parsed back
    into plain dicts/lists before the content is looked up.
    """
    payload = json.loads(response.model_dump_json())
    first_choice = payload['choices'][0]
    return first_choice['message']['content']

In [5]:
# import dataset (1000 most popular tweets)
df = pd.read_csv("tweets_clean.csv")
# sample 50 tweets; the fixed random_state makes the draw repeatable, so
# re-running this cell rewrites sampled.csv with the same rows instead of a
# different random subset each time
sampled = df.sample(n=50, random_state=0)
# save sample dataset
sampled.to_csv("sampled.csv", index=False)

In [7]:
# load the sample from sampled.csv so later cells work from the file on disk
sampled = pd.read_csv("sampled.csv")

In [8]:
# count the tokens in the sampled tweet texts with the gpt-4 tokenizer
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4")

token_counts = [len(encoding.encode(text)) for text in sampled['clean_text']]
num_tokens = sum(token_counts)

num_tokens

1963

In [None]:
# Few-shot system prompt for open-ended topic exploration: it gives three
# labelled example tweets and asks for the proposed topics as one
# comma-separated list introduced by "topics:".
prompt = '''You are a social media researcher. You need to read tweets about covid-19 posted during Jan and Feb 2020. 
After reading all tweets, propose the topic of each tweet, topic categories are not limited to those shown in the examples. 
## 
examples: 
Tweet 1: how to wear a mask 
topic: information 
Tweet 2: I am a gambling man, and I would bet my net worth that the actual death toll of the #coronavirus is significantly more than what is being reported by mainstream media.
topic: personal opinion
Tweet 3: Coronavirus is a biowarfare
topic: misinformation
##
output format:
topics: <comma_separated_list_of_topics>
'''

In [None]:
# build one string from the first 50 sampled tweets, each rendered as "Tweet<i>: <text> "
tweet_parts = [
    "Tweet" + str(idx) + ": " + sampled['clean_text'].iloc[idx] + " "
    for idx in range(50)
]
tweet_full = "".join(tweet_parts)
tweet_full


In [58]:
# Ask the model to propose topics: the few-shot prompt is the system message,
# the combined tweets are the user message.
topic_messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": tweet_full},
]
response = client.chat.completions.create(
    model=deployment_name, messages=topic_messages, temperature=0
)

In [63]:
# show the text content of the model's reply
return_text(response)

topics: <situation_update, situation_update, situation_update, situation_update, response, situation_update, situation_update, response, humor, situation_update, personal_opinion, situation_update, situation_update, personal_opinion, misinformation, misinformation, situation_update, response, misinformation, humor, situation_update, misinformation, response, personal_opinion, situation_update, situation_update, response, situation_update, situation_update, situation_update, personal_opinion, situation_update, personal_opinion, situation_update, situation_update, personal_opinion, situation_update, personal_opinion, situation_update, humor, information, response, situation_update, misinformation, situation_update, response, situation_update, situation_update, situation_update, personal_opinion, situation_update>


In [None]:
label = return_text(response)

# The prompt asks for "topics: <comma_separated_list_of_topics>", and the model
# can echo the "topics:" prefix and the angle brackets literally. Remove both
# before splitting so the first and last labels are not polluted.
label_body = label.strip()
if label_body.lower().startswith("topics:"):
    label_body = label_body[len("topics:"):]
label_body = label_body.strip().strip('<>')

topics_list = [topic.strip() for topic in label_body.split(',')]
topics_list

In [68]:
# store the parsed labels as a new "topic" column on the sampled tweets
# (one label per row is expected)
sampled["topic"] = topics_list

In [175]:
# classify top 100 tweets into 7 categories
# System prompt for the closed-set classification: it names the seven allowed
# categories plus an "unclassifiable" fallback, gives one example tweet for six
# of the categories, and requests the answer as "index:topic" pairs separated
# by commas.
class_prompt = '''You are a social media researcher. You need to read tweets about covid-19 posted during Jan and Feb 2020. 
Classify each tweet into one of the seven topic categories: <situation_update, personal_opinion, misinformation,individual_story,health_information,humor,public_reaction>
If you can't classify into any category, mark it as unclassifiable
## 
examples: 
Tweet 1: how to wear a mask 
topic: health_information 
```
Tweet 2: I am a gambling man, and I would bet my net worth that the actual death toll of the #coronavirus is significantly more than what is being reported by mainstream media.
topic: personal_opinion
```
Tweet 3: Coronavirus is a biowarfare
topic: misinformation
```
Tweet 4: A Chinese mother, blocked from entering Australia due to the coronavirus travel ban, is losing hope of being able to see her only son before doctors in Melbourne turn off his life support. Utterly tragic
topic: individual_story
```
Tweet 5: #TNGovt is making swift measures with spl approval from @MoHFW_INDIA to setup a #Virologylab at #KingsInstitute,Chennai for faster turnaround of blood test results.   
topic: situation_update
```
Tweet 6: Medical workers in Hong Kong say they will go on strike if authorities dont stop all visitors from China from entering Hong Kong as the #coronavirus outbreak continues.
topic: public_reaction
##
output format:
0:topic,1:topic,etc

'''

In [164]:
# combine tweets to a string
def combine_tweet(start,end,data):
    """Concatenate the ``clean_text`` of rows ``start`` .. ``end - 1`` of ``data``.

    Rows are addressed by position (``iloc``). Each tweet is rendered as
    ``"Tweet<i>: <text> "`` and the pieces are joined without any extra
    separator; an empty range yields an empty string.
    """
    pieces = [
        "Tweet" + str(pos) + ": " + data['clean_text'].iloc[pos] + " "
        for pos in range(start, end)
    ]
    return "".join(pieces)

In [None]:
# classify the first 50 tweets of df (positions 0-49)
tweet_full = combine_tweet(0, 50, df)
first_batch_messages = [
    {"role": "system", "content": class_prompt},
    {"role": "user", "content": tweet_full},
]
response = client.chat.completions.create(
    model=deployment_name, messages=first_batch_messages, temperature=0
)

In [None]:
# classify tweets 50-99 of df
# combine_tweet requires the source frame as its third argument; calling it
# with only the two positions raises TypeError.
tweet_full = combine_tweet(50, 100, df)
response2 = client.chat.completions.create(
    model=deployment_name,
    messages=[
        {"role": "system", "content": class_prompt},
        {"role": "user", "content": tweet_full}
    ],
    temperature=0
)

In [None]:
# class_prompt asks for "0:topic,1:topic,...", so each comma-separated item can
# carry an "<index>:" prefix. Keep only the text after the first colon; items
# without a colon are left unchanged.
label1 = return_text(response)
topics_list1 = [item.split(':', 1)[-1].strip() for item in label1.split(',')]
label2 = return_text(response2)
topics_list2 = [item.split(':', 1)[-1].strip() for item in label2.split(',')]
# labels for the first batch followed by labels for the second batch
topics_list = topics_list1 + topics_list2

In [88]:
# export df with topic
# start with an empty label for every row
df["topic"] = ""
# .loc slices by label and includes both ends, so 0:99 addresses 100 rows
# (assuming the default integer index — confirm)
df.loc[0:99,'topic'] = topics_list
# NOTE(review): this writes a file named "tweets_topic" (no extension), while
# later cells read "tweets_topic.csv" — confirm which file name is intended.
df.to_csv("tweets_topic",index=False)

In [184]:
# load the topic file; per the original note it also contains the sampled
# tweets' topics, giving 137 labelled tweets — TODO confirm how the file was assembled
df_topic = pd.read_csv("tweets_topic.csv")

In [131]:
# accuracy of gpt classification on the first 100 tweets:
# the share of those rows that have no value in topic_revised
revised_first_100 = df_topic['topic_revised'].iloc[0:100]
n_revised = revised_first_100.notnull().sum()
(100 - n_revised)/100

np.float64(0.78)

In [135]:
# examine gpt performance: rows among the first 100 that carry a revised topic
df100 = df_topic.iloc[0:100]
was_revised = df100['topic_revised'].notnull()
df100.loc[was_revised, ['topic','topic_revised']]

Unnamed: 0,topic,topic_revised
3,public_reaction,situation_update
13,situation_update,public_reaction
22,situation_update,misinformation
35,misinformation,situation_update
38,situation_update,health_information
39,humor,health_information
40,individual_story,situation_update
41,situation_update,unclassifiable
42,situation_update,individual_story
43,misinformation,situation_update


In [185]:
# first 100 rows of df_topic, selected by position
df100 = df_topic.iloc[0:100]

In [186]:
# how often each revised label occurs among the first 100 rows
df100['topic_revised'].value_counts()

topic_revised
personal_opinion      8
situation_update      6
individual_story      3
public_reaction       2
health_information    2
misinformation        1
unclassifiable        1
Name: count, dtype: int64

In [134]:
# topic distribution for 137 tweets
df_topic['topic_final'] = ""
# final label for the first 100 rows: the revised topic where one exists,
# otherwise the original topic
first_100_final = df100['topic_revised'].fillna(df100['topic']).to_list()
# Assign through a single .loc call on df_topic. The chained form
# df_topic['topic_final'].iloc[...] = ... may write to a temporary copy and
# leave df_topic unchanged.
df_topic.loc[df_topic.index[0:100], 'topic_final'] = first_100_final
df_topic['topic_final'].value_counts()

topic_final
situation_update      51
personal_opinion      26
individual_story      20
health_information    14
public_reaction       12
unclassifiable         6
misinformation         6
humor                  2
Name: count, dtype: int64

In [157]:
# share of each final topic
# NOTE(review): 137 is hard-coded; presumably the number of labelled tweets — confirm
df_topic['topic_final'].value_counts()/137

topic_final
situation_update      0.372263
personal_opinion      0.189781
individual_story      0.124088
public_reaction       0.109489
health_information    0.102190
unclassifiable        0.043796
misinformation        0.043796
humor                 0.014599
Name: count, dtype: float64

## Classification with word embeddings


In [57]:
endpoint_url_embedding = 'xxx'
# Bound to its own name so that the chat-completions `client` is not
# overwritten: a later cell still calls client.chat.completions.create(...),
# and on a top-to-bottom run it would otherwise hit this embedding client.
embedding_client = AzureOpenAI(
    api_key=api_key,
    api_version="2023-05-15",
    azure_endpoint=endpoint_url_embedding,
)
    

In [67]:
# function to generate embeddings
def generate_embeddings(text):
    """Request "text-embedding-3-small" embeddings for ``text``.

    A new AzureOpenAI client is built on every call from the module-level
    ``api_key`` and ``endpoint_url_embedding``. Returns the ``data``
    attribute of the embeddings response.
    """
    emb_client = AzureOpenAI(
        azure_endpoint=endpoint_url_embedding,
        api_key=api_key,
        api_version="2023-05-15",
    )
    emb_response = emb_client.embeddings.create(model="text-embedding-3-small", input=text)
    return emb_response.data

In [150]:
df_topic = pd.read_csv("tweets_topic.csv")
is_labelled = df_topic['topic_final'].notnull()
# train dataset: rows that have a topic_final value (137 labeled tweets per the original note)
df_train = df_topic.loc[is_labelled, ['clean_text','topic_final']]
# test dataset: the first 63 rows without a topic_final value, columns 6 and 9 picked by position
df_test = df_topic.loc[~is_labelled].iloc[0:63,[6,9]]

In [87]:
# get embeddings for the clean_text of the training rows
train_emb = generate_embeddings(df_train['clean_text'])

In [90]:
# get embeddings for the clean_text of the test rows
test_emb = generate_embeddings(df_test['clean_text'])

In [104]:
# store embeddings to dataset: one row per item, one "embedding_<k>" column per vector position
train_emb_list = [item.embedding for item in train_emb]
train_width = len(train_emb_list[0])
columns_name = ["embedding_" + str(k) for k in range(train_width)]
train = pd.DataFrame(train_emb_list, columns=columns_name)

In [105]:
# same layout for the test embeddings: one row per item, "embedding_<k>" columns
test_emb_list = [item.embedding for item in test_emb]
test_width = len(test_emb_list[0])
columns_name = ["embedding_" + str(k) for k in range(test_width)]
test = pd.DataFrame(test_emb_list, columns=columns_name)

In [97]:
# Use random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [153]:
# random forest with 100 trees and a fixed random_state,
# fitted on the training embeddings against the topic_final labels
clf = RandomForestClassifier(random_state=0, n_estimators=100)
clf.fit(train, df_train['topic_final'])


In [180]:
# predict a topic for every row of the test embeddings and write the predictions to preds.csv
preds = clf.predict(test)
preds_frame = pd.DataFrame(preds)
preds_frame.to_csv("preds.csv", index=False)

In [181]:
# import test set with model predicted labels and human-verified labels
# NOTE(review): this rebinds df_test to the contents of df_test.csv; the file is
# not written by any visible cell — confirm how it was produced and that its
# row order matches the order of `preds`.
df_test = pd.read_csv("df_test.csv")

In [182]:
# Per-class metrics of the random-forest predictions against topic_final.
# zero_division=0 reports 0.0 for metrics whose denominator is zero (the same
# value the default setting prints) without emitting an UndefinedMetricWarning
# for each of them.
report = classification_report(df_test['topic_final'], preds, zero_division=0)
print(report)

                    precision    recall  f1-score   support

health_information       1.00      1.00      1.00         1
  individual_story       0.50      0.12      0.20         8
    misinformation       0.00      0.00      0.00         4
  personal_opinion       0.86      0.50      0.63        12
   public_reaction       1.00      0.44      0.62         9
  situation_update       0.55      1.00      0.71        27
    unclassifiable       0.00      0.00      0.00         2

          accuracy                           0.62        63
         macro avg       0.56      0.44      0.45        63
      weighted avg       0.62      0.62      0.55        63



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [176]:
# use gpt-4 to classify test dataset (rows 0-62 of df_test)
tweet_full = combine_tweet(0, 63, df_test)
test_messages = [
    {"role": "system", "content": class_prompt},
    {"role": "user", "content": tweet_full},
]
response = client.chat.completions.create(
    model=deployment_name, messages=test_messages, temperature=0
)

In [None]:
label = return_text(response)
# class_prompt asks for "0:topic,1:topic,...", so each comma-separated item can
# carry an "<index>:" prefix. Keep only the text after the first colon; items
# without a colon are left unchanged.
topics_list = [item.split(':', 1)[-1].strip() for item in label.split(',')]
topics_list

In [183]:
# per-class metrics of the gpt4 column against topic_final
y_true = df_test['topic_final']
y_gpt4 = df_test['gpt4']
report = classification_report(y_true, y_gpt4)
print(report)

                    precision    recall  f1-score   support

health_information       0.00      0.00      0.00         1
  individual_story       0.78      0.88      0.82         8
    misinformation       0.57      1.00      0.73         4
  personal_opinion       1.00      0.25      0.40        12
   public_reaction       0.53      0.89      0.67         9
  situation_update       0.88      0.85      0.87        27
    unclassifiable       1.00      1.00      1.00         2

          accuracy                           0.75        63
         macro avg       0.68      0.70      0.64        63
      weighted avg       0.81      0.75      0.73        63

