In [1]:
import pandas as pd
import os
import json
import openai
import numpy as np
import time

### Read dataset

In [2]:
df_raw = pd.read_csv('SentimentLabeled_10112022.csv')
df = df_raw.drop_duplicates()
df = df[(df['country'] == 'China') & (df['SentimentScore'].isnull() == False)]
df = df[['id', 'text', 'SentimentScore']]

df.head()

Unnamed: 0,id,text,SentimentScore
1,3559992,"Socialist tyrant Maduro starves, tortures his ...",2.0
7,3467650,"Russia, China, India, Pakistan, North Korea, I...",2.0
10,3797537,"Ileana Ros-Lehtinen, Ted Yoho Condemn China's ...",1.5
14,3560002,"BREAKING: In Oval Office meeting today, the Ch...",4.5
15,3560002,"BREAKING: In Oval Office meeting today, the Ch...",4.0


In [3]:
df_cleaned = df.drop_duplicates(subset='id', keep=False)
df_cleaned = df_cleaned[df_cleaned.SentimentScore.apply(float.is_integer)]
df_cleaned

Unnamed: 0,id,text,SentimentScore
1,3559992,"Socialist tyrant Maduro starves, tortures his ...",2.0
7,3467650,"Russia, China, India, Pakistan, North Korea, I...",2.0
19,3526981,Gov Branstad has longstanding relationship w P...,4.0
23,3560025,"They partner with foreign adversaries, includi...",1.0
24,3560070,Russia has been in violation of the INF treaty...,2.0
...,...,...,...
15835,1311393101492219905,Politics should stop at the water’s edge. \n\n...,2.0
15837,1311347001699823617,We need to end America's dependence on China.\...,2.0
15839,1312156974885138434,Excellent move by @realDonaldTrump. We need to...,2.0
15841,1312023398147649537,Is it fair to make the assessment that China h...,2.0


In [4]:
# Merge duplicate rows by averaging sentiment scores
avg_score = df.groupby('id')['SentimentScore'].mean().to_frame().reset_index()
df = df.drop_duplicates(subset='id')
df = df.drop('SentimentScore', axis=1)
df = df.merge(avg_score, on='id', how='left')

In [5]:
def score_to_label(score):
    if score < 2.5:
        return 'negative'
    elif score >= 2.5 and score <=3.5:
        return 'neutral'
    elif score > 3.5:
        return 'positive'

In [6]:
df['sentiment'] = df.SentimentScore.apply(score_to_label)
df_cleaned['sentiment'] = df_cleaned.SentimentScore.apply(score_to_label)

In [7]:
df_cleaned['SentimentScore'].value_counts()

2.0    3514
1.0    2939
0.0     189
4.0     156
3.0     135
5.0       6
6.0       1
Name: SentimentScore, dtype: int64

### Classify on a sample set (testing)

In [8]:
sampled_df = df_cleaned.groupby('sentiment').apply(lambda x: x.sample(10)).reset_index(drop=True)
sampled_df = sampled_df.sample(frac=1)

In [9]:
sampled_df

Unnamed: 0,id,text,SentimentScore,sentiment
10,4726753,This is why we need to pass #USMCA. It will he...,3.0,neutral
6,1410336888981004292,"In recent years, the Chinese government has be...",2.0,negative
19,3727534,China cooperates with U.S. law enforcement age...,3.0,neutral
4,1289350518909947905,CONFIRMED: @realDonaldTrump’s decisive action ...,1.0,negative
11,3727430,TUNE IN at about 2:15 PM ET I’ll be speaking o...,3.0,neutral
26,3359848,I am encouraged by the continued dialogue betw...,4.0,positive
14,3727537,I’m headed to the Senate floor to talk about h...,3.0,neutral
12,3150999,"@RepSteveStivers ""we have an innovation gap in...",3.0,neutral
29,1306355946172698624,USTR Chief Agricultural Negotiator Gregg Doud ...,4.0,positive
1,1409986280583274500,Chinese American scientists &amp; researchers ...,1.0,negative


In [11]:
question_prompt = """Determine the given tweet's sentiment toward China. Return either positive, neutral, or negative.

Example 1: "The humanitarian, security and health threats personified in the coronavirus are being exacerbated by \
authoritarian socialist policies and the dishonestly of foreign aggressors and abusers, like China. \
https://t.co/CKAtP9qOUT #coronavirus". 
Answer: negative.
Reason: The tweet uses explicit negative sentiment towards China through the words such as \
“dishonest”, “aggressor”, and “abuser”.

Example 2: "The WH makes a targeted effort to counter China's Military-Fusion Strategy by denying visas to \
those grad students supporting Beijing’s effort to replace US as the leading global power.Chinese \
students are still welcome, but not those working for the govt. https://t.co/P8bMTEPMWr". 
Answer: neutral 
Reason: The first sentence is a factual statement by itself and the second sentence \
is a partial comment on the policy with a mixed implicit sentiment.

Example 3: "@rollcall Unclear why effort to create a scientifically valid vaccine can’t be successful \
without China/US involvement. Speaks to how wide split btwn the US and China has become and how that split \
undermines their leadership. US should be the first of the two to join and show leadership". 
Answer: positive 
Reason: it shows support for cooperation with China to develop a vaccine.

Given these examples, value the following tweet: """

In [63]:
# question_prompt = """Determine the given tweet's sentiment toward China. Return either positive, neutral, or negative.

# Example 1: "The humanitarian, security and health threats personified in the coronavirus are being exacerbated by \
# authoritarian socialist policies and the dishonestly of foreign aggressors and abusers, like China. \
# https://t.co/CKAtP9qOUT #coronavirus". 
# The answer to this tweet: negative..

# Example 2: "The WH makes a targeted effort to counter China's Military-Fusion Strategy by denying visas to \
# those grad students supporting Beijing’s effort to replace US as the leading global power.Chinese \
# students are still welcome, but not those working for the govt. https://t.co/P8bMTEPMWr". 
# The answer to this tweet: neutral

# Example 3: "@rollcall Unclear why effort to create a scientifically valid vaccine can’t be successful \
# without China/US involvement. Speaks to how wide split btwn the US and China has become and how that split \
# undermines their leadership. US should be the first of the two to join and show leadership". 

# Given these examples, value the following tweet: """

In [12]:
def classify_sentiment(df, prompt):
    
    i = 0
    sentiments = []
    
    while i < len(df):
        
        row = df.iloc[i]
        tweet = row.text
        tweet_id = row.id
    
        prompt_text = prompt + tweet
        
        try:
    
            response = openai.Completion.create(
              model="text-davinci-003",
              prompt=prompt_text,
              max_tokens=256,
              top_p=1,
              frequency_penalty=0,
              presence_penalty=0
            )
            sentiment = response['choices'][0]['text'].strip()
            sentiments.append((tweet_id,sentiment))
            i += 1
            
        except Exception as e:
            time.sleep(60)
    
    return sentiments

In [46]:
def extract_sentiment(text):
    if text[0] != 'A':
        start_idx = text.find('Answer')
        text = text[start_idx:]
    text = text.lower().splitlines()[0]
    if 'negative' in text:
        return 'negative'
    elif 'positive' in text:
        return 'positive'
    elif 'neutral' in text:
        return 'neutral'

In [14]:
model_output = classify_sentiment(sampled_df, question_prompt)

In [48]:
output_df = pd.DataFrame(model_output, columns=['id', 'model_output'])
model_df = sampled_df.merge(output_df, on='id', how='inner')
model_df['outputed_sentiment'] = model_df.model_output.apply(extract_sentiment)

In [49]:
model_df

Unnamed: 0,id,text,SentimentScore,sentiment,model_output,outputed_sentiment
0,4726753,This is why we need to pass #USMCA. It will he...,3.0,neutral,Answer: positive,positive
1,1410336888981004292,"In recent years, the Chinese government has be...",2.0,negative,Answer: negative,negative
2,3727534,China cooperates with U.S. law enforcement age...,3.0,neutral,Answer: neutral,neutral
3,1289350518909947905,CONFIRMED: @realDonaldTrump’s decisive action ...,1.0,negative,Answer: negative,negative
4,3727430,TUNE IN at about 2:15 PM ET I’ll be speaking o...,3.0,neutral,Answer: positive \nReason: The tweet expresses...,positive
5,3359848,I am encouraged by the continued dialogue betw...,4.0,positive,Answer: positive,positive
6,3727537,I’m headed to the Senate floor to talk about h...,3.0,neutral,Answer: positive,positive
7,3150999,"@RepSteveStivers ""we have an innovation gap in...",3.0,neutral,Answer: neutral,neutral
8,1306355946172698624,USTR Chief Agricultural Negotiator Gregg Doud ...,4.0,positive,Answer: positive,positive
9,1409986280583274500,Chinese American scientists &amp; researchers ...,1.0,negative,Answer: negative \nReason: The tweet unpacks t...,negative


In [50]:
#Accuracy
np.mean(model_df['sentiment'] == model_df['outputed_sentiment'])

0.7

In [19]:
positive = model_df[model_df['sentiment'] == 'positive']
np.mean(positive['sentiment'] == positive['outputed_sentiment'])

0.7

In [20]:
neutral = model_df[model_df['sentiment'] == 'neutral']
np.mean(neutral['sentiment'] == neutral['outputed_sentiment'])

0.6

In [21]:
negative = model_df[model_df['sentiment'] == 'negative']
np.mean(negative['sentiment'] == negative['outputed_sentiment'])

0.8

In [18]:
df.to_csv(os.getcwd()+'/sample_output/output2.csv')

In [142]:
os.getcwd()+'output1.csv'

'/Users/zhengjiazheng/Desktopoutput1.csv'