# Using the OpenAI API to detect fake news

In [152]:
import os
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
import csv
from dotenv import load_dotenv
import openai
from transformers import GPT2TokenizerFast
import numpy as np
from collections import defaultdict
import pandas as pd

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

## Dataset 1: Infodemic Dataset

In [109]:
df1=pd.read_csv('data/english_test_with_labels.csv')

In [110]:
df1.head()

Unnamed: 0,id,tweet,label
0,1,Our daily update is published. States reported...,real
1,2,Alfalfa is the only cure for COVID-19.,fake
2,3,President Trump Asked What He Would Do If He W...,fake
3,4,States reported 630 deaths. We are still seein...,real
4,5,This is the sixth time a global health emergen...,real


In [112]:
df1.value_counts("label")

label
real    1120
fake    1020
dtype: int64

In [114]:
df1.rename(columns={'tweet':'prompt', 'label':'completion'}, inplace=True)

df1.to_csv('labelled.csv', index=False)

In [115]:
# DO IN TERMINAL - YES for all
! openai tools fine_tunes.prepare_data -f labelled.csv

Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 2140 prompt-completion pairs
- The input file should contain exactly two columns/keys per row. Additional columns/keys present are: ['id']
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 1 examples that are very long. These are rows: [1469]
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://beta.openai.com/docs/guides/fine-tuning/preparing

In [116]:
! openai api fine_tunes.create -t "data/labelled_prepared_train.jsonl" -v "data/labelled_prepared_valid.jsonl" --compute_classification_metrics --classification_positive_class " fake" -m ada

Upload progress: 100%|███████████████████████| 395k/395k [00:00<00:00, 100Mit/s]
Uploaded file from labelled_prepared_train.jsonl: file-BWX1iGdqrjhHwhIDm3neSIJb
Upload progress: 100%|████████████████████| 97.9k/97.9k [00:00<00:00, 55.7Mit/s]
Uploaded file from labelled_prepared_valid.jsonl: file-ko1bE41reJCW3P0AXSJU4r4U
Created fine-tune: ft-t1HJWc9ujRl4ZXkakLXyiqPy
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-20 13:41:43] Created fine-tune: ft-t1HJWc9ujRl4ZXkakLXyiqPy

Stream interrupted (client disconnected).
To resume the stream, run:

  openai api fine_tunes.follow -i ft-t1HJWc9ujRl4ZXkakLXyiqPy



In [120]:
! openai api fine_tunes.follow -i ft-t1HJWc9ujRl4ZXkakLXyiqPy

[2023-06-20 13:41:43] Created fine-tune: ft-t1HJWc9ujRl4ZXkakLXyiqPy
[2023-06-20 13:43:44] Fine-tune costs $0.15
[2023-06-20 13:43:44] Fine-tune enqueued. Queue number: 18
[2023-06-20 13:43:47] Fine-tune is in the queue. Queue number: 17
[2023-06-20 13:44:06] Fine-tune is in the queue. Queue number: 16
[2023-06-20 13:44:59] Fine-tune is in the queue. Queue number: 15
[2023-06-20 13:46:27] Fine-tune is in the queue. Queue number: 14
[2023-06-20 13:48:46] Fine-tune is in the queue. Queue number: 13
[2023-06-20 13:49:31] Fine-tune is in the queue. Queue number: 12
[2023-06-20 13:51:55] Fine-tune is in the queue. Queue number: 11
[2023-06-20 13:53:00] Fine-tune is in the queue. Queue number: 10
[2023-06-20 13:53:45] Fine-tune is in the queue. Queue number: 9
[2023-06-20 13:54:50] Fine-tune is in the queue. Queue number: 8
[2023-06-20 13:55:37] Fine-tune is in the queue. Queue number: 7
[2023-06-20 13:56:21] Fine-tune is in the queue. Queue number: 6
[2023-06-20 14:06:36] Fine-tune is in th

In [161]:
# load the validation dataset
test = pd.read_json('data/labelled_prepared_valid.jsonl', lines=True)
test.head()

Unnamed: 0,prompt,completion
0,Alfalfa is the only cure for COVID-19.\n\n###\n\n,fake
1,Our daily update is published. We’ve now track...,real
2,"Households should have ""required"" medical kits...",fake
3,An image of a man carrying his old mother on h...,fake
4,"3/10 About 8% of population ""may be infected a...",real


In [162]:
ft_model = 'ada:ft-personal-2023-06-20-12-47-16'
 
def detect(text):
     
    # add the suffix to the prompt
    input_prompt = text + ' \n\n###\n\n'
    response = openai.Completion.create(model=ft_model, prompt=input_prompt, max_tokens=1, temperature=0)
     
    output = response['choices'][0]['text']
    return output
 
 
# get predictions for the test dataset
test['predictions'] = test['prompt'].apply(lambda x:detect(x))
 
test

Unnamed: 0,prompt,completion,predictions
0,Alfalfa is the only cure for COVID-19.\n\n###\n\n,fake,fake
1,Our daily update is published. We’ve now track...,real,real
2,"Households should have ""required"" medical kits...",fake,fake
3,An image of a man carrying his old mother on h...,fake,fake
4,"3/10 About 8% of population ""may be infected a...",real,real
...,...,...,...
423,Total deaths reached 16399. States reported 19...,real,real
424,???China is to blame because the culture where...,fake,fake
425,Banana contains a lectin that is a powerful an...,fake,fake
426,Asymptomatic coronavirus spread is rare @WHO s...,real,real


In [163]:
test.value_counts('predictions')

predictions
 real    219
 fake    209
dtype: int64

In [164]:
np.mean(test.completion==test.predictions)

0.9742990654205608

In [165]:
accuracy = accuracy_score(test.completion, test.predictions)
precision = precision_score(test.completion, test.predictions, pos_label=' fake')
recall = recall_score(test.completion, test.predictions, pos_label=' fake')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.9742990654205608
Precision: 0.9760765550239234
Recall: 0.9714285714285714


In [130]:
# show rows where the predicted label is different from the actual label

test[test.completion!=test.predictions]

Unnamed: 0,prompt,completion,predictions
21,There are 4 #COVID19 Govt. testing lab in #Agr...,real,fake
22,#IndiaFightsCorona: #COVID19 does not even spa...,real,fake
28,The underlying cause of death in the vast majo...,fake,real
76,@geoallison Today's official figures.\n\nA rec...,fake,real
92,A paper estimating that 266796 COVID-19 cases ...,fake,real
165,This is just another indication that COVID-19 ...,real,fake
255,The lack of coronavirus testing in the U.S. is...,fake,real
293,Really we need to move on from hydroxychloroqu...,real,fake
294,"NSW, Australia #COVID19 positive Chinese woman...",fake,real
308,We’re up bright and early celebrating healthca...,real,fake


# Dataset 2: TurthSeeker Dataset

In [31]:
df2 = pd.read_csv('data/Truth_Seeker_Model_Dataset.csv')

In [5]:
df2.head()

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer
0,0,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Mostly Agree,Agree
1,1,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,NO MAJORITY,Agree
2,2,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",THE SUPREME COURT is siding with super rich pr...,Agree,Agree
3,3,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders\n\nBroken campaign promi...,Mostly Agree,Agree
4,4,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@OhComfy I agree. The confluence of events rig...,Agree,Agree


In [32]:
df2.value_counts('target')

target
True     68930
False    65268
dtype: int64

In [33]:
df2.drop(columns=['Unnamed: 0', 'author', 'statement', 'BinaryNumTarget', 'manual_keywords', '5_label_majority_answer', '3_label_majority_answer'], inplace=True)

In [8]:
df2.head()

Unnamed: 0,target,tweet
0,True,@POTUS Biden Blunders - 6 Month Update\n\nInfl...
1,True,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...
2,True,THE SUPREME COURT is siding with super rich pr...
3,True,@POTUS Biden Blunders\n\nBroken campaign promi...
4,True,@OhComfy I agree. The confluence of events rig...


In [34]:
# rename rows with true to fake and false to true

df2.rename(columns={'tweet':'prompt', 'target':'completion'}, inplace=True)
df2['completion'] = df2['completion'].astype(str)
df2['completion'].replace({'True':'real', 'False':'fake'}, inplace=True)
df2.to_csv('labelled2.csv', index=False)

In [None]:
# DO IN TERMINAL - NO for split data
! openai tools fine_tunes.prepare_data -f labelled2.csv

In [36]:
# we skip fine tuning because we will use the model we trained before

# load the whole new dataset
test2 = pd.read_json('data/labelled2_prepared.jsonl', lines=True)
test2.head()

Unnamed: 0,prompt,completion
0,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,real
1,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,real
2,THE SUPREME COURT is siding with super rich pr...,real
3,@POTUS Biden Blunders\n\nBroken campaign promi...,real
4,@OhComfy I agree. The confluence of events rig...,real


In [68]:
test2.value_counts('completion')

completion
 real                                                                                                                         68930
 fake                                                                                                                         65268
 #FLDebate                                                                                                                        1
 Our New Hampshire poll is finding real anger toward Sen. Kelly Ayotte over her no vote on background checks #NowIsTheTime        1
 Trump admin hypocritical on States' Rights vis  vis Healthcare v Marijuana. ReeferMadness redux                                  1
 because of gun control: 56                                                                                                       1
 million.                                                                                                                         1
dtype: int64

In [82]:
test2 = test2[test2['completion'].isin([' fake', ' real'])]
test2.value_counts('completion')

completion
 real    68930
 fake    65268
dtype: int64

In [111]:
# take only 0,5% of the dataset for now --> otherwise too long and too expensive API calls

test3 = test2.sample(frac=0.005, random_state=1)

In [105]:
test3 = test3[test3['completion'].isin([' fake', ' real'])] # idk why we need to do this again but there was one row with a different label

In [112]:
test3.value_counts('completion')

completion
 real    358
 fake    313
dtype: int64

In [113]:
#we use the finetune model we trained and the "detect" function we defined above

test3['predictions'] = test3['prompt'].apply(lambda x:detect(x))
 
test3

Unnamed: 0,prompt,completion,predictions
129659,@Ali79596850 @NicolaSturgeon They don't test f...,fake,fake
22878,#Trumplandia &gt; So States rights matter on #...,real,fake
66829,@Celeste03639266 @CCW315 @nytimes Youre not un...,real,fake
17927,@SplashtownDrew @Ian_Cades Not if the minimum ...,real,real
116821,@BreitbartNews God hates hands that shed innoc...,fake,fake
...,...,...,...
94574,"Since the world has gone crazy, can I talk abo...",fake,fake
50590,@MillerLtSword @klcmurphy @JohnSpohn1 @chrlywd...,fake,fake
35543,@zascmo @SpindleyBobbins @ClintSmithIII State ...,real,fake
15449,@MichaelNey19 @JonelleElgaway @CanadaPain @CMe...,real,fake


In [114]:
test3.predictions.value_counts()

 fake    640
 real     31
Name: predictions, dtype: int64

In [115]:
np.mean(test3.completion==test3.predictions)

0.4858420268256334

In [156]:
accuracy = accuracy_score(test3.completion, test3.predictions)
precision = precision_score(test3.completion, test3.predictions, pos_label=' fake')
recall = recall_score(test3.completion, test3.predictions, pos_label=' fake')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.4858420268256334
Precision: 0.475
Recall: 0.9712460063897763


In [116]:
test3[test3.completion!=test3.predictions]

Unnamed: 0,prompt,completion,predictions
22878,#Trumplandia &gt; So States rights matter on #...,real,fake
66829,@Celeste03639266 @CCW315 @nytimes Youre not un...,real,fake
29258,Marco Rubio voted against the Violence Against...,real,fake
2736,Drug overdoses were responsible for killing ov...,real,fake
66639,@CLTKICKSNFUNK @chamath Pro-union/ minimum wag...,real,fake
...,...,...,...
1519,@RealCameronDye @EricksonAaron44 @DanPriceSeat...,real,fake
14999,New York's paid family leave is set to take ef...,real,fake
35543,@zascmo @SpindleyBobbins @ClintSmithIII State ...,real,fake
15449,@MichaelNey19 @JonelleElgaway @CanadaPain @CMe...,real,fake


## Truthseeker Dataset with own fine-tune model

In [117]:
df3 = df2.sample(frac=0.005, random_state=1)

In [120]:
df3.head()

Unnamed: 0,completion,prompt
12759,real,"@MFOLParkland ""We think it's reasonable to pro..."
105052,fake,@MollyJongFast The GOP just sent Tommy Tubervi...
91929,real,"With the highest property taxes, seven percent..."
38545,real,@Kalidog4 @Jimdotbeep @goodyweaver The term wa...
4494,real,@tj110_ the fbi director said white supremacis...


In [121]:
df3.value_counts('completion')

completion
real    347
fake    324
dtype: int64

In [122]:
df3.to_csv('labelled3.csv', index=False)

In [None]:
# DO IN TERMINAL - Yes for all
! openai tools fine_tunes.prepare_data -f labelled3.csv

In [123]:
! openai api fine_tunes.create -t "data/labelled3_prepared_train.jsonl" -v "data/labelled3_prepared_valid.jsonl" --compute_classification_metrics --classification_positive_class " fake" -m ada

Upload progress: 100%|██████████████████████| 141k/141k [00:00<00:00, 48.9Mit/s]
Uploaded file from labelled3_prepared_train.jsonl: file-3nuy8jA0iZRQYNgxLSFoT6J9
Upload progress: 100%|████████████████████| 36.6k/36.6k [00:00<00:00, 17.8Mit/s]
Uploaded file from labelled3_prepared_valid.jsonl: file-dvmEB6JqPqGs2PBYgw7pZL1Z
Created fine-tune: ft-LtV4Q3SDPUDDzLDMgBrE3x1Q
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-23 11:23:55] Created fine-tune: ft-LtV4Q3SDPUDDzLDMgBrE3x1Q

Stream interrupted (client disconnected).
To resume the stream, run:

  openai api fine_tunes.follow -i ft-LtV4Q3SDPUDDzLDMgBrE3x1Q



In [140]:
! openai api fine_tunes.follow -i ft-LtV4Q3SDPUDDzLDMgBrE3x1Q

[2023-06-23 11:23:55] Created fine-tune: ft-LtV4Q3SDPUDDzLDMgBrE3x1Q
[2023-06-23 11:31:10] Fine-tune costs $0.05
[2023-06-23 11:31:11] Fine-tune enqueued. Queue number: 6
[2023-06-23 11:31:22] Fine-tune is in the queue. Queue number: 5
[2023-06-23 11:35:21] Fine-tune is in the queue. Queue number: 4
[2023-06-23 11:35:55] Fine-tune is in the queue. Queue number: 3
[2023-06-23 11:39:25] Fine-tune is in the queue. Queue number: 2
[2023-06-23 11:41:01] Fine-tune is in the queue. Queue number: 1
[2023-06-23 11:44:42] Fine-tune is in the queue. Queue number: 0
[2023-06-23 11:44:45] Fine-tune started
[2023-06-23 11:46:24] Completed epoch 1/4
[2023-06-23 11:47:50] Completed epoch 2/4
[2023-06-23 11:49:14] Completed epoch 3/4
[2023-06-23 11:50:39] Completed epoch 4/4

Job complete! Status: succeeded 🎉
Try out your fine-tuned model:

openai api completions.create -m ada:ft-personal-2023-06-23-09-51-04 -p <YOUR_PROMPT>


In [141]:
test4 = pd.read_json('data/labelled3_prepared_valid.jsonl', lines=True)


In [142]:
ft_model = 'ada:ft-personal-2023-06-23-09-51-04'

test4['predictions'] = test4['prompt'].apply(lambda x:detect(x))
 
test4

Unnamed: 0,prompt,completion,predictions
0,@MollyJongFast The GOP just sent Tommy Tubervi...,fake,fake
1,@tj110_ the fbi director said white supremacis...,real,real
2,"@goodblackdude @PZonLAKEontario ""No one will e...",fake,real
3,@bradleigh__ The more ppl start doing somethin...,real,real
4,@Aggie21Atx @GhostOfQC44 @mattywatty01 @rcb05 ...,real,real
...,...,...,...
130,@Farnsworth67 @TomiLahren The assault on wealt...,real,real
131,@Morgan11444396 @THR Another reason we need th...,real,real
132,@NYCPBA Politicians are the lowest form of lif...,fake,real
133,@JodiMcKayMP @GladysB Hahaha..gladbag Gladys i...,fake,fake


In [143]:
test4.predictions.value_counts()

 real    72
 fake    63
Name: predictions, dtype: int64

In [144]:
np.mean(test4.completion==test4.predictions)

0.8444444444444444

In [158]:
accuracy = accuracy_score(test4.completion, test4.predictions)
precision = precision_score(test4.completion, test4.predictions, pos_label=' fake')
recall = recall_score(test4.completion, test4.predictions, pos_label=' fake')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8444444444444444
Precision: 0.8253968253968254
Recall: 0.8387096774193549


In [145]:
test4[test4.completion!=test4.predictions]

Unnamed: 0,prompt,completion,predictions
2,"@goodblackdude @PZonLAKEontario ""No one will e...",fake,real
5,"@realDonaldTrump You ran on building the Wall,...",real,fake
6,#TedCruzCampaignSlogans\nTed Cruz 2016: Becaus...,fake,real
10,Im bout dumb as shit thinking that the militar...,real,fake
16,@WydivaLong @bgiuliano22 @Whiskey100Proof You ...,fake,real
33,@arungovil12 @IRBMishra While India has been g...,real,fake
35,@Bluerai48855500 @N0__S0ul Democrats thought t...,real,fake
36,A large caravan of trump supporters have been ...,real,fake
55,@Mikeago @ProudPTDeeDee @FaceTheNation @tedcru...,fake,real
58,@acad_editor @prateekrsn The really poor don't...,real,fake


## Infodemic Dataset with model trained on Truthseeker dataset

In [148]:
# get predictions for the test dataset defined earlier (our ft model was saved above)
test['predictions1'] = test['prompt'].apply(lambda x:detect(x))
 
test

Unnamed: 0,prompt,completion,predictions
0,Alfalfa is the only cure for COVID-19.\n\n###\n\n,fake,fake
1,Our daily update is published. We’ve now track...,real,fake
2,"Households should have ""required"" medical kits...",fake,fake
3,An image of a man carrying his old mother on h...,fake,fake
4,"3/10 About 8% of population ""may be infected a...",real,fake
...,...,...,...
423,Total deaths reached 16399. States reported 19...,real,fake
424,???China is to blame because the culture where...,fake,fake
425,Banana contains a lectin that is a powerful an...,fake,fake
426,Asymptomatic coronavirus spread is rare @WHO s...,real,fake


In [149]:
test.predictions1.value_counts()

 fake    354
 real     74
Name: predictions, dtype: int64

In [150]:
np.mean(test.completion==test.predictions1)

0.514018691588785

In [157]:
accuracy = accuracy_score(test.completion, test.predictions1)
precision = precision_score(test.completion, test.predictions1, pos_label=' fake')
recall = recall_score(test.completion, test.predictions1, pos_label=' fake')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.514018691588785
Precision: 0.5028248587570622
Recall: 0.8476190476190476
