In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
# import openai
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, accuracy_score, confusion_matrix, precision_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

# The import list above has `import openai` commented out, yet openai.* is used
# from this cell onward; import it here so a fresh kernel does not hit NameError.
import openai

# Working directory with a trailing slash; every data path below is built from it.
path = os.getcwd()+'/'
# Take the key from the environment instead of pasting it into the notebook.
# Falls back to the previous empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# read data
# Labelled data plus every news CSV under data/news/. The file list is sorted
# so the concatenation order does not depend on the filesystem.
full_training_data = pd.read_csv(path+'data/full_training_data_cleaned.csv',index_col=False)
csv_files = sorted(glob.glob(path+'data/news/*.{}'.format('csv')))
df_concat = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# data transformation
# Normalise timestamps to 'YYYY-MM-DD' strings; this is the join key used later.
df_concat['created_at'] = pd.to_datetime(df_concat['created_at'], dayfirst=True).dt.strftime('%Y-%m-%d')
# A stable sort keeps same-day articles in file order, so the joined text is reproducible.
df_concat = df_concat.sort_values(by=['created_at'], kind='mergesort')
# Treat a missing description/content as empty text. Without this the sum is NaN
# and the astype(str) below turns it into the literal word "nan" inside the text.
df_concat['combined_text'] = df_concat['description'].fillna('') + ' ' + df_concat['content'].fillna('')
df_concat['combined_text'] = df_concat['combined_text'].replace(r'\s+', ' ', regex=True)
df_concat = df_concat[['created_at', 'combined_text']]
df_concat['combined_text'] = df_concat['combined_text'].astype(str)
df_concat.reset_index(inplace=True)
# One row per day: all of that day's article text joined with single spaces.
# A plain callable is passed because the dict form ({'combined_text': ...}) on a
# single selected column is a "nested renamer", which pandas >= 1.0 rejects.
df_concat = df_concat.groupby(['created_at'], as_index=False)['combined_text'].agg(' '.join)

In [None]:
# Kept for reference: if created_at in the labelled data is dd/mm/YYYY, convert it
# first so the join keys match the 'YYYY-MM-DD' strings in df_concat.
# full_training_data['created_at'] = pd.to_datetime(full_training_data['created_at'], format='%d/%m/%Y').dt.strftime('%Y-%m-%d')
required_columns = ['created_at', 'combined_text', 'HSI_OO_ter_0.005']

# Left-join the daily news text onto the labelled rows, keep the three columns
# of interest and rename them to the prompt/completion pair used for fine-tuning.
full_training_data_combined = (
    full_training_data
    .merge(df_concat, how='left', on='created_at')[required_columns]
    .rename(columns={'combined_text':'prompt', 'HSI_OO_ter_0.005': 'completion'})
)

# Collapse whitespace runs to one space, then keep the first 1500 characters of each prompt.
full_training_data_combined['prompt'] = (
    full_training_data_combined['prompt']
    .replace(r'\s+', ' ', regex=True)
    .str.slice(0, 1500)
)


In [None]:
# Fine-tuning format: every prompt ends with a fixed separator, every completion
# starts with a space and ends with a blank line.
# Note: re-running this cell appends the markers again.
prompt_with_separator = full_training_data_combined['prompt'] + '\n\n###\n\n'
completion_with_stop = ' ' + full_training_data_combined['completion'] + '\n\n'

full_training_data_combined['prompt'] = prompt_with_separator
full_training_data_combined['completion'] = completion_with_stop
# created_at is deliberately not dropped here: a later cell looks up dates by row label.
# full_training_data_combined = full_training_data_combined[['prompt', 'completion']]

In [None]:
# Display the assembled frame (created_at, prompt, completion) for a visual check.
full_training_data_combined

Unnamed: 0,created_at,prompt,completion
0,2019-12-02,A trade deal between United States and China w...,sell\n\n
1,2019-12-03,Chinese ecommerce giant Alibaba said on Tuesda...,sell\n\n
2,2019-12-04,Chinas Hong Kong and Macau Affairs Office said...,sell\n\n
3,2019-12-05,Hong Kong authorities granted protesters permi...,buy\n\n
4,2019-12-06,Social unrest in Hong Kong contributed two per...,neutral\n\n
...,...,...,...
735,2022-11-23,A Hong Kong stock market debut puts Jakarta at...,sell\n\n
736,2022-11-24,Billionaire investor Bill Ackman said hes bett...,buy\n\n
737,2022-11-25,Pegged to the US dollar since the Hong Kong do...,neutral\n\n
738,2022-11-28,Hong Kong private home prices fell in October ...,sell\n\n


In [None]:
# Persist the prompt/completion frame. It is written under data/ because the next
# cell reloads it from path + 'data/full_training_data_combined.csv'; writing to
# the bare working directory would leave that reload reading a different file.
full_training_data_combined.to_csv(path + 'data/full_training_data_combined.csv', index=False)

In [None]:
# Reload the prompt/completion frame from the data directory.
combined_csv_path = path + 'data/full_training_data_combined.csv'
full_training_data_combined = pd.read_csv(combined_csv_path)


In [None]:
np.random.seed(112)
# Shuffle all rows once with a fixed seed, then cut at 85 %: the first part is
# used for training, the remainder is held out for testing.
shuffled = full_training_data_combined.sample(frac=1, random_state=42)
split_at = int(.85*len(full_training_data_combined))
# Positional slicing gives the same two pieces as np.split(shuffled, [split_at]),
# but does not route a DataFrame through NumPy's splitter (which relies on the
# deprecated DataFrame.swapaxes). The copies make the later in-place column
# additions act on independent frames rather than slices of `shuffled`.
df_train = shuffled.iloc[:split_at].copy()
df_test = shuffled.iloc[split_at:].copy()

In [None]:
# Show the frame as it was reloaded from CSV.
full_training_data_combined

Unnamed: 0,prompt,completion
0,A trade deal between United States and China w...,sell\n\n
1,Chinese ecommerce giant Alibaba said on Tuesda...,sell\n\n
2,Chinas Hong Kong and Macau Affairs Office said...,sell\n\n
3,Hong Kong authorities granted protesters permi...,buy\n\n
4,Social unrest in Hong Kong contributed two per...,neutral\n\n
...,...,...
735,A Hong Kong stock market debut puts Jakarta at...,sell\n\n
736,Billionaire investor Bill Ackman said hes bett...,buy\n\n
737,Pegged to the US dollar since the Hong Kong do...,neutral\n\n
738,Hong Kong private home prices fell in October ...,sell\n\n


In [None]:
# Export the training split as JSON Lines (one record per line); the data
# preparation command in the next cell takes this file as its input.
df_train.to_json(path_or_buf="text.jsonl", lines=True, orient='records')

In [None]:
!pip install --upgrade openai
!openai tools fine_tunes.prepare_data -f text.jsonl -q

In [None]:
# Identifier of the fine-tuned model used for every request in this notebook.
ft_model = 'ada:ft-personal-2023-08-03-13-06-01'

# Smoke test: classify one headline. The prompt ends with the same separator
# that was appended to the training prompts; a single token is requested at
# temperature 0, together with the top-3 log-probabilities.
news_data="Hong Kong police will officially swap Britishstyle marching"
single_prompt = news_data + '\n\n###\n\n'
res = openai.Completion.create(
    model=ft_model,
    prompt=single_prompt,
    max_tokens=1,
    temperature=0,
    logprobs=3,
)

In [None]:
# Free-text instruction sent to the fine-tuned model. The category list is now
# spelled and punctuated to match the labels in the data ('neutral', not
# 'netural'; commas between all three labels).
# NOTE(review): this is an ordinary completion request; nothing in this notebook
# shows the instruction being carried over to later calls - confirm it is needed.
prompt="This training dataset contains news, category. Please consider the listed data below for your responses accordingly. Do NOT add or remove any description, or category without expressed consent in User prompt. The categories are 'buy', 'sell', 'neutral'"
instruction = openai.Completion.create(model=ft_model, prompt=prompt)
instruction

In [None]:
# Show the generated text of the first choice in `res`.
res['choices'][0]['text']

In [None]:
# Predict a label for every held-out prompt.
separator = '\n\n###\n\n'
test_input = df_test['prompt'].tolist()
test_completion = []
for i in test_input:
    # The prompt column already had the separator appended when the dataset was
    # built. Add it only when missing, so the model sees it exactly once, as it
    # did in the fine-tuning data.
    model_prompt = i if i.endswith(separator) else i + separator
    res = openai.Completion.create(model=ft_model, prompt=model_prompt, max_tokens=1, temperature=0, logprobs=3)
    # One token was requested, so the first choice's text is the predicted label.
    test_completion.append(res['choices'][0]['text'])


In [None]:
# Move the shuffled row labels into an 'index' column (a later cell makes that
# column the index again, so it is not dropped) and attach the predictions,
# aligned on the fresh 0..n-1 index.
df_test = df_test.reset_index().assign(pred_result=pd.Series(test_completion))

In [None]:
# Display the held-out rows next to their predicted labels.
df_test

Unnamed: 0,index,prompt,completion,pred_result
0,200,Shares of franchised hotel operator Huazhu Gro...,sell\n\n,sell
1,134,Chinas top diplomat told his US counterpart th...,neutral\n\n,buy
2,27,Denmarks most famous statue became entangled i...,neutral\n\n,neutral
3,650,Organizers of the Hong Kong Rugby Sevens are o...,sell\n\n,buy
4,230,China Hong Kong stocks welcome Biden win prosp...,buy\n\n,sell
...,...,...,...,...
106,71,Hong Kong will quarantine for days all people ...,sell\n\n,neutral
107,106,HONG KONG May A virus test could reveal just h...,buy\n\n,neutral
108,270,China said on Wednesday it supports Hong Kong ...,buy\n\n,neutral
109,435,In Hong Kong where land is at a premium and bu...,sell\n\n,sell


In [None]:
# Predict a label for every training prompt (used to measure fit on seen data).
separator = '\n\n###\n\n'
train_input = df_train['prompt'].tolist()
train_completion = []
for i in train_input:
    # The prompt column already had the separator appended when the dataset was
    # built. Add it only when missing, so the model sees it exactly once, as it
    # did in the fine-tuning data.
    model_prompt = i if i.endswith(separator) else i + separator
    res = openai.Completion.create(model=ft_model, prompt=model_prompt, max_tokens=1, temperature=0, logprobs=3)
    # One token was requested, so the first choice's text is the predicted label.
    train_completion.append(res['choices'][0]['text'])

In [None]:
# Move the shuffled row labels into an 'index' column (the evaluation frames
# built from df_train later use that column as their index, so it is not
# dropped) and attach the predictions, aligned on the fresh 0..n-1 index.
df_train = df_train.reset_index().assign(pred_result=pd.Series(train_completion))

In [None]:
# Display the training rows next to their predicted labels.
df_train

Unnamed: 0,index,prompt,completion,pred_result
0,120,Gold traded flat on Friday as investors stayed...,sell\n\n,sell
1,416,Asias economies are already showing a hit from...,buy\n\n,buy
2,334,Hong Kongbased Oasis Management said on Tuesda...,sell\n\n,neutral
3,350,Hong Kong stocks inch up as financial energy s...,neutral\n\n,neutral
4,412,Hong Kongbased ESR Cayman Ltd will buy real es...,neutral\n\n,buy
...,...,...,...,...
624,573,Cathay Pacific Airways Ltd plans to reroute it...,buy\n\n,buy
625,406,The call went out from the megalopolis of Shen...,sell\n\n,sell
626,502,Britain scolded China on Tuesday for using a N...,sell\n\n,sell
627,47,The Holland America Westerdam cruise ship left...,buy\n\n,buy


In [None]:
# Load the prepared train/validation files (JSON Lines, one record per line).
prepared_train_path = path+'text_prepared_train.jsonl'
prepared_valid_path = path+'text_prepared_valid.jsonl'

train_json = pd.read_json(prepared_train_path, lines=True)
val_json = pd.read_json(prepared_valid_path, lines=True)

In [None]:
# Recover the predictions for the prepared train/validation records by joining
# them back onto df_train. Both prepared files are matched against df_train
# (the test rows are scored directly from df_test below).
train_merge = pd.merge(train_json, df_train, on=['prompt','completion'], how='left')
val_merge = pd.merge(val_json, df_train, on=['prompt', 'completion'], how='left')

# Apply the same clean-up to all three frames: strip whitespace and newlines
# from labels and predictions, then flag rows where they agree (1) or not (0).
for frame in (train_merge, val_merge, df_test):
    frame['completion'] = frame['completion'].str.strip().replace('\n','', regex=True)
    frame['pred_result'] = frame['pred_result'].str.strip()
    frame['compare'] = np.where(frame['completion']==frame['pred_result'], 1, 0)

# Number of correct predictions per split.
total_acc_train = train_merge['compare'].sum()
total_acc_val = val_merge['compare'].sum()
# Count test hits from the test frame. This previously summed val_merge, so the
# printed "Test Accuracy" was the validation hit count divided by the test size.
total_acc_test = df_test['compare'].sum()

# Plain lists for the scikit-learn metric functions.
train_label_list = train_merge['completion'].tolist()
train_pred_list = train_merge['pred_result'].tolist()
val_label_list = val_merge['completion'].tolist()
val_pred_list = val_merge['pred_result'].tolist()
test_label_list = df_test['completion'].tolist()
pred_list = df_test['pred_result'].tolist()


In [None]:
# Training-split metrics: hit ratio from the compare column, then the
# scikit-learn accuracy, per-class precision, confusion matrix and full report.
train_hit_rate = total_acc_train / len(train_merge)
print(f'Train Accuracy: {train_hit_rate: .3f}')

train_accuracy = accuracy_score(train_label_list, train_pred_list)
print(f"Accuracy of training: {train_accuracy}")

train_precision = precision_score(train_label_list, train_pred_list, average=None)
print(f"Precision Score of training: {train_precision}")

train_confusion = confusion_matrix(train_label_list, train_pred_list)
print(f"Confusion matrix of training: {train_confusion}")

train_report = classification_report(train_label_list, train_pred_list, digits=3)
print(f"Classification report of training: {train_report}")

Train Accuracy:  0.978
Accuracy of training: 0.9781312127236581
Precision Score of training: [0.98275862 0.97350993 0.97752809]
Confusion matrix of training: [[171   0   2]
 [  0 147   2]
 [  3   4 174]]
Classification report of training:               precision    recall  f1-score   support

         buy      0.983     0.988     0.986       173
     neutral      0.974     0.987     0.980       149
        sell      0.978     0.961     0.969       181

    accuracy                          0.978       503
   macro avg      0.978     0.979     0.978       503
weighted avg      0.978     0.978     0.978       503



In [None]:
# Validation-split metrics: hit ratio from the compare column, then the
# scikit-learn accuracy, per-class precision, confusion matrix and full report.
val_hit_rate = total_acc_val / len(val_merge)
print(f'Val Accuracy: {val_hit_rate: .3f}')

val_accuracy = accuracy_score(val_label_list, val_pred_list)
print(f"Accuracy of val: {val_accuracy}")

val_precision = precision_score(val_label_list, val_pred_list, average=None)
print(f"Precision Score of val: {val_precision}")

val_confusion = confusion_matrix(val_label_list, val_pred_list)
print(f"Confusion matrix of val: {val_confusion}")

val_report = classification_report(val_label_list, val_pred_list, digits=3)
print(f"Classification report of val: {val_report}")


Val Accuracy:  0.397
Accuracy of val: 0.3968253968253968
Precision Score of val: [0.425      0.22857143 0.49019608]
Confusion matrix of val: [[17 14 17]
 [12  8  9]
 [11 13 25]]
Classification report of val:               precision    recall  f1-score   support

         buy      0.425     0.354     0.386        48
     neutral      0.229     0.276     0.250        29
        sell      0.490     0.510     0.500        49

    accuracy                          0.397       126
   macro avg      0.381     0.380     0.379       126
weighted avg      0.405     0.397     0.399       126



In [None]:
# Test-split metrics: hit ratio from the stored hit count, then the
# scikit-learn accuracy, per-class precision, confusion matrix and full report.
test_hit_rate = total_acc_test / len(df_test)
print(f'Test Accuracy: {test_hit_rate: .3f}')

test_accuracy = accuracy_score(test_label_list, pred_list)
print(f"Accuracy of test: {test_accuracy}")

test_precision = precision_score(test_label_list, pred_list, average=None)
print(f"Precision Score of test: {test_precision}")

test_confusion = confusion_matrix(test_label_list, pred_list)
print(f"Confusion matrix of test: {test_confusion}")

test_report = classification_report(test_label_list, pred_list, digits=3)
print(f"Classification report of test: {test_report}")

Test Accuracy:  0.450
Accuracy of test: 0.3063063063063063
Precision Score of test: [0.25       0.325      0.33333333]
Confusion matrix of test: [[ 8 11 16]
 [12 13 10]
 [12 16 13]]
Classification report of test:               precision    recall  f1-score   support

         buy      0.250     0.229     0.239        35
     neutral      0.325     0.371     0.347        35
        sell      0.333     0.317     0.325        41

    accuracy                          0.306       111
   macro avg      0.303     0.306     0.303       111
weighted avg      0.304     0.306     0.305       111



In [None]:
# Display the held-out rows with cleaned labels, predictions and the 0/1 compare flag.
df_test

Unnamed: 0,index,prompt,completion,pred_result,compare
0,200,Shares of franchised hotel operator Huazhu Gro...,sell,sell,1
1,134,Chinas top diplomat told his US counterpart th...,neutral,buy,0
2,27,Denmarks most famous statue became entangled i...,neutral,neutral,1
3,650,Organizers of the Hong Kong Rugby Sevens are o...,sell,buy,0
4,230,China Hong Kong stocks welcome Biden win prosp...,buy,sell,0
...,...,...,...,...,...
106,71,Hong Kong will quarantine for days all people ...,sell,neutral,0
107,106,HONG KONG May A virus test could reveal just h...,buy,neutral,0
108,270,China said on Wednesday it supports Hong Kong ...,buy,neutral,0
109,435,In Hong Kong where land is at a premium and bu...,sell,sell,1


In [None]:
# Make the saved 'index' column the index again so each frame is keyed by the
# row label it had before the shuffle/reset.
train_merge = train_merge.set_index('index')
val_merge = val_merge.set_index('index')
df_test = df_test.set_index('index')

In [None]:
# Pair every prediction with the calendar date of its row, looked up in the
# full frame by row label.
created_dates = full_training_data_combined['created_at']

train_pred_df = pd.DataFrame({'Date': created_dates.loc[train_merge.index], 'Predicted': train_pred_list})
val_pred_df = pd.DataFrame({'Date': created_dates.loc[val_merge.index], 'Predicted': val_pred_list})
test_pred_df = pd.DataFrame({'Date': created_dates.loc[df_test.index], 'Predicted': pred_list})

In [None]:
# Label -> signal mapping (values stay strings): buy -> 1, neutral -> 0, sell -> -1.
# The keys are applied as regex patterns, so a label is replaced wherever it
# occurs inside the predicted text.
my_dict = {'buy':'1', 'neutral':'0', 'sell':'-1'}

train_signal = train_pred_df['Predicted'].astype(str).replace(my_dict, regex=True)
# Swap the text labels for the signal column, keeping Date first and Predicted second.
train_pred_df = train_pred_df.drop(columns=['Predicted']).assign(Predicted=train_signal)
train_pred_df.to_csv(path+'train_result.csv', index=False)

In [None]:
# Display the exported training signals (Date, Predicted).
train_pred_df

Unnamed: 0_level_0,Date,Predicted
index,Unnamed: 1_level_1,Unnamed: 2_level_1
376,2021-06-11,0
665,2022-08-15,-1
246,2020-12-01,-1
499,2021-12-09,0
666,2022-08-16,0
...,...,...
691,2022-09-21,0
69,2020-03-13,-1
720,2022-11-02,1
233,2020-11-12,1


In [None]:
# Convert the validation predictions to signals with the same label mapping
# (my_dict, keys applied as regex patterns) and export them.
val_signal = val_pred_df['Predicted'].astype(str).replace(my_dict, regex=True)
# Swap the text labels for the signal column, keeping Date first and Predicted second.
val_pred_df = val_pred_df.drop(columns=['Predicted']).assign(Predicted=val_signal)
val_pred_df.to_csv(path+'val_result.csv', index=False)

In [None]:
# Display the exported validation signals (Date, Predicted).
val_pred_df

Unnamed: 0_level_0,Date,Predicted
index,Unnamed: 1_level_1,Unnamed: 2_level_1
416,2021-08-10,1
412,2021-08-04,1
377,2021-06-15,-1
530,2022-01-24,1
603,2022-05-17,1
...,...,...
460,2021-10-15,-1
648,2022-07-21,1
392,2021-07-07,-1
98,2020-04-27,1


In [None]:
# Convert the test predictions to signals with the same label mapping
# (my_dict, keys applied as regex patterns) and export them.
test_signal = test_pred_df['Predicted'].astype(str).replace(my_dict, regex=True)
# Swap the text labels for the signal column, keeping Date first and Predicted second.
test_pred_df = test_pred_df.drop(columns=['Predicted']).assign(Predicted=test_signal)
test_pred_df.to_csv(path+'test_result.csv', index=False)

In [None]:
# Display the exported test signals (Date, Predicted).
test_pred_df

Unnamed: 0_level_0,Date,Predicted
index,Unnamed: 1_level_1,Unnamed: 2_level_1
200,2020-09-22,-1
134,2020-06-18,1
27,2020-01-13,0
650,2022-07-25,1
230,2020-11-09,-1
...,...,...
71,2020-03-17,0
106,2020-05-11,0
270,2021-01-06,0
435,2021-09-06,-1
