In [None]:
import glob
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import xgboost as xgb
import yfinance as yf
from google.colab import drive
from matplotlib.dates import DateFormatter
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, accuracy_score, confusion_matrix, precision_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

# Mount Google Drive so the project folder is reachable from this runtime.
drive.mount('/content/gdrive')
# Root folder of the project on the mounted drive; later cells build file paths from it.
path = '/content/gdrive/MyDrive/Colab_Notebooks/FYP/'
# Read the API key from the OPENAI_API_KEY environment variable instead of
# pasting it into the notebook. Falls back to the previous empty string when
# the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# read data
# Cleaned training table, plus every news CSV found under data/news stacked
# into one frame with a fresh 0..n-1 index.
full_training_data = pd.read_csv(path + 'data/full_training_data_cleaned.csv', index_col=False)
csv_files = glob.glob(path + '/data/news/*.{}'.format('csv'))
news_frames = list(map(pd.read_csv, csv_files))
df_concat = pd.concat(news_frames, ignore_index=True)

# data transformation
# Reduce every timestamp to a 'YYYY-MM-DD' string, used as the per-day grouping key.
news_dates = pd.to_datetime(df_concat['created_at'], dayfirst=True).dt.strftime('%Y-%m-%d')
# A missing description or content is treated as empty text. Without this,
# NaN + str evaluates to NaN and the whole article would later be cast to the
# literal string 'nan'.
news_text = (
    df_concat['description'].fillna('').astype(str)
    + ' '
    + df_concat['content'].fillna('').astype(str)
)
# Building a fresh two-column frame (rather than assigning into a column
# subset) avoids pandas' SettingWithCopyWarning.
df_concat = (
    pd.DataFrame({
        'created_at': news_dates,
        # Collapse each run of whitespace to a single space.
        'combined_text': news_text.replace(r'\s+', ' ', regex=True),
    })
    # Stable sort: articles of the same day keep their loaded order when joined.
    .sort_values(by=['created_at'], kind='mergesort')
    # One row per day: that day's articles joined with single spaces.
    .groupby('created_at')['combined_text']
    .agg(' '.join)
    .reset_index()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_concat['combined_text'] = df_concat['combined_text'].astype(str)


In [None]:
# Bring the training dates to 'YYYY-MM-DD' strings so they can be matched
# against the news frame's created_at key.
training_dates = pd.to_datetime(full_training_data['created_at'])
full_training_data['created_at'] = training_dates.dt.strftime('%Y-%m-%d')
# Columns carried forward after the join.
required_columns = ['created_at', 'combined_text', 'HSI_OO_ter_0.005']
# Left join: every training row is kept, including dates with no news text.
full_training_data_combined = full_training_data.merge(
    df_concat, how='left', on='created_at'
)[required_columns]

In [None]:
np.random.seed(112)
# Shuffle once with a fixed seed, then cut at the 85% mark: the first part is
# the training split, the remainder is the test split.
# np.split(frame, k) with an integer k requests k equal sections (not a cut at
# row k), so the cut position is applied with positional slicing instead.
shuffled_data = full_training_data_combined.sample(frac=1, random_state=42)
train_size = int(.85 * len(shuffled_data))
df_train = shuffled_data.iloc[:train_size]
df_test = shuffled_data.iloc[train_size:]

In [None]:
# Export the training split as JSON Lines with `prompt` / `completion` keys,
# the record layout the fine-tuning data-preparation tool works on.
# Rows without news text are dropped: they would be written with a null prompt.
# NOTE(review): assumes HSI_OO_ter_0.005 is the class label to predict — confirm.
fine_tune_records = (
    df_train
    .dropna(subset=['combined_text'])
    .rename(columns={'combined_text': 'prompt', 'HSI_OO_ter_0.005': 'completion'})
    [['prompt', 'completion']]
)
fine_tune_records.to_json("text.jsonl", orient='records', lines=True)

In [None]:
!pip install --upgrade openai
!openai tools fine_tunes.prepare_data -f text.jsonl -q

In [None]:
# model fine tuning
!openai api fine_tunes.create -t "text_train.jsonl" -v "text_valid.jsonl" \
 -m "ada" --compute_classification_metrics --classification_n_classes 3

In [None]:
#  Prompt testing
!openai api completions.create -m <FINE_TUNED_MODEL> -p "prompt"

In [None]:
# Append test_result to pred_list and label to test_label_list.
# NOTE(review): pred_list, test_label_list, test_result and label are not
# defined in any cell visible here — presumably they come from an evaluation
# loop; confirm it exists so the notebook survives Restart & Run All.
pred_list.append(test_result)
test_label_list.append(label)

In [None]:
  # Evaluation summary computed from test_label_list and pred_list. The labels
  # now say "test" to match the data actually being scored.
  # NOTE(review): total_acc_test, test_data, pred_list and test_label_list are
  # not defined in the visible cells — confirm they are produced earlier.
  print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
  print(f"Accuracy of test: {accuracy_score(test_label_list, pred_list)}")
  # average=None reports one precision value per class instead of a single average.
  print(f"Precision Score of test: {precision_score(test_label_list, pred_list, average=None)}")
  print(f"Confusion matrix of test: {confusion_matrix(test_label_list, pred_list)}")
  print(f"Classification report of test: {classification_report(test_label_list, pred_list, digits=3)}")
  # Pair each prediction with the created_at value of the matching test_data row.
  pred_df = pd.DataFrame({'Date': full_training_data_combined.loc[test_data.index, 'created_at'], 'Predicted': pred_list})

In [None]:
# Stack the train, validation and test prediction frames, order the rows by
# Date and renumber the index.
# NOTE(review): train_pred_df, val_pred_df and test_pred_df are not defined in
# the visible cells — confirm where they are created.
result = (
    pd.concat([train_pred_df, val_pred_df, test_pred_df])
    .sort_values(by='Date')
    .reset_index(drop=True)
)

# output result
result.to_csv(path + 'chatgpt_stock_prediction_classification.csv', index=False)