# 300_gpt_tuning_v2

> In this notebook we will fine tune multiple GPT 3.5 models with what we learned from the first set and with progressive and conservative models seperate.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import openai
import json
from googleapiclient import discovery
from googleapiclient.errors import HttpError


Enable apis with keys. 

In [2]:
with open("C:/Users/danie/OneDrive/Desktop/buffed_perspective_api_key_.txt") as f:
    api_key = f.readline()

perspective_client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=api_key,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

with open("C:/Users/danie/OneDrive/Desktop/openai_youtube_api_key.txt") as f:
    api_key = f.readline()

openai.api_key = api_key

Set up helper functions:

In [3]:
# Quantifier 
def get_toxicity_score(comment):
    analyze_request = {
        'comment': { 'text': comment },
        'requestedAttributes': {'TOXICITY': {}}
    }
    
    try:
        response = perspective_client.comments().analyze(body=analyze_request).execute()
        return float(response['attributeScores']['TOXICITY']['summaryScore']['value'])
    except HttpError as e:
        return [e.resp.status]

In [4]:
def format_comment_for_finetuning(row):
    affil = row.affiliation
    affil = 'conservative' if affil == 'R' else 'progressive'
    title = row.video_title
    desc = row.video_description
    comment = row.comment_text
    formatted = {"messages": [{"role": "system", "content": f"You are a {affil} American political commentator."},
                              {"role": "user", "content": f"You just watched the youtube video '{title}' with the description '{desc}'. Give your opinion on the subject matter."},
                              {"role": "assistant", "content": comment}]}
    return json.dumps(formatted)

In [12]:
def create_training_file(df, affil):
    size = df.shape[0]
    if affil == 'l':
        path = f'../data/cleaned/progressive_size_{size}_train.jsonl'
    else:
        path = f'../data/cleaned/conservative_size_{size}_train.jsonl'
        
    ft_train = open(path, mode='w')

    for i in range(size):
        row_dic = format_comment_for_finetuning(df.iloc[i,:])
        ft_train.write(row_dic)
        ft_train.write('\n')
    
    print(f"Successfully wrote size {affil} {size} training file.")
    
    return path

            

In [6]:
comments_df = pd.read_csv("../data/cleaned/channel_subset_with_comments.csv", index_col='comment_id')
comments_df.columns

Index(['channel_id', 'channel_title', 'affiliation', 'video_id', 'video_title',
       'video_description', 'comment_text'],
      dtype='object')

In [24]:
# Right data will be from Turning Point USA
right_comments = comments_df.loc[comments_df.affiliation == 'R']
# Left comments will be from The Young Turks 
left_comments =comments_df.loc[comments_df.affiliation == "L"]

print(right_comments.info())
print(left_comments.info())

<class 'pandas.core.frame.DataFrame'>
Index: 33762 entries, Ugxtc_P8-7EmyHXAaF14AaABAg to UgxzzRFwm9vU6PRQtcl4AaABAg.9IsI0tTqTMR9IsJNADMXAb
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   channel_id         33762 non-null  object
 1   channel_title      33762 non-null  object
 2   affiliation        33762 non-null  object
 3   video_id           33762 non-null  object
 4   video_title        33762 non-null  object
 5   video_description  33762 non-null  object
 6   comment_text       33762 non-null  object
dtypes: object(7)
memory usage: 2.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 42706 entries, UgzxGyGWCRYlN6MsV5l4AaABAg to UgwWdIXwRecT8RRGoZ14AaABAg.9IAyY08RTtV9IB-WsSjP9A
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   channel_id         42706 non-null  object
 1   channel_title      42706 non-null  object


In [26]:
model_sizes = np.logspace(2, 6, 6, base=5).astype(int).tolist()
model_sizes

[25, 90, 328, 1189, 4311, 15625]

In [14]:

created_files = []
for N in model_sizes:
    l = left_comments.sample(n=N)
    r = right_comments.sample(n=N)
    
    created_files.append(create_training_file(l, 'l'))
    created_files.append(create_training_file(r, 'r'))
    
print(created_files)

Successfully wrote size l 5 training file.
Successfully wrote size r 5 training file.
Successfully wrote size l 25 training file.
Successfully wrote size r 25 training file.
Successfully wrote size l 125 training file.
Successfully wrote size r 125 training file.
Successfully wrote size l 625 training file.
Successfully wrote size r 625 training file.
Successfully wrote size l 3125 training file.
Successfully wrote size r 3125 training file.
['../data/cleaned/progressive_size_5_train.jsonl', '../data/cleaned/conservative_size_5_train.jsonl', '../data/cleaned/progressive_size_25_train.jsonl', '../data/cleaned/conservative_size_25_train.jsonl', '../data/cleaned/progressive_size_125_train.jsonl', '../data/cleaned/conservative_size_125_train.jsonl', '../data/cleaned/progressive_size_625_train.jsonl', '../data/cleaned/conservative_size_625_train.jsonl', '../data/cleaned/progressive_size_3125_train.jsonl', '../data/cleaned/conservative_size_3125_train.jsonl']


In [15]:
'../data/cleaned/progressive_size_5_train.jsonl'.index('progressive')

16

In [21]:
file_api_ids = []

for f,s in zip(created_files, [x for x in model_sizes for _ in range(2)]):
    train_response = openai.File.create(
        file=open(f, "rb"),
        purpose='fine-tune',
        user_provided_filename=f'{f[16:f.index("_size")]}_size_{s}_training_examples'
    )
    file_api_ids.append(train_response['id'])


In [23]:
job_ids = []
for f_id in file_api_ids:
    response_1 = openai.FineTuningJob.create(
        training_file=f_id,
        model="gpt-3.5-turbo"
    )

    response_2 = openai.FineTuningJob.retrieve(response_1['id'])
    job_ids.append(response_2['id'])

RateLimitError: This fine-tune request has been rate-limited. Your organization has reached the maximum of 3 active requests (0 running, 3 pending) for the model 'gpt-3.5-turbo-0613'.

In [None]:
for j_id in job_ids:
    print(openai.FineTuningJob.retrieve(j_id))