# Clean data prior to ChatML formatting

In [None]:
import pandas as pd
import json
pd.options.mode.chained_assignment = None

# Load the two raw CSV exports: the application answers first, then the
# panelists' notes and scores.
dfArtistApplicationContent, dfPanelNotesScores = (
    pd.read_csv(csvName)
    for csvName in (
        'Government_Program_Application_Data.csv',
        'Grant_Program_Panelist_Notes_Scores.csv',
    )
)

In [None]:
# Application fields grouped by the panelist review category they align with.
# Each group of answers is joined into one text column per category.
combinedFieldSources = {
    'ArtisticProduct_Application': ['What will you create', 'Project Goals'],
    'ArtisticAccess_Application': ['Public Sharing Component', 'Target Audience', 'Connection to General Public'],
    'ArtisticManagement_Application': ['How will you execute your project', 'Collaborators'],
}

for combinedCol, sourceCols in combinedFieldSources.items():
    # Work on object dtype so `+` is string concatenation for every column.
    answers = dfArtistApplicationContent[sourceCols].astype(object)
    combinedText = answers[sourceCols[0]]
    for sourceCol in sourceCols[1:]:
        # Join with a single space so the end of one answer does not fuse with
        # the start of the next. A missing answer (NaN) still makes the whole
        # combined value NaN, exactly as plain `+` did before.
        combinedText = combinedText + ' ' + answers[sourceCol]
    dfArtistApplicationContent[combinedCol] = combinedText

# Currency columns arrive as text such as "$1,234.00"; convert them to whole
# integer amounts.
moneyFieldsList = ['Request Amount', 'Grant Total Paid']

for col in moneyFieldsList:
    amountText = dfArtistApplicationContent[col].astype(str)
    for symbol in ('$', ','):
        # regex=False is required: as a regular expression '$' is the
        # end-of-string anchor, so on pandas versions where str.replace
        # defaults to regex=True the dollar sign would never be removed and
        # the float conversion below would fail.
        amountText = amountText.str.replace(symbol, '', regex=False)
    # Go through float first so values with a decimal part ("1234.00") parse;
    # the int cast then drops the fractional part.
    dfArtistApplicationContent[col] = amountText.astype(float).astype(int)

In [3]:
# Split the combined per-category score strings into one column per reviewer,
# average them, and build the "score + notes" text used as the review output.

projectsToRemove = ['New works. And rebranding ', 'Avocado Head Man Public Art Installation']

# Take an explicit copy of the filtered rows: the column assignments below
# then write to an independent frame instead of to a slice of the original
# (which only ran quietly because the chained-assignment warning is disabled).
dfPanelNotesScores = dfPanelNotesScores[~dfPanelNotesScores['Project Name'].isin(projectsToRemove)].copy()

# category -> (raw "r1; r2; r3" score column, free-text notes column)
reviewCategories = {
    'Product': ('Quality of the artistic product processor or service 035', 'Notes on Artistic Quality'),
    'Access': ('Access to the Arts 035', 'Notes on Access to the arts'),
    'Management': ('Management 030', 'Notes on Management'),
}

# category -> names of the three per-reviewer score columns,
# e.g. ArtisticProductScore_R1 .. ArtisticProductScore_R3
reviewerScoreCols = {
    category: [f'Artistic{category}Score_R{reviewer}' for reviewer in (1, 2, 3)]
    for category in reviewCategories
}

# 1) One integer column per reviewer; the raw value is split on '; '.
for category, (rawScoreCol, _) in reviewCategories.items():
    dfPanelNotesScores[reviewerScoreCols[category]] = dfPanelNotesScores[rawScoreCol].str.split('; ', expand=True).astype(int)

# 2) Row-wise mean of the three reviewer scores.
# NOTE(review): astype(int) truncates the mean (34.67 -> 34) rather than
# rounding it - confirm that is intended.
for category in reviewCategories:
    dfPanelNotesScores[f'{category.lower()}AvgScore'] = dfPanelNotesScores[reviewerScoreCols[category]].mean(axis=1).astype(int)

# 3) Text combining the average score and the panel notes.
# NOTE(review): if the notes cell is missing (NaN) the whole combined string
# becomes NaN, so the score is lost for that row as well.
for category, (_, notesCol) in reviewCategories.items():
    dfPanelNotesScores[f'Panelist_{category}_Avg'] = 'Avg_Reviewer_Score: ' + dfPanelNotesScores[f'{category.lower()}AvgScore'].astype(str) + '; Reviewer_Notes: ' + dfPanelNotesScores[notesCol]

In [4]:
# Keep only the relevant columns from both dfs, then inner-merge them on
# Project Name so that only projects present in both files remain.
applicationCols = ['Project Name', 'Grant Total Paid', 'Request Amount', 'ArtisticProduct_Application', 'ArtisticAccess_Application', 'ArtisticManagement_Application']
panelCols = ['Project Name', 'Panelist_Product_Avg', 'Panelist_Access_Avg', 'Panelist_Management_Avg']

dfArtistApplicationContent = dfArtistApplicationContent.loc[:, applicationCols]
dfPanelNotesScores = dfPanelNotesScores.loc[:, panelCols]

# Null values are replaced with the literal text 'nothing' because ChatML
# doesn't like null values.
df = dfArtistApplicationContent.merge(dfPanelNotesScores, how='inner', on='Project Name').fillna('nothing')

# Format data in ChatML

In [None]:
# Keep useful columns and assign roles

# System prompts, one per review category. All three share the same opening
# sentence and differ only in the top of the scoring scale and the criteria.
_promptHeader = (
    "You are a grant review panelist. Score the following application on a scale of 0-{top}. "
    "0 being the lowest and {top} being the highest. "
    "Provide a brief explanation of your score based on the following criteria:"
)


def _buildSystemPrompt(top, criteria):
    """Return the header for a 0-`top` scale followed by one criterion per line.

    Every line, including the last one, is terminated by a newline.
    """
    return "\n".join([_promptHeader.format(top=top), *criteria]) + "\n"


systemTextProduct = _buildSystemPrompt(35, [
    "The work deepens its impact by welcoming people of all abilities and backgrounds.",
    "Strong ideas expressed with clarity advance artistic goals.",
    "The emotional and sensory impact of the work engages community.",
    "The creative work demonstrates integrity and ethical use of material with specific cultural origins and context.",
    "A work can disrupt artistic conventions, concepts of beauty, or how the audience engages. Effective disruption is mindful and intentional.",
])

systemTextAccess = _buildSystemPrompt(35, [
    "Does the applicant demonstrate knowledge of the target audience?",
    "Does the applicant demonstrate effective plans and strategies to reach the target audience?",
    "Does the applicant demonstrate plans to reach the general public?",
])

systemTextManagement = _buildSystemPrompt(30, [
    "Evaluate the quality of measurable goals and objectives as indicated by applicant.",
    "Is the budget appropriate?",
    "Does the applicant demonstrate a plan to successfully execute the project?",
    "Does the applicant have collaborators who will help assist in the execution of the project (if applicable)?",
])


def _toChatMLFrame(userCol, assistantCol, systemText):
    """Return a user/assistant/system frame for one review category.

    `userCol` (application text) becomes 'user', `assistantCol` (panel score
    and notes) becomes 'assistant', and every row gets `systemText` as 'system'.
    """
    renamed = df[[userCol, assistantCol]].rename(columns={userCol: 'user', assistantCol: 'assistant'})
    return renamed.assign(system=systemText)


dfChatMLProduct = _toChatMLFrame('ArtisticProduct_Application', 'Panelist_Product_Avg', systemTextProduct)
dfChatMLAccess = _toChatMLFrame('ArtisticAccess_Application', 'Panelist_Access_Avg', systemTextAccess)
dfChatMLManagement = _toChatMLFrame('ArtisticManagement_Application', 'Panelist_Management_Avg', systemTextManagement)

# Stack the three categories (original row labels are kept) and put the
# columns in system / user / assistant order.
dfChatMLConcat = pd.concat([dfChatMLProduct, dfChatMLAccess, dfChatMLManagement])[['system', 'user', 'assistant']]

# Text clean-up, applied one pattern after another: line breaks become
# spaces, double quotes become single quotes, and commas are dropped.
for pattern, replacement in (('\n', ' '), ("\"", "'"), (",", "")):
    dfChatMLConcat = dfChatMLConcat.replace({pattern: replacement}, regex=True)

dfChatMLConcat.to_csv('Grant_App_and_Review_Data_Combined_ChatML.csv', index=False)

In [None]:
# Turn every row into one conversation (system / user / assistant messages)
# and write all conversations to a single JSON document.

# Build the messages as plain Python objects and leave all escaping to the
# json module. Assembling JSON text by hand and parsing it back fails on any
# control character left in the content (a tab, a carriage return, ...) and
# required deleting every backslash from the text; neither applies here, so
# the content is written unaltered.
jsonFormatList = [
    [
        {"role": "system", "content": systemText},
        {"role": "user", "content": userText},
        {"role": "assistant", "content": assistantText},
    ]
    for systemText, userText, assistantText in zip(
        dfChatMLConcat['system'].astype(str),
        dfChatMLConcat['user'].astype(str),
        dfChatMLConcat['assistant'].astype(str),
    )
]

# As before, the name dfChatMLConcat ends up bound to a list holding one
# JSON string per conversation (so this cell cannot be re-run on its own).
dfChatMLConcat = [json.dumps(conversation, ensure_ascii=False) for conversation in jsonFormatList]

wrapped_data = {"conversations": jsonFormatList}

# json.dump escapes non-ASCII characters by default, so the file content is
# plain ASCII regardless of the platform's default encoding.
with open('FineTuningData_JSONFormatted_TopLevel.json', 'w') as outfile:
    json.dump(wrapped_data, outfile, indent=4)

In [None]:
# Location of the JSON document to read back
file_path = "FineTuningData_JSONFormatted_TopLevel.json"

# Read the file's text and parse it into a Python object
with open(file_path, "r") as f:
    data = json.loads(f.read())

# Serialize the object again as a pretty-printed string (two-space indentation)
json_string = json.dumps(data, indent=2)