In [7]:
# Test 1: Check whether ChatGPT can tell clearly distinct categories apart

# ['Pothole',
#  'Missed Collection: Garbage',
#  'Street Light Out',
#  'Interior of Building in Disrepair',
#  'Sanitation Inspector Notification']

# Test 2: Check whether ChatGPT can tell closely related categories apart

# ['Garbage Cart: Delete',
# 'Garbage Cart: Additional',
# 'Garbage Cart: Damaged',
# 'Garbage Cart: Missing',
# 'Garbage Cart: No Cart']

# Additional Test A: Check whether ChatGPT can extract specific information from the description (for example, the street)

# General prompt

# "You have this list of categories " + str(all_categories) + 
# " and the following message, representing a ticket\n\n" + str(df_cleaned.iloc[i].CASECLOSUREREASONDESCRIPTION) + 
# "\n\ncan you identify the category of the ticket and the street in which the problem occurred?" + 
# "Please answer like this: <name of the category>; <name of street>\n" +
# "Please do not use any other syntactic sugar."


In [8]:
import os

import openai
import pandas as pd

# OpenAI client configuration.
# The organization id is taken from the OPENAI_ORGANIZATION environment
# variable when it is set, so the notebook can be shared or re-run under a
# different account without editing the code. The previously hard-coded value
# is kept as the fallback, so behaviour is unchanged when the variable is unset.
openai.organization = os.environ.get("OPENAI_ORGANIZATION", "org-pzwdKxabuPnrc0kMWRWci2qK")
# The API key is not stored in the notebook; the library is pointed at a local
# file instead. The path is relative, so it is resolved against the kernel's
# working directory.
openai.api_key_path = "OPENAI_API_KEY.txt"

# Category and street dataset processing

In [9]:
# Load the raw call-center export; each step below narrows it further and
# prints the remaining row count.
df = pd.read_csv("data/callcenterdatahistorical.csv")
print("Length of the df before filtering: " + str(len(df)))

# Step 1: keep tickets whose closure description has more than 15
# whitespace-separated words. Values are stringified first, so a missing
# description counts as a single word and is dropped.
description_word_counts = df['CASECLOSUREREASONDESCRIPTION'].astype(str).str.split().str.len()
df_cleaned = df.loc[description_word_counts > 15]
print("Length of the df after filtering for the length of the case description: " + str(len(df_cleaned)))

# Step 2: keep only categories (TITLE) that still have at least 800 tickets.
title_frequency = df_cleaned.groupby('TITLE')['TITLE'].transform('count')
df_cleaned = df_cleaned.loc[title_frequency >= 800]
print("Length of the df after filtering for the number of examples for each category: " + str(len(df_cleaned)))

# Step 3: keep records whose description mentions a street. The match is
# case-insensitive and looks for the whole words Av / Ave / Avenue / Street,
# or digits followed by an ordinal suffix (st, nd, rd, th).
street_pattern = r'\b(?:Av|Ave|Avenue|Street|\d+(?:st|nd|rd|th))\b'
mentions_street = df_cleaned['CASECLOSUREREASONDESCRIPTION'].str.contains(street_pattern, case=False, na=False)
df_cleaned = df_cleaned.loc[mentions_street]
print("Length of the df after filtering records with the street specified: " + str(len(df_cleaned)))

# Categories that survived all three filters, in order of first appearance.
all_categories = df_cleaned['TITLE'].unique().tolist()
all_categories

Length of the df before filtering: 147514
Length of the df after filtering for the length of the case description: 29991
Length of the df after filtering for the number of examples for each category: 8583
Length of the df after filtering records with the street specified: 2246


['Pothole',
 'Street Light Out',
 'Missed Collection: Garbage',
 'Interior of Building in Disrepair',
 'Garbage Supervisor Notification',
 'Garbage Supervisor Callback',
 'Garbage Cart: Damaged',
 'Sanitation Inspector Notification']

In [10]:
# Build a balanced evaluation set: at most `samples_per_group` tickets per category.
samples_per_group = 32
# Fixed seed so that "Restart Kernel -> Run All" selects the same tickets on
# every run; without it each execution evaluates on a different random sample.
sampling_seed = 42

grouped = df_cleaned.groupby('TITLE', group_keys=False)

# min(...) caps the request at the group size, so a category with fewer than
# `samples_per_group` rows contributes all of its rows instead of making
# DataFrame.sample raise. The same integer seed is used for every group, which
# makes each category's draw deterministic.
df_cleaned = grouped.apply(
    lambda x: x.sample(min(len(x), samples_per_group), random_state=sampling_seed)
)

# Discard the original row labels so the sample is indexed 0..n-1.
# NOTE: df_cleaned is rebound to the sampled frame, so re-running only this
# cell samples from the already reduced data rather than the filtered data.
df_cleaned = df_cleaned.reset_index(drop=True)

# Sanity check: rows per category and total sample size.
print(df_cleaned['TITLE'].value_counts().reset_index())
print(len(df_cleaned))

                               TITLE  count
0              Garbage Cart: Damaged     32
1        Garbage Supervisor Callback     32
2    Garbage Supervisor Notification     32
3  Interior of Building in Disrepair     32
4         Missed Collection: Garbage     32
5                            Pothole     32
6  Sanitation Inspector Notification     32
7                   Street Light Out     32
256


# Subcategories dataset processing

In [11]:
# Reload the raw export from disk (this rebinds `df`) and rebuild the filters
# with a lower per-category threshold for the subcategory experiment.
df = pd.read_csv("data/callcenterdatahistorical.csv")
print("Length of the df before filtering: " + str(len(df)))

# Keep tickets whose closure description has more than 15 whitespace-separated
# words; values are stringified first, so a missing description is dropped.
description_word_counts = df['CASECLOSUREREASONDESCRIPTION'].astype(str).str.split().str.len()
df_new = df.loc[description_word_counts > 15]
print("Length of the df after filtering for the length of the case description: " + str(len(df_new)))

# Keep only categories (TITLE) that still have at least 150 tickets.
title_frequency = df_new.groupby('TITLE')['TITLE'].transform('count')
df_new = df_new.loc[title_frequency >= 150]
print("Length of the df after filtering for the number of examples for each category: " + str(len(df_new)))

# Record every surviving category BEFORE narrowing the frame to the
# "Garbage Cart:" family (matched case-insensitively).
all_categories = df_new['TITLE'].unique().tolist()
is_garbage_cart = df_new['TITLE'].str.contains("Garbage Cart:", case=False, na=False)
df_new = df_new.loc[is_garbage_cart].reset_index(drop=True)
print("Length of the df after filtering records with the category specified: " + str(len(df_new)))

# Collapse the individual "Garbage Cart ..." titles into one parent category,
# appended at the end of the list. This membership test is case-sensitive.
all_categories = [title for title in all_categories if "Garbage Cart" not in title] + ["Garbage Cart"]
print(all_categories)
# The distinct titles left in df_new are the subcategories.
all_subcategories = df_new['TITLE'].unique().tolist()
print(all_subcategories)

Length of the df before filtering: 147514
Length of the df after filtering for the length of the case description: 29991
Length of the df after filtering for the number of examples for each category: 24007
Length of the df after filtering records with the category specified: 2298
['All Other Signs', 'Pothole', 'Missed Collection: Recycling', 'Missed Collection: Garbage', 'Street Light Out', 'Large Items Discarded on Private Property', 'Area Dark', 'Missed Collection: Additional Items', 'Garbage Supervisor Notification', 'Exterior of Building in Disrepair', 'Recycling Supervisor Notification', 'Clogged Catch Basin/Surface Flooding', 'Miscellaneous Sewers Request', 'Interior of Building in Disrepair', 'Scattered Litter and Debris on Private Property', 'Sanitation Inspector Callback', 'Other Miscellaneous Forestry Request', 'Construction Concerns', 'Recycling Cart Collection Request: Cart Not at Collection Point', 'Miscellaneous Street Maintenance Request', 'Brush Pickup Request, Less tha

In [12]:
# Build a balanced evaluation set: at most `samples_per_group` tickets per
# "Garbage Cart" subcategory.
samples_per_group = 32
# Fixed seed so that "Restart Kernel -> Run All" selects the same tickets on
# every run; without it each execution evaluates on a different random sample.
sampling_seed = 42

grouped = df_new.groupby('TITLE', group_keys=False)

# min(...) caps the request at the group size, so a subcategory with fewer than
# `samples_per_group` rows contributes all of its rows instead of making
# DataFrame.sample raise. The same integer seed is used for every group, which
# makes each subcategory's draw deterministic.
# NOTE(review): this rebinds df_cleaned, replacing the sample built earlier
# from the category/street dataset; anything that still needs that sample must
# run before this cell.
df_cleaned = grouped.apply(
    lambda x: x.sample(min(len(x), samples_per_group), random_state=sampling_seed)
)

# Discard the original row labels so the sample is indexed 0..n-1.
df_cleaned = df_cleaned.reset_index(drop=True)

# Sanity check: rows per subcategory and total sample size.
print(df_cleaned['TITLE'].value_counts().reset_index())
print(len(df_cleaned))

                      TITLE  count
0  Garbage Cart: Additional     32
1     Garbage Cart: Damaged     32
2      Garbage Cart: Delete     32
3     Garbage Cart: Missing     32
4     Garbage Cart: No Cart     32
160
