In [5]:
!pip install -q -U google-generativeai



In [74]:
import pathlib
import textwrap

import google.generativeai as genai
import numpy as np
import pandas as pd

from IPython.display import display
from IPython.display import Markdown

In [75]:
# The API key is imported from a separate `config` module rather than written into the notebook.
from config import GOOGLE_API_KEY

# Register the key with the google.generativeai client.
genai.configure(api_key=GOOGLE_API_KEY)

In [8]:
# Print the name of every model that supports the generateContent method.
content_model_names = (
    candidate.name
    for candidate in genai.list_models()
    if 'generateContent' in candidate.supported_generation_methods
)
for model_name in content_model_names:
    print(model_name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


In [76]:
# Model handle used by the generate_content calls in the following cells.
model = genai.GenerativeModel('gemini-pro')

In [14]:
%%time
# Simple test request; the %%time cell magic above reports how long the call takes.
response = model.generate_content("What is the meaning of life?")

CPU times: user 57 ms, sys: 5.14 ms, total: 62.1 ms
Wall time: 4.8 s


In [113]:
# NOTE: make_prompt is defined in a later cell of this notebook; that cell must have been run first.
response = model.generate_content(make_prompt("3", "239"))
# Raw reply text: a markdown table with pipe-separated columns.
response.text

'| Three Digit Code (ISIC v4) | Four Digit Code (ISIC v3.1) | Proportion |\n|---|---|---|\n| 239 | 2692 | 25% |\n| 239 | 2691 | 20% |\n| 239 | 2693 | 15% |\n| 239 | 2694 | 10% |\n| 239 | 2695 | 10% |\n| 239 | 2696 | 5% |\n| 239 | 2699 | 15% |'

In [114]:
from io import StringIO
import csv

# Raw markdown table text returned by the model
data_string = response.text

# Wrap the text in a file-like object so pd.read_csv can read it in the next cell
data_io = StringIO(data_string)
# Prints only the object's repr (type and address), not the table contents
print(data_io)

<_io.StringIO object at 0x000002018B367250>


In [115]:
# read_csv consumes the buffer, so rewind it first; otherwise re-running this cell
# would try to parse an exhausted stream.
data_io.seek(0)
df_2 = pd.read_csv(data_io, delimiter="|")
# The first parsed row is the markdown header separator (|---|---|---|); drop it.
df_2 = df_2.iloc[1:].reset_index(drop=True)
df_2


Unnamed: 0.1,Unnamed: 0,Three Digit Code (ISIC v4),Four Digit Code (ISIC v3.1),Proportion,Unnamed: 4
0,,239,2692,25%,
1,,239,2691,20%,
2,,239,2693,15%,
3,,239,2694,10%,
4,,239,2695,10%,
5,,239,2696,5%,
6,,239,2699,15%,


In [116]:
names = df_2.columns
# The pipes at the start and end of every row produce empty columns that
# read_csv labels "Unnamed: N"; remove them all in one call.
df_2 = df_2.drop(columns=[label for label in names if "Unnamed" in label])
df_2

Unnamed: 0,Three Digit Code (ISIC v4),Four Digit Code (ISIC v3.1),Proportion
0,239,2692,25%
1,239,2691,20%
2,239,2693,15%
3,239,2694,10%
4,239,2695,10%
5,239,2696,5%
6,239,2699,15%


Index(['Unnamed: 0', ' Previous Code in Version 3.1 ',
       ' Proportion of Jobs Now Coded in 239 ', 'Unnamed: 3'],
      dtype='object')
Index([' Previous Code in Version 3.1 ', ' Proportion of Jobs Now Coded in 239 '], dtype='object')


In [100]:
names = df_2.columns
# The last column holds the proportions as text such as " 25% ";
# trim it, drop the percent sign and rescale to a fraction.
proportion_col = names[-1]
df_2[proportion_col] = (
    df_2[proportion_col]
    .str.strip()
    .str.replace("%", "")
    .astype(float)
    / 100
)
df_2

Unnamed: 0,Original ISIC Code,Proportion of Jobs Now Coded in 239
0,2691,0.3
1,2692,0.2
2,2693,0.15
3,2694,0.2
4,2695,0.05
5,2696,0.05
6,2699,0.05


In [122]:
## Function takes the created prompt and returns a dataframe that has proportions showing how each 3 digit code in ISIC 4 is created in ISIC 3.1

def _split_markdown_rows(text):
    """Return the cell rows of a pipe-delimited markdown table, header row first."""
    rows = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if "|" not in line:
            # Prose (or blank lines) the model wrapped around the table.
            continue
        # Remove only the outer pipes so empty cells inside the row are kept.
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        cells = [cell.strip() for cell in line.split("|")]
        if all(cell and set(cell) <= set("-:") for cell in cells):
            # Header separator row such as |---|:--:|---|.
            continue
        rows.append(cells)
    return rows


def get_proportions(prompt, model=None):
    """Send ``prompt`` to the model and return the table in its reply as a DataFrame.

    Parameters
    ----------
    prompt : str
        Prompt text, e.g. the string returned by ``make_prompt``.
    model : object, optional
        Any object with a ``generate_content(prompt)`` method whose result has a
        ``.text`` attribute. When omitted, a ``gemini-pro`` model is configured
        with ``GOOGLE_API_KEY`` from ``config``.

    Returns
    -------
    pandas.DataFrame
        Columns ``three_code`` and ``four_code`` (strings with every non-digit
        character removed) and ``Proportion of Jobs`` (float fraction; values
        given as percentages are divided by 100).

    Raises
    ------
    ValueError
        If the reply does not contain a three-column table with a header row
        and at least one data row, or a proportion is not numeric.
    """
    import pandas as pd

    if model is None:
        import google.generativeai as genai
        from config import GOOGLE_API_KEY

        # Setting the model and API key
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel('gemini-pro')

    # Input string prompt to the model
    response = model.generate_content(prompt)
    rows = _split_markdown_rows(response.text)

    columns = ["three_code", "four_code", "Proportion of Jobs"]
    if len(rows) < 2:
        raise ValueError("Model reply does not contain a table with a header and data rows")
    malformed = [row for row in rows if len(row) != len(columns)]
    if malformed:
        raise ValueError(
            "Expected %d columns in every table row, got: %r" % (len(columns), malformed[0])
        )

    # The first row is the model's own header; replace it with fixed column names.
    df = pd.DataFrame(rows[1:], columns=columns)

    # Making proportion column a float fraction. The scale is decided from the
    # whole column (a percent sign anywhere, or any value above 1), not just row 0.
    raw_proportions = df["Proportion of Jobs"]
    has_percent_sign = raw_proportions.str.contains("%", regex=False).any()
    proportions = (
        raw_proportions.str.replace("%", "", regex=False).str.strip().astype(float)
    )
    if has_percent_sign or proportions.max() > 1:
        proportions = proportions / 100
    df["Proportion of Jobs"] = proportions

    # Strip every non-digit character from the code columns. regex=True is
    # required: pandas >= 2.0 treats the pattern as a literal string by default.
    for code_column in ("three_code", "four_code"):
        df[code_column] = df[code_column].str.replace(r"\D", "", regex=True)
    return df

In [25]:
## Inputs for building a prompt
## Load the correspondence table and the two .txt listings of codes with their descriptions
# Code columns are read as strings rather than numbers.
corr_dtypes = {"ISIC4code": str, "partialISIC4": str, "ISIC31code": str, "partialISIC31": str}
corr_table = pd.read_csv("ISIC4_ISIC31.csv", dtype=corr_dtypes).fillna("")

col_names = ["code", "description"]

ISIC_4 = pd.read_csv("isic4.txt", delimiter="|", dtype={'num': str})
ISIC_4.columns = col_names

ISIC_31 = pd.read_csv("isic31.txt", delimiter="|", dtype={'num': str})
ISIC_31.columns = col_names

In [26]:
# Length of each code string, used to separate 2-, 3- and 4-character codes
code_length_4 = ISIC_4['code'].str.len()
code_length_31 = ISIC_31['code'].str.len()

# One DataFrame per code length, for each of the two listings
ISIC_4_2digit, ISIC_4_3digit, ISIC_4_4digit = (
    ISIC_4[code_length_4 == width] for width in (2, 3, 4)
)
ISIC_3_2digit, ISIC_3_3digit, ISIC_3_4digit = (
    ISIC_31[code_length_31 == width] for width in (2, 3, 4)
)

# Add the first 3 and first 2 characters of the version 4 code to the correspondence table
corr_table['ISIC_4_3d'] = corr_table['ISIC4code'].str.slice(stop=3)
corr_table['ISIC_4_2d'] = corr_table['ISIC4code'].str.slice(stop=2)

In [109]:
def make_prompt(digits, three_code):
    """Build the prompt asking how an ISIC version 4 code splits across ISIC version 3.1 codes.

    Reads the notebook-level frames ``corr_table``, ``ISIC_4_3digit`` and
    ``ISIC_3_4digit``.

    Parameters
    ----------
    digits : str
        Digit count inserted verbatim into the opening sentence (e.g. "3").
    three_code : str
        ISIC version 4 code; matched against ``ISIC_4_3digit["code"]`` and
        ``corr_table["ISIC_4_3d"]``.

    Returns
    -------
    str
        Opening sentence with the code, its description and the number of
        distinct version 3.1 codes; then each version 3.1 code with its
        description and any non-empty ``Detail`` entries; then the question
        asking for a three-column table of proportions.

    Raises
    ------
    IndexError
        If a code has no matching description row (from ``.iloc[0]``).
    KeyError
        If ``three_code`` has no rows in ``corr_table``.
    """
    description_4 = str(ISIC_4_3digit[ISIC_4_3digit["code"] == three_code]["description"].iloc[0])

    codes_31 = corr_table[corr_table["ISIC_4_3d"] == three_code]
    if codes_31.empty:
        # Same exception type the value_counts lookup used to raise for an unknown code.
        raise KeyError(three_code)

    # A version 3.1 code can appear on several correspondence rows; list and count it once.
    unique_codes = codes_31["ISIC31code"].unique()

    parts = [
        "A " + digits + " digit code " + three_code + " which is " + description_4
        + " in ISIC version 4 is comprised of " + str(len(unique_codes))
        + " in ISICs version 3.1, "
    ]

    for code in unique_codes:
        ## Always add the code and its standard description from the ISIC 3.1 listing
        description_31 = ISIC_3_4digit[ISIC_3_4digit['code'] == code]['description'].iloc[0]
        parts.append(code + " which is (" + description_31 + "), ")

        ## Collect every non-empty detail recorded for this code, whether it
        ## appears on one correspondence row or several
        details = [
            detail
            for detail in codes_31.loc[codes_31["ISIC31code"] == code, "Detail"]
            if detail != ""
        ]
        if details:
            ## Each detail gets its own balanced pair of parentheses
            parts.append("and includes (" + "), and (".join(details) + "), ")

    ## The question refers to the requested code, not a fixed one
    parts.append(
        " What is your best estimate of the proportion of jobs now coded in " + three_code
        + " that were in each of the previous codes in version 3.1? Can you give me your best guesses in a table with 3 columns, first the three digit code, then the four digit codes, then the proportions?"
    )
    return "".join(parts)

In [124]:
# End-to-end run for code 239: build the prompt, query the model, print the parsed table.
prompt = make_prompt("3", "239")
df = get_proportions(prompt)
print(df)

  three_code four_code  Proportion of Jobs
0       239      2692                  0.4
1       239      2691                  0.2
2       239      2693                  0.1
3       239      2694                  0.1
4       239      2695                  0.1
5       239      2696                  0.1
6       239      2699                  0.1
