In [1]:
import os
import openai
import pandas as pd


from getpass import getpass
from langchain.llms import OpenAI
from pyspark.sql import SparkSession
from langchain.agents import create_spark_dataframe_agent
from langchain_addons.agents import create_multiple_spark_dataframes_agent

In [2]:
# Display settings: never truncate columns when a frame is rendered.
pd.set_option("display.max_columns", None)

# Prompt for the OpenAI key without echoing it, then expose it both as an
# environment variable and as the `openai` module attribute.
openai_key = getpass("Insert your key:")
os.environ['OPENAI_API_KEY'] = openai_key
openai.api_key = openai_key
del openai_key  # do not leave the secret behind as a notebook global

Insert your key: ········


In [3]:
# Location of the HR CSV, relative to the notebook's working directory.
csv_file_path = "data/hr_ibm.csv"

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# The first row supplies the column names; column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(csv_file_path)
)

# Preview: only three rows are converted to pandas for display.
df.limit(3).toPandas()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0


In [4]:
# Completion model used by the agent below.
MODEL_ID = "text-davinci-003"

# temperature=0 requests the least random sampling the API offers.
model = OpenAI(model=MODEL_ID, temperature=0)

# Agent bound to the single HR DataFrame; verbose=True prints the
# thought/action/observation trace while it runs.
agent = create_spark_dataframe_agent(llm=model, df=df, verbose=True)

agent.run("how many rows are there?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find out how many rows are in the dataframe
Action: python_repl_ast
Action Input: df.count()[0m
Observation: [36;1m[1;3m1470[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: There are 1470 rows in the dataframe.[0m

[1m> Finished chain.[0m


'There are 1470 rows in the dataframe.'

## Multiple datasets

In [5]:
# Two small demo frames that share the `cnpj` column, so they can be joined.
df_empresa_1 = spark.createDataFrame([(1, 'a'), (2, 'b')], ['cnpj', 'classe'])
df_empresa_2 = spark.createDataFrame([(1, 'x'), (2, 'y'), (3, 'z')], ['cnpj', 'value'])

# Each frame is registered under a name; the prompts in the following cells
# refer to the frames by these names.
my_dfs_dict = {'df_empresa_1': df_empresa_1, 'df_empresa_2': df_empresa_2}

# Use the MODEL_ID constant defined in the single-DataFrame section instead of
# repeating the literal, so both agents always run on the same model.
model = OpenAI(temperature=0, model=MODEL_ID)

# NOTE: this rebinds `agent`; from here on it is the multi-DataFrame agent.
agent = create_multiple_spark_dataframes_agent(
    llm=model,
    dataframes_dict=my_dfs_dict,
    verbose=True
)

In [6]:
# Ask the multi-DataFrame agent about one specific frame, addressed by its
# dictionary key. Expected answer: 2 rows.
agent.run("how many rows are in df_empresa_1?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find out how many rows are in the dataframe
Action: python_repl_ast
Action Input: df_empresa_1.count()[0m
Observation: [36;1m[1;3m2[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: There are 2 rows in df_empresa_1.[0m

[1m> Finished chain.[0m


'There are 2 rows in df_empresa_1.'

In [7]:
# Same question for the second frame, to check the agent picks the right
# DataFrame by name. Expected answer: 3 rows.
agent.run("how many rows are in df_empresa_2?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find out how many rows are in the dataframe
Action: python_repl_ast
Action Input: len(df_empresa_2.collect())[0m
Observation: [36;1m[1;3m3[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: There are 3 rows in df_empresa_2.[0m

[1m> Finished chain.[0m


'There are 3 rows in df_empresa_2.'

In [13]:
# Prompt (Portuguese): "join the two tables on their common column and save the
# result to data/spark_join.csv, keeping the column names".
# Side effect: the agent's generated code writes to data/spark_join.csv on disk.
# NOTE(review): the agent may return the code it executed as its final answer
# rather than a natural-language summary.
agent.run("faça o join na coluna em comum das duas tabelas e salve o resultado em data/spark_join.csv mantendo o nome das colunas")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to join the two dataframes on the common column
Action: python_repl_ast
Action Input: df_empresa_1.join(df_empresa_2, on='cnpj').write.csv('data/spark_join.csv', header=True)[0m
Observation: [36;1m[1;3mAnalysisException: [PATH_ALREADY_EXISTS] Path file:/home/jovyan/project/data/spark_join.csv already exists. Set mode as "overwrite" to overwrite the existing path.[0m
Thought:[32;1m[1;3m I need to set the mode to overwrite
Action: python_repl_ast
Action Input: df_empresa_1.join(df_empresa_2, on='cnpj').write.mode('overwrite').csv('data/spark_join.csv', header=True)[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: df_empresa_1.join(df_empresa_2, on='cnpj').write.mode('overwrite').csv('data/spark_join.csv', header=True)[0m

[1m> Finished chain.[0m


"df_empresa_1.join(df_empresa_2, on='cnpj').write.mode('overwrite').csv('data/spark_join.csv', header=True)"

In [15]:
# Read the join result back from disk to verify what the agent wrote.
# It is bound to its own name so that the HR DataFrame held by `df` is not
# overwritten; re-running the single-DataFrame agent cell would otherwise
# silently operate on the join result.
# No schema inference is requested here, so every column is read as a string.
df_join = spark.read.csv("data/spark_join.csv", header=True)

df_join.show()

+----+------+-----+
|cnpj|classe|value|
+----+------+-----+
|   1|     a|    x|
|   2|     b|    y|
+----+------+-----+

