In [2]:
from langchain_community.document_loaders import GitLoader

# Repository to analyse, the local path handed to the loader, and the branch.
REPO_URL = "https://github.com/rjwharry/todo"
REPO_DIR = "./repo/todo"
REPO_BRANCH = "main"

# Only configures the loader; the documents are produced by loader.load().
loader = GitLoader(clone_url=REPO_URL, repo_path=REPO_DIR, branch=REPO_BRANCH)

In [3]:
# Load every file of the repository as one Document each.
data = loader.load()

# A bare expression in the middle of a cell is evaluated and then discarded
# (only a cell's final expression is displayed), so the document count must be
# printed explicitly to show up in the output.
print(len(data))

# Peek at the first document to see what page_content / metadata look like.
print(data[0])

page_content='{
  "parser": "@typescript-eslint/parser",
  "plugins": ["@typescript-eslint"],
  "extends": [
    // "airbnb",
    // "airbnb/hooks",
    "plugin:@typescript-eslint/recommended",
    "plugin:prettier/recommended"
  ],
  "parserOptions": {
    "project": "./tsconfig.json"
  },
  "rules": {
  },
  "settings": {
    "import/parsers": {
      "@typescript-eslint/parser": [".ts", ".tsx"]
    },
    "import/resolver": {
      "typescript": {
        "alwaysTryTypes": true // always try to resolve types under `<root>@types` directory even it doesn't contain any source code, like `@types/unist`
      }
    },
    "project": [
      "./tsconfig.json"
    ]
  }
}' metadata={'source': 'dnd-practice/.eslintrc.json', 'file_path': 'dnd-practice/.eslintrc.json', 'file_name': '.eslintrc.json', 'file_type': '.json'}


In [4]:
# Keep only the documents whose "source" path lies under backend/.
backend = [doc for doc in data if doc.metadata["source"].startswith("backend/")]

In [5]:
# Number of documents that passed the backend/ filter.
backend_doc_count = len(backend)
print(backend_doc_count)

20


In [6]:
# Inspect the last backend document; as the cell's final expression, its repr
# is what the notebook displays.
backend[-1]

Document(metadata={'source': 'backend/kotlin/src/main/kotlin/com/practice/kotlin/services/TodoService.kt', 'file_path': 'backend/kotlin/src/main/kotlin/com/practice/kotlin/services/TodoService.kt', 'file_name': 'TodoService.kt', 'file_type': '.kt'}, page_content='package com.practice.kotlin.services\n\nimport com.practice.kotlin.dto.TodoDto\nimport com.practice.kotlin.repositories.STATUS\nimport com.practice.kotlin.repositories.Todo\nimport com.practice.kotlin.repositories.TodoRepository\nimport org.springframework.stereotype.Service\n\n@Service\nclass TodoService(private val todoRepository: TodoRepository) {\n\tfun getAllTodos(): List<Todo> {\n\t\treturn todoRepository.findAll()\n\t}\n\n\tfun getTodoById(id: Long): Todo {\n\t\treturn todoRepository.findById(id).get()\n\t}\n\n\n\tfun createTodo(todo: Todo): Todo {\n\t\tval lastTodo = getLastTodoByStatus(todo.status)\n\t\tval newTodo = todoRepository.save(todo)\n\t\tnewTodo.prev = lastTodo?.id\n\t\tnewTodo.next = null\n\t\ttodoRepositor

In [7]:
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv; the call's return value is the
# cell output.
# NOTE(review): presumably this provides the Google API key that the Gemini
# client created later relies on — confirm the expected variable name.
load_dotenv()

True

In [8]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model that the analysis chain pipes the prompt into.
GEMINI_MODEL = "models/gemini-1.5-flash-latest"

llm = ChatGoogleGenerativeAI(
    model=GEMINI_MODEL,
    streaming=True,
)

  from .autonotebook import tqdm as notebook_tqdm
I0000 00:00:1721817529.522144   28250 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
I0000 00:00:1721817529.541905   28250 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1721817529.543461   28250 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [9]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Schema for a single cloud instance type. Every field is a free-form string.
# This model is reachable from Result, which is handed to JsonOutputParser, so
# the Field descriptions below end up in the format instructions sent to the
# model: treat them as prompt text, not as documentation.
# NOTE(review): kept as a comment rather than a class docstring because a
# docstring would presumably be added to the generated schema — confirm before
# converting.
class Instance(BaseModel):
    name: str = Field(description="Name of instance type")
    cpu: str = Field(description="The number of instance cpu cores")
    memory: str = Field(description="The capacity of instance memory (GiB)")
    storage: str = Field(description="The capacity of instance storage (GiB)")
    gpu: str = Field(description="Name of gpu model and memory (GiB) of instance")

# One estimated quantity expressed at two time scales (per month, per hour).
# Estimate reuses this model for pricing, power consumption and carbon
# footprint, so the unit comes from the referencing field's description, not
# from here. Both values are free-form strings.
class Cost(BaseModel):
    monthly: str = Field(description="Estimated monthly measurements")
    hourly: str = Field(description="Estimated measurements per hour")

# Full estimate for one cloud provider: the chosen instance plus three Cost
# pairs (price in USD, power in kWh, carbon in kg CO2) and a free-text
# rationale. The units are stated only in the Field descriptions, which are
# part of the text the parser emits as format instructions.
class Estimate(BaseModel):
    instance: Instance = Field(description="Instance information of Cloud Provider")
    pricing: Cost = Field(description="Estimated pricing while running an instance of the instance_type (USD)")
    power_consumption: Cost = Field(description="Estimated power consumption while running an instance of the instance_type(kWh)")
    carbon_footprint: Cost = Field(description="Estimated carbon footprint while running an instance of the instance_type(kg CO2)")
    description: str = Field(description="A rationale and detailed explanation for estimations")
    

# Top-level shape requested from the model: one Estimate per provider plus a
# `conclusion`. Note that `conclusion` is itself a complete Estimate (the
# preferred option), not a provider name or a short verdict.
class Result(BaseModel):
    gcp: Estimate = Field(description="Estimated Result of Google Cloud Platform(GCP)")
    aws: Estimate = Field(description="Estimated Result of Amazon Web Services(AWS)")
    azure: Estimate = Field(description="Estimated Result of Microsoft Azure")
    conclusion: Estimate = Field(description="The most appropriate among gcp, aws, and azure")

# JSON parser bound to the Result schema; it is the last stage of the chain
# and also provides the format instructions injected into the prompt.
output_parser = JsonOutputParser(pydantic_object=Result)
# Alternative kept for reference: a parser with no schema attached.
# output_parser = JsonOutputParser()

In [10]:
from langchain.prompts import PromptTemplate
# Prompt text. {GITHUB} is supplied per call; {format_instruction} is filled
# in once, below, from the output parser.
ANALYSIS_TEMPLATE = """
As an expert in analyzing software repositories and estimating resource consumption and environmental impact, your task is to provide a comprehensive analysis of a GitHub repository.

Instructions
Analyze the structure of the files in the provided GitHub repository by examining the metadata in the documentation.
Identify the entry point of the repository.
Determine the minimum resources required to run the repository on GCP, AWS, and Azure platforms.
Estimate the instance cost, power consumption, and carbon footprint on each platform.
Context
The goal is to gain a detailed understanding of the repository’s requirements and its environmental impact. Your analysis should be thorough, taking into account all relevant aspects of the repository and the different cloud platforms.

Output
Ensure the output is structured in a clear and detailed manner, adhering to the JSON format specified by the following guide: {format_instruction}

GitHub Repository
{GITHUB}

Additional Guidelines
Be specific and detailed in your analysis.
Provide calculations and assumptions used in estimating resources, costs, and environmental impact.
Compare and contrast the findings across the three cloud platforms (GCP, AWS, Azure).
Use technical terminology appropriately to convey precision and expertise.
"""

# Schema guidance generated by the parser, bound to the prompt up front.
format_instructions = output_parser.get_format_instructions()

prompt = PromptTemplate(
    template=ANALYSIS_TEMPLATE,
    input_variables=["GITHUB"],
    partial_variables={"format_instruction": format_instructions},
)

# Pipeline: fill the prompt -> call the model -> parse the reply as JSON.
chain = prompt | llm | output_parser

In [11]:
def format_documents(docs):
    """Render loaded documents as plain text for the {GITHUB} prompt slot.

    Substituting the raw list into the string template would insert the Python
    repr of the list (``[Document(metadata=..., page_content='...\\n...')]``):
    newlines inside the files are escaped and every file is wrapped in repr
    noise. Here each file instead becomes a header line with its path followed
    by its unmodified content.

    Args:
        docs: Iterable of objects exposing ``metadata["source"]`` and
            ``page_content``.

    Returns:
        A single string with one section per document, separated by blank
        lines; an empty string when ``docs`` is empty.
    """
    return "\n\n".join(
        f"### {doc.metadata['source']}\n{doc.page_content}" for doc in docs
    )


# To receive partial results instead, iterate over chain.stream(...) with the
# same input dictionary.
chain.invoke({"GITHUB": format_documents(backend)})

{'gcp': {'instance': {'name': 'n1-standard-1',
   'cpu': '1 vCPU',
   'memory': '3.75 GiB',
   'storage': '100 GiB',
   'gpu': 'None'},
  'pricing': {'monthly': '$14.88', 'hourly': '$0.0104'},
  'power_consumption': {'monthly': '216 kWh', 'hourly': '1.5 kWh'},
  'carbon_footprint': {'monthly': '17.3 kg CO2', 'hourly': '1.2 kg CO2'},
  'description': "GCP's n1-standard-1 instance is chosen for its balance of processing power and cost-effectiveness. The instance provides 1 vCPU and 3.75 GiB of memory, suitable for running the Kotlin Spring Boot application. The instance also includes 100 GiB of persistent disk for storing data and logs. The monthly cost is calculated based on the on-demand pricing of the instance, and the power consumption is estimated based on the instance's specifications. The carbon footprint is calculated based on the power consumption and the average carbon intensity of the GCP data centers."},
 'aws': {'instance': {'name': 't3.micro',
   'cpu': '1 vCPU',
   'memory