Skip to content

Commit

Permalink
Merge pull request #2 from cosmonaut-nz/local_provider
Browse files Browse the repository at this point in the history
Refactor: Removed usage of IO on file handling. Moved to keeping file content in memory.
  • Loading branch information
avastmick committed Nov 26, 2023
2 parents 863e342 + 3b190a1 commit 1a48954
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 77 deletions.
126 changes: 68 additions & 58 deletions src/review/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,18 @@
use crate::review::data::LanguageFileType;
use linguist::{
container::InMemoryLanguageContainer,
resolver::{resolve_language, Language, Scope},
utils::{is_configuration, is_documentation, is_dotfile, is_vendor},
// error::LinguistError,
resolver::{resolve_language_from_content_str, Language, Scope},
// utils::{is_configuration, is_documentation, is_dotfile, is_vendor},
};
use log::error;
use regex::RegexSet;
use serde::{Deserialize, Serialize};
use std::{
collections::HashMap,
fs::File,
io::{BufRead, BufReader},
path::Path,
ffi::{OsStr, OsString},
sync::Arc,
};
use walkdir::DirEntry;

pub mod predefined {
include!(concat!(env!("OUT_DIR"), "/languages.rs"));
Expand Down Expand Up @@ -66,25 +65,38 @@ pub fn initialize_language_analysis() -> (
(lc, breakdown, rules, docs)
}

/// Describes one file that is being analysed, with its contents held in memory
/// rather than re-read from disk.
///
/// `contents`, `name` and `ext` are the inputs handed to language resolution.
/// `language`, `file_size` and `loc` are optional because they are only known
/// once the file has been analysed.
pub struct FileInfo {
/// The file's contents; these are also used to compute the size and the
/// lines-of-code count.
pub contents: Arc<OsString>,
/// The file's name, passed to language resolution alongside the contents.
pub name: Arc<OsString>,
/// The file's extension, passed to language resolution alongside the name.
pub ext: Arc<OsString>,
/// The resolved language, if analysis has produced one.
pub language: Option<Language>,
/// The size of the file contents, if it has been computed.
pub file_size: Option<u64>,
/// The number of functional (non-comment) lines, if it has been computed.
pub loc: Option<i64>,
}

pub fn analyse_file_language(
entry: &DirEntry,
file_info: &FileInfo,
lc: &InMemoryLanguageContainer,
rules: &RegexSet,
docs: &RegexSet,
) -> Option<(Language, u64, i64, String)> {
let path = entry.path();
let relative_path = entry.path().strip_prefix(path).unwrap();
if is_vendor(entry.path(), rules)
|| is_documentation(relative_path, docs)
|| is_dotfile(relative_path)
|| is_configuration(relative_path)
{
// TODO: handle if is_documentation: if so then work out frequency; higher the count the better for overall RAG
// if no documentation then needs to be in repository summary and flagged as issue
// - i.e. best practice is that documentation is versioned with code, new developers will find it more easily, etc.
return None;
}
let language: &Language = match resolve_language(path, lc) {
_rules: &RegexSet,
_docs: &RegexSet,
) -> Option<(Language, u64, i64)> {
// TODO: resolve the type of file if docs, dotfile, or config
// if is_vendor(entry.path(), rules)
// || is_documentation(relative_path, docs)
// || is_dotfile(relative_path)
// || is_configuration(relative_path)
// {
// // TODO: handle if is_documentation: if so then work out frequency; higher the count the better for overall RAG
// // if no documentation then needs to be in repository summary and flagged as issue
// // - i.e. best practice is that documentation is versioned with code, new developers will find it more easily, etc.
// return None;
// }
let language: &Language = match resolve_language_from_content_str(
file_info.contents.as_os_str(),
file_info.name.as_os_str(),
file_info.ext.as_os_str(),
lc,
) {
Ok(Some(lang)) => lang,
_ => return None,
};
Expand All @@ -93,67 +105,65 @@ pub fn analyse_file_language(
return None;
}

let file_size = match entry.metadata() {
Ok(metadata) => metadata.len(),
Err(_) => return None,
let file_size = match get_file_contents_size(file_info.contents.as_os_str()) {
Ok(size) => size,
Err(e) => {
error!("Error when determining file size: {}", e);
0
}
};
let loc: i64 = count_lines_of_code(path);
let extension = path
.extension()
.unwrap_or_default()
.to_string_lossy()
.to_string();

Some((language.clone(), file_size, loc, extension))
}

// Function to count lines of code in a file
fn count_lines_of_code(file_path: &Path) -> i64 {
let file = match File::open(file_path) {
Ok(file) => file,
let loc: i64 = match count_lines_of_code(&file_info.contents) {
Ok(num_lines) => num_lines,
Err(e) => {
error!("Failed to open file {}: {}", file_path.display(), e);
return 0;
error!("Error when determining lines of code: {}", e);
0
}
};
let reader = BufReader::new(file);

Some((language.clone(), file_size, loc))
}

/// Reports how many bytes the given in-memory file contents occupy.
///
/// # Errors
///
/// * `"Invalid UTF-8 content"` when the contents cannot be viewed as a `&str`.
/// * `"Length conversion error"` when the byte length does not fit in a `u64`.
fn get_file_contents_size(file_contents: impl AsRef<OsStr>) -> Result<u64, &'static str> {
    match file_contents.as_ref().to_str() {
        // The size is the UTF-8 byte length of the text, not its character count.
        Some(text) => u64::try_from(text.len()).map_err(|_| "Length conversion error"),
        None => Err("Invalid UTF-8 content"),
    }
}

/// Function to count lines of code in a file, skipping comments
fn count_lines_of_code(file_content: impl AsRef<OsString>) -> Result<i64, &'static str> {
let content_str = file_content
.as_ref()
.to_str()
.ok_or("Invalid UTF-8 content")?;
let mut is_comment_block = false;
let mut functional_lines = 0;

for line in reader.lines() {
let line = match line {
Ok(line) => line,
Err(e) => {
// Optionally log the error and continue to the next line
error!("Error reading line in {}: {}", file_path.display(), e);
continue;
}
};
for line in content_str.lines() {
let line = line.trim();

// Check for block comment start or end
if line.starts_with("/*") {
is_comment_block = true;
}
if line.ends_with("*/") {
is_comment_block = false;
continue;
}

// Skip the line if it's a comment
if COMMENT_PREFIXES
.iter()
.any(|&prefix| line.starts_with(prefix))
|| is_comment_block
{
continue;
}

functional_lines += 1;
}

functional_lines
Ok(functional_lines)
}

#[derive(Clone, Serialize, Deserialize, Debug, PartialEq)]
Expand Down
90 changes: 71 additions & 19 deletions src/review/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ mod tools;
use crate::provider::api::ProviderCompletionResponse;
use crate::provider::prompts::PromptData;
use crate::provider::review_code_file;
use crate::review::code::{analyse_file_language, initialize_language_analysis};
use crate::review::code::{analyse_file_language, initialize_language_analysis, FileInfo};
use crate::review::data::{FileReview, LanguageFileType, RAGStatus, RepositoryReview};
use crate::review::tools::{get_git_contributors, is_not_blacklisted};
use crate::settings::Settings;
Expand All @@ -16,12 +16,14 @@ use log::{debug, error, info};
use regex::Regex;
use serde::Deserialize;
use std::error::Error;
use std::ffi::OsStr;
use std::fmt;
use std::fs;
use std::io::Write;
use std::path::Path;
use std::path::PathBuf;
use walkdir::WalkDir;
use std::sync::Arc;
use walkdir::{DirEntry, WalkDir};

// TODO: "security_issues": [
// {
Expand All @@ -41,11 +43,12 @@ use walkdir::WalkDir;
// }
// ],

/// Takes the filepath to a repository and iterates over the code, sending each relevant file for review.
/// Takes the filepath to a repository and iterates over the code, gaining stats, and sending each relevant file for review.
///
/// # Parameters
///
/// * `settings` - A [`Settings`] that contains information for the LLM
///
pub async fn assess_codebase(
settings: Settings,
) -> Result<RepositoryReview, Box<dyn std::error::Error>> {
Expand All @@ -69,36 +72,69 @@ pub async fn assess_codebase(
),
)));
}
// First get the repository blacklist to avoid unnecessary file/folder traversal (should be largely derived from .gitignore)
// Get the repository blacklist to avoid unnecessary file/folder traversal (should be largely derived from .gitignore)
let blacklisted_dirs: Vec<String> = tools::get_blacklist_dirs(repository_root);
debug!("BLACKLIST: {:?}", blacklisted_dirs);

let mut overall_file_count: i32 = 0;
let (lc, mut breakdown, rules, docs) = initialize_language_analysis();

// Second review code files
// Fetch files from non-blacklisted dirs
for entry in WalkDir::new(repository_root)
.into_iter()
.filter_entry(|e| is_not_blacklisted(e, &blacklisted_dirs))
.filter_map(|e| e.ok())
.filter(|e| e.file_type().is_file())
{
let path = entry.path();

if let Some((language, file_size, loc, extension)) =
analyse_file_language(&entry, &lc, &rules, &docs)
{
let result: Option<FileInfo> = get_file_info(&entry).and_then(|file_info| {
analyse_file_language(&file_info, &lc, &rules, &docs).map(
|(language, file_size, loc)| FileInfo {
contents: file_info.contents,
name: file_info.name,
ext: file_info.ext,
language: Some(language),
file_size: Some(file_size),
loc: Some(loc),
},
)
});
if let Some(file_info) = result {
overall_file_count += 1;

breakdown.add_usage(&language.name, &extension, file_size, loc);

breakdown.add_usage(
&file_info.language.unwrap().name,
file_info.ext.to_str().unwrap_or_default(),
file_info.file_size.unwrap(),
file_info.loc.unwrap(),
);
#[cfg(debug_assertions)]
if let Some(max_count) = settings.max_file_count {
if overall_file_count >= max_count {
continue;
}
}
match review_file(&settings, path).await {
let contents_str = match file_info.contents.to_str() {
Some(contents) => contents,
None => {
error!("Contents of the code file are not valid UTF-8");
continue;
}
};

let file_name_str = match file_info.name.to_str() {
Some(name) => name,
None => {
error!("File name is not valid UTF-8");
continue;
}
};

match review_file(
&settings,
&file_name_str.to_string(),
&contents_str.to_string(),
)
.await
{
Ok(Some(reviewed_file)) => {
review.add_file_review(reviewed_file);
}
Expand Down Expand Up @@ -148,6 +184,23 @@ pub async fn assess_codebase(
Ok(review)
}

/// Reads a [`walkdir::DirEntry`] into memory and captures its file name and extension.
///
/// The returned [`FileInfo`] has `language`, `file_size` and `loc` set to `None`;
/// they are expected to be filled in after the file has been analysed.
///
/// Returns `None` when the file cannot be read as UTF-8 text (for example a
/// binary file or an I/O error) or when the path has no final file-name component.
/// Files without an extension (e.g. `Makefile`, `Dockerfile`) are still returned,
/// with an empty `ext`, so that they are not silently dropped from the analysis.
fn get_file_info(entry: &DirEntry) -> Option<FileInfo> {
    let path = entry.path();
    let contents = fs::read_to_string(path).ok()?;
    let name = path.file_name()?.to_os_string();
    // A missing extension is not a reason to skip the file: fall back to an empty one.
    let ext = path.extension().unwrap_or_default().to_os_string();

    Some(FileInfo {
        // Convert the owned `String` directly, reusing its buffer instead of copying it.
        contents: Arc::new(std::ffi::OsString::from(contents)),
        name: Arc::new(name),
        ext: Arc::new(ext),
        language: None,
        file_size: None,
        loc: None,
    })
}

/// gives an overall [`RAGStatus`]
fn get_overall_rag_for(review: &RepositoryReview) -> RAGStatus {
let mut total_score = 0;
Expand Down Expand Up @@ -214,9 +267,10 @@ impl ReviewType {
///
async fn review_file(
settings: &Settings,
path: &Path,
code_file_path: &String,
code_file_contents: &String,
) -> Result<Option<FileReview>, Box<dyn std::error::Error>> {
info!("Handling output_file: {}", path.display());
info!("Handling output_file: {}", code_file_path);
// Set up the right provider
let provider: &crate::settings::ProviderSettings = settings.get_active_provider()
.expect("Either a default or chosen provider should be configured in \'default.json\'. \
Expand All @@ -232,9 +286,7 @@ async fn review_file(
}
};

// TODO move this up and have this function expect a FileReview struct instead, which includes the code from the file
let code_from_file: String = fs::read_to_string(path)?;
let review_request: String = format!("File name: {}\n{}\n", path.display(), code_from_file);
let review_request: String = format!("File name: {}\n{}\n", code_file_path, code_file_contents);
// Add the file as PromptData
prompt_data.add_user_message_prompt(review_request);
// debug!("Prompt data sent: {:?}", prompt_data);
Expand Down

0 comments on commit 1a48954

Please sign in to comment.