
fix(npm): make tarball extraction more reliable #23759

Merged · 9 commits · May 14, 2024
45 changes: 32 additions & 13 deletions cli/npm/managed/cache.rs
@@ -25,6 +25,7 @@ use crate::util::fs::hard_link_dir_recursive;
 use crate::util::progress_bar::ProgressBar;
 
 use super::tarball::verify_and_extract_tarball;
+use super::tarball::TarballExtractionMode;
 
 /// Stores a single copy of npm packages in a cache.
 #[derive(Debug)]
@@ -69,7 +70,7 @@ impl NpmCache {
   /// to ensure a package is only downloaded once per run of the CLI. This
   /// prevents downloads from re-occurring when someone has `--reload` and
   /// imports a dynamic import that imports the same package again for example.
-  fn should_use_global_cache_for_package(&self, package: &PackageNv) -> bool {
+  fn should_use_cache_for_package(&self, package: &PackageNv) -> bool {
     self.cache_setting.should_use_for_npm_package(&package.name)
       || !self
         .previously_reloaded_packages
@@ -91,26 +92,22 @@ impl NpmCache {
 
   async fn ensure_package_inner(
     &self,
-    package: &PackageNv,
+    package_nv: &PackageNv,
     dist: &NpmPackageVersionDistInfo,
     registry_url: &Url,
   ) -> Result<(), AnyError> {
     let package_folder = self
       .cache_dir
-      .package_folder_for_name_and_version(package, registry_url);
-    if self.should_use_global_cache_for_package(package)
-      && self.fs.exists_sync(&package_folder)
-      // if this file exists, then the package didn't successfully extract
-      // the first time, or another process is currently extracting the zip file
-      && !self.fs.exists_sync(&package_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME))
-    {
+      .package_folder_for_name_and_version(package_nv, registry_url);
+    let should_use_cache = self.should_use_cache_for_package(package_nv);
+    if should_use_cache && self.fs.exists_sync(&package_folder) {
       return Ok(());
     } else if self.cache_setting == CacheSetting::Only {
       return Err(custom_error(
         "NotCached",
         format!(
           "An npm specifier not found in cache: \"{}\", --cached-only is specified.",
-          &package.name
+          &package_nv.name
         )
       )
     );
@@ -127,7 +124,26 @@ impl NpmCache {
       .await?;
     match maybe_bytes {
       Some(bytes) => {
-        verify_and_extract_tarball(package, &bytes, dist, &package_folder)
+        let extraction_mode = if should_use_cache {
+          TarballExtractionMode::SiblingTempDir
+        } else {
+          // the user ran with `--reload`, so overwrite the package instead of
+          // deleting it since the package might get corrupted if a user kills
+          // their deno process while it's deleting a package directory
+          TarballExtractionMode::Overwrite
+        };
+        let dist = dist.clone();
+        let package_nv = package_nv.clone();
+        deno_core::unsync::spawn_blocking(move || {
+          verify_and_extract_tarball(
+            &package_nv,
+            &bytes,
+            &dist,
+            &package_folder,
+            extraction_mode,
+          )
+        })
+        .await?
       }
       None => {
         bail!("Could not find npm package tarball at: {}", dist.tarball);
@@ -150,7 +166,7 @@ impl NpmCache {
       .package_folder_for_id(folder_id, registry_url);
 
     if package_folder.exists()
-      // if this file exists, then the package didn't successfully extract
+      // if this file exists, then the package didn't successfully initialize
       // the first time, or another process is currently extracting the zip file
       && !package_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME).exists()
       && self.cache_setting.should_use_for_npm_package(&folder_id.nv.name)
@@ -161,6 +177,9 @@ impl NpmCache {
     let original_package_folder = self
       .cache_dir
       .package_folder_for_name_and_version(&folder_id.nv, registry_url);
+
+    // it seems Windows does an "AccessDenied" error when moving a
+    // directory with hard links, so that's why this solution is done
     with_folder_sync_lock(&folder_id.nv, &package_folder, || {
       hard_link_dir_recursive(&original_package_folder, &package_folder)
     })?;
@@ -206,7 +225,7 @@ impl NpmCache {
 
 const NPM_PACKAGE_SYNC_LOCK_FILENAME: &str = ".deno_sync_lock";
 
-pub fn with_folder_sync_lock(
+fn with_folder_sync_lock(
   package: &PackageNv,
   output_folder: &Path,
   action: impl FnOnce() -> Result<(), AnyError>,
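For context: the body of `with_folder_sync_lock` is unchanged by this PR and not shown above. A rough sketch of the locking pattern the surrounding code implies — a marker file that exists only while folder initialization is in progress — might look like this (the error-path cleanup is omitted and the details are assumptions, not the actual implementation):

```rust
use std::path::Path;
use deno_core::error::AnyError;

// Sketch only: the real with_folder_sync_lock lives in cache.rs and is not
// part of this diff; its exact error handling is assumed here.
fn with_folder_sync_lock_sketch(
  output_folder: &Path,
  action: impl FnOnce() -> Result<(), AnyError>,
) -> Result<(), AnyError> {
  std::fs::create_dir_all(output_folder)?;
  // while this marker exists, the folder is either mid-initialization or was
  // abandoned by a killed process, so readers treat it as not cached
  let lock_file = output_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME);
  std::fs::write(&lock_file, "")?;
  action()?;
  std::fs::remove_file(&lock_file)?; // only a finished folder has no lock file
  Ok(())
}
```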
65 changes: 59 additions & 6 deletions cli/npm/managed/tarball.rs
@@ -2,12 +2,14 @@
 
 use std::collections::HashSet;
 use std::fs;
+use std::io::ErrorKind;
 use std::path::Path;
 use std::path::PathBuf;
 
 use base64::prelude::BASE64_STANDARD;
 use base64::Engine;
 use deno_core::anyhow::bail;
+use deno_core::anyhow::Context;
 use deno_core::error::AnyError;
 use deno_npm::registry::NpmPackageVersionDistInfo;
 use deno_npm::registry::NpmPackageVersionDistInfoIntegrity;
@@ -16,19 +18,70 @@ use flate2::read::GzDecoder;
 use tar::Archive;
 use tar::EntryType;
 
-use super::cache::with_folder_sync_lock;
+use crate::util::path::get_atomic_dir_path;
 
+#[derive(Debug, Copy, Clone)]
+pub enum TarballExtractionMode {
+  /// Overwrites the destination directory without deleting any files.
+  Overwrite,
+  /// Creates and writes to a sibling temporary directory. When done, moves
+  /// it to the final destination.
+  ///
+  /// This is more robust than `Overwrite` as it better handles multiple
+  /// processes writing to the directory at the same time.
+  SiblingTempDir,
+}
+
 pub fn verify_and_extract_tarball(
-  package: &PackageNv,
+  package_nv: &PackageNv,
   data: &[u8],
   dist_info: &NpmPackageVersionDistInfo,
   output_folder: &Path,
+  extraction_mode: TarballExtractionMode,
 ) -> Result<(), AnyError> {
-  verify_tarball_integrity(package, data, &dist_info.integrity())?;
+  verify_tarball_integrity(package_nv, data, &dist_info.integrity())?;
 
-  with_folder_sync_lock(package, output_folder, || {
-    extract_tarball(data, output_folder)
-  })
+  match extraction_mode {
+    TarballExtractionMode::Overwrite => extract_tarball(data, output_folder),
+    TarballExtractionMode::SiblingTempDir => {
+      let temp_dir = get_atomic_dir_path(output_folder);
+      extract_tarball(data, &temp_dir)?;
+      rename_with_retries(&temp_dir, output_folder)
+        .map_err(AnyError::from)
+        .context("Failed moving extracted tarball to final destination.")
+    }
+  }
+}
+
+fn rename_with_retries(
+  temp_dir: &Path,
+  output_folder: &Path,
+) -> Result<(), std::io::Error> {
+  let mut count = 0;
+  // renaming might be flaky if a lot of processes are trying
+  // to do this, so retry a few times
+  loop {
+    match fs::rename(temp_dir, output_folder) {
+      Ok(_) => return Ok(()),
+      Err(err) if err.kind() == ErrorKind::AlreadyExists => {
+        // another process copied here, just cleanup
+        let _ = fs::remove_dir_all(temp_dir);
+        return Ok(());
+      }
+      Err(err) => {
+        count += 1;
+        if count > 5 {
+          // too many tries, cleanup and return the error
+          let _ = fs::remove_dir_all(temp_dir);
+          return Err(err);
+        }
+
+        // wait a bit before retrying... this should be very rare or only
+        // in error cases, so ok to sleep a bit
+        let sleep_ms = std::cmp::min(100, 20 * count);
+        std::thread::sleep(std::time::Duration::from_millis(sleep_ms));
+      }
+    }
+  }
+}
 
 fn verify_tarball_integrity(
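The net effect of `SiblingTempDir` is extract-then-rename: a crash mid-extract leaves only an orphaned temp directory, never a half-written package folder. A minimal caller sketch, reusing the types already imported in this file (`data` and `dist_info` would come from the registry client):

```rust
// Minimal sketch of the SiblingTempDir flow using the function above;
// errors simply propagate to the caller.
fn extract_reliably(
  package_nv: &PackageNv,
  data: &[u8],
  dist_info: &NpmPackageVersionDistInfo,
  output_folder: &Path,
) -> Result<(), AnyError> {
  // extraction happens in ".<folder-name>_<rand>" next to the destination,
  // then the temp dir is renamed into place (with retries on contention)
  verify_and_extract_tarball(
    package_nv,
    data,
    dist_info,
    output_folder,
    TarballExtractionMode::SiblingTempDir,
  )
}
```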
12 changes: 2 additions & 10 deletions cli/util/fs.rs
@@ -2,7 +2,6 @@
 
 use std::collections::HashSet;
 use std::env::current_dir;
-use std::fmt::Write as FmtWrite;
 use std::fs::FileType;
 use std::fs::OpenOptions;
 use std::io::Error;
@@ -23,12 +22,12 @@ use deno_core::error::AnyError;
 pub use deno_core::normalize_path;
 use deno_core::unsync::spawn_blocking;
 use deno_core::ModuleSpecifier;
-use deno_runtime::deno_crypto::rand;
 use deno_runtime::deno_fs::FileSystem;
 use deno_runtime::deno_node::PathClean;
 
 use crate::util::gitignore::DirGitIgnores;
 use crate::util::gitignore::GitIgnoreTree;
+use crate::util::path::get_atomic_file_path;
 use crate::util::progress_bar::ProgressBar;
 use crate::util::progress_bar::ProgressBarStyle;
 use crate::util::progress_bar::ProgressMessagePrompt;
@@ -56,14 +55,7 @@ pub fn atomic_write_file<T: AsRef<[u8]>>(
 }
 
 fn inner(file_path: &Path, data: &[u8], mode: u32) -> std::io::Result<()> {
-  let temp_file_path = {
-    let rand: String = (0..4).fold(String::new(), |mut output, _| {
-      let _ = write!(output, "{:02x}", rand::random::<u8>());
-      output
-    });
-    let extension = format!("{rand}.tmp");
-    file_path.with_extension(extension)
-  };
+  let temp_file_path = get_atomic_file_path(file_path);
 
   if let Err(write_err) =
     atomic_write_file_raw(&temp_file_path, file_path, data, mode)
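`atomic_write_file_raw` is not shown in this diff; the helper it supports follows the common write-then-rename pattern. A simplified sketch assuming only the standard library (the real code also handles permissions and missing parent directories, which this omits):

```rust
use std::path::Path;

// Simplified sketch of the atomic-write idea; not the actual
// atomic_write_file_raw implementation.
fn atomic_write_sketch(file_path: &Path, data: &[u8]) -> std::io::Result<()> {
  // write to a randomly named sibling temp file first...
  let temp_file_path = get_atomic_file_path(file_path);
  std::fs::write(&temp_file_path, data)?;
  // ...then swap it into place, so readers never observe a partial file
  std::fs::rename(&temp_file_path, file_path)
}
```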
26 changes: 26 additions & 0 deletions cli/util/path.rs
@@ -42,6 +42,32 @@ pub fn get_extension(file_path: &Path) -> Option<String> {
     .map(|e| e.to_lowercase());
 }
 
+pub fn get_atomic_dir_path(file_path: &Path) -> PathBuf {
+  let rand = gen_rand_path_component();
+  let new_file_name = format!(
+    ".{}_{}",
+    file_path
+      .file_name()
+      .map(|f| f.to_string_lossy())
+      .unwrap_or(Cow::Borrowed("")),
+    rand
+  );
+  file_path.with_file_name(new_file_name)
+}
+
+pub fn get_atomic_file_path(file_path: &Path) -> PathBuf {
+  let rand = gen_rand_path_component();
+  let extension = format!("{rand}.tmp");
+  file_path.with_extension(extension)
+}
+
+fn gen_rand_path_component() -> String {
+  (0..4).fold(String::new(), |mut output, _| {
+    output.push_str(&format!("{:02x}", rand::random::<u8>()));
+    output
+  })
+}
+
 /// TypeScript figures out the type of file based on the extension, but we take
 /// other factors into account like the file headers. The hack here is to map the
 /// specifier passed to TypeScript to a new specifier with the file extension.
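To illustrate what these helpers produce, assuming both are in scope (the hex suffix comes from `gen_rand_path_component`, so `a1b2c3d4` below is an example, not guaranteed output):

```rust
use std::path::Path;

fn main() {
  // hidden sibling directory, name prefixed with "." and suffixed "_<rand>":
  //   ".../chalk/5.0.1" -> ".../chalk/.5.0.1_a1b2c3d4"
  let dir = get_atomic_dir_path(Path::new("/cache/chalk/5.0.1"));
  // extension replaced with "<rand>.tmp":
  //   "/cache/deps.json" -> "/cache/deps.a1b2c3d4.tmp"
  let file = get_atomic_file_path(Path::new("/cache/deps.json"));
  println!("{}\n{}", dir.display(), file.display());
}
```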