Skip to content
This repository has been archived by the owner on May 11, 2019. It is now read-only.

Commit

Permalink
Refactoring aggregation code (#112)
Browse files Browse the repository at this point in the history

* group by works
  • Loading branch information
andygrove committed Apr 22, 2018
1 parent ade7139 commit 820a1fb
Show file tree
Hide file tree
Showing 10 changed files with 389 additions and 109 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Expand Up @@ -15,3 +15,8 @@ Cargo.lock
# test output
_*.csv
_*.txt
_*.quiver
temp

# test data
datasets
4 changes: 2 additions & 2 deletions Cargo.toml
Expand Up @@ -28,8 +28,8 @@ path = "src/bin/console/main.rs"
bytes = "0.4"
byteorder = "1"
csv = "1.0.0-beta.5"
#datafusion-arrow = "0.1.0-nightly-20180418"
datafusion-arrow = { path = "../datafusion-arrow" }
datafusion-arrow = "0.1.0-nightly-20180420"
#datafusion-arrow = { path = "../datafusion-arrow" }
parquet = "0.1.0"
#futures = "0.1.18"
#futures-timer = "0.1"
Expand Down
17 changes: 10 additions & 7 deletions examples/csv_sql_aggregates.rs
Expand Up @@ -23,7 +23,6 @@ use datafusion::exec::*;
use datafusion::functions::conversions::*;

fn main() {

// download data file from https://www.kaggle.com/kaggle/sf-salaries/discussion/18736
let path = "datasets/Salaries.csv";
match File::open(path) {
Expand All @@ -34,7 +33,7 @@ fn main() {

// define schema for data source (csv file)
let schema = Schema::new(vec![
Field::new("id", DataType::UInt32, false),
Field::new("id", DataType::Utf8, false),
Field::new("employee_name", DataType::Utf8, false),
Field::new("job_title", DataType::Utf8, false),
Field::new("base_pay", DataType::Utf8, false),
Expand All @@ -43,10 +42,10 @@ fn main() {
Field::new("benefits", DataType::Utf8, false),
Field::new("total_pay", DataType::Utf8, false),
Field::new("total_pay_benefits", DataType::Utf8, false),
Field::new("year", DataType::UInt16, false),
Field::new("year", DataType::Utf8, false),
Field::new("notes", DataType::Utf8, true),
Field::new("agency", DataType::Utf8, false),
Field::new("satus", DataType::Utf8, false),
Field::new("status", DataType::Utf8, false),
]);

// open a CSV file as a dataframe
Expand All @@ -56,9 +55,14 @@ fn main() {
ctx.register("salaries", salaries);

// define the SQL statement
let sql = "SELECT MIN(to_float64(base_pay)), MAX(to_float64(base_pay)) \
let sql = "SELECT year, MIN(to_float64(base_pay)), MAX(to_float64(base_pay)) \
FROM salaries \
WHERE base_pay != 'Not Provided' AND base_pay != ''";
WHERE base_pay != 'Not Provided' AND base_pay != '' \
GROUP BY year";

// let sql = "SELECT MIN(to_float64(base_pay)), MAX(to_float64(base_pay)) \
// FROM salaries \
// WHERE base_pay != 'Not Provided' AND base_pay != ''";

// create a data frame
let df = ctx.sql(&sql).unwrap();
Expand All @@ -68,5 +72,4 @@ fn main() {
}
_ => println!("Could not locate {} - try downloading it from https://www.kaggle.com/kaggle/sf-salaries/discussion/18736", path)
}

}

0 comments on commit 820a1fb

Please sign in to comment.