Merge pull request #36 from martindurant/speedups

speedup
dask · Dec 1, 2016 · ff1d5d9
2 parents 18a9c38 + 4f906fc
commit ff1d5d9
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 20 deletions.
diff --git a/fastparquet/api.py b/fastparquet/api.py
@@ -149,9 +149,10 @@ def grab_cats(self, columns, row_group_index=0):
         -------
         {column: [list, of, values]}
         """
+        if len(columns) == 0:
+            return {}
         rg = self.row_groups[row_group_index]
-        ofname = self.sep.join([os.path.dirname(self.fn),
-                                rg.columns[0].file_path])
+        ofname = self.row_group_filename(rg)
         out = {}
 
         with self.open(ofname, 'rb') as f:

diff --git a/fastparquet/benchmarks/columns.py b/fastparquet/benchmarks/columns.py
@@ -35,7 +35,7 @@ def time_column():
             out = pf.to_pandas()
 
         with measure('write random times, no nulls but has_null=True', result):
-            write(fn, df)
+            write(fn, df, has_nulls=True)
 
         pf = ParquetFile(fn)
         out = pf.to_pandas()  # warm-up
@@ -45,7 +45,7 @@ def time_column():
 
         df.loc[n//2, 'x'] = pd.to_datetime('NaT')
         with measure('write random times, with null', result):
-            write(fn, df)
+            write(fn, df, has_nulls=True)
 
         pf = ParquetFile(fn)
         out = pf.to_pandas()  # warm-up

diff --git a/fastparquet/converted_types.py b/fastparquet/converted_types.py
@@ -87,7 +87,7 @@ def convert(data, se):
     data: pandas series of primitive type
     se: a schema element.
     """
-    data = np.asarray(data, dtype=simple[se.type])
+    #data = np.asarray(data, dtype=simple[se.type])
     ctype = se.converted_type
     if ctype is None:
         return data

diff --git a/fastparquet/core.py b/fastparquet/core.py
@@ -217,7 +217,7 @@ def read_col(column, schema_helper, infile, use_cat=False,
     any_def = any(_[0] is not None for _ in out)
     do_convert = True
     if all_dict:
-        final = np.empty(cmd.num_values, np.int64)
+        dtype = np.int64
         my_nan = -1
         do_convert = False
     else:
@@ -232,23 +232,32 @@ def read_col(column, schema_helper, infile, use_cat=False,
             my_nan = -9223372036854775808  # int64 version of NaT
         else:
             my_nan = None
-        final = np.empty(cmd.num_values, dtype)
-    start = 0
-    for defi, rep, val, d in out:
+    if len(out) == 1 and not any_def:
+        defi, rep, val, d = out[0]
         if d and not all_dict:
-            cval = dic[val]
+            final = dic[val]
         elif do_convert:
-            cval = convert(val, se)
-        else:
-            cval = val
-        if defi is not None:
-            part = final[start:start+len(defi)]
-            part[defi != 1] = my_nan
-            part[defi == 1] = cval
-            start += len(defi)
+            final = convert(val, se)
         else:
-            final[start:start+len(val)] = cval
-            start += len(val)
+            final = val
+    else:
+        final = np.empty(cmd.num_values, dtype)
+        start = 0
+        for defi, rep, val, d in out:
+            if d and not all_dict:
+                cval = dic[val]
+            elif do_convert:
+                cval = convert(val, se)
+            else:
+                cval = val
+            if defi is not None:
+                part = final[start:start+len(defi)]
+                part[defi != 1] = my_nan
+                part[defi == 1] = cval
+                start += len(defi)
+            else:
+                final[start:start+len(val)] = cval
+                start += len(val)
     if all_dict:
         final = pd.Categorical.from_codes(final, categories=dic)
     return final