Merge pull request #3972 from fastai/revert-3971-pandas_cat_type

Revert "Resolve Pandas CategoricalDtype FutureWarning"
fastai · Oct 13, 2023 · 4dceef2 · 4dceef2
2 parents 2d1ce65 + 773bbc3
commit 4dceef2
Show file tree

Hide file tree

Showing 6 changed files with 29 additions and 29 deletions.
diff --git a/fastai/data/transforms.py b/fastai/data/transforms.py
@@ -162,7 +162,7 @@ def ColSplitter(col='is_valid', on=None):
     def _inner(o):
         assert isinstance(o, pd.DataFrame), "ColSplitter only works when your items are a pandas DataFrame"
         c = o.iloc[:,col] if isinstance(col, int) else o[col]
-        if on is None:      valid_idx = c.values.astype('bool')
+        if on is None:      valid_idx = c.values.astype('bool') 
         elif is_listy(on):  valid_idx = c.isin(on)
         else:               valid_idx = c == on
         return IndexSplitter(mask2idxs(valid_idx))(o)
@@ -222,7 +222,7 @@ def __call__(self, o, **kwargs):
 class CategoryMap(CollBase):
     "Collection of categories with the reverse mapping in `o2i`"
     def __init__(self, col, sort=True, add_na=False, strict=False):
-        if isinstance(col, CategoricalDtype):
+        if is_categorical_dtype(col):
             items = L(col.cat.categories, use_list=True)
             #Remove non-used categories while keeping order
             if strict: items = L(o for o in items if o in col.unique())
@@ -256,7 +256,7 @@ def setups(self, dsets):
         if self.vocab is None and dsets is not None: self.vocab = CategoryMap(dsets, sort=self.sort, add_na=self.add_na)
         self.c = len(self.vocab)
 
-    def encodes(self, o):
+    def encodes(self, o): 
         try:
             return TensorCategory(self.vocab.o2i[o])
         except KeyError as e:
@@ -279,7 +279,7 @@ def setups(self, dsets):
             for b in dsets: vals = vals.union(set(b))
             self.vocab = CategoryMap(list(vals), add_na=self.add_na)
 
-    def encodes(self, o):
+    def encodes(self, o): 
         if not all(elem in self.vocab.o2i.keys() for elem in o):
             diff = [elem for elem in o if elem not in self.vocab.o2i.keys()]
             diff_str = "', '".join(diff)

diff --git a/fastai/imports.py b/fastai/imports.py
@@ -24,7 +24,7 @@
 
 # External modules
 import requests,yaml,matplotlib.pyplot as plt,pandas as pd,scipy
-from pandas.api.types import CategoricalDtype,is_numeric_dtype
+from pandas.api.types import is_categorical_dtype,is_numeric_dtype
 from numpy import array,ndarray
 from scipy import ndimage
 from pdb import set_trace

diff --git a/fastai/metrics.py b/fastai/metrics.py
@@ -428,7 +428,7 @@ def accumulate(self, learn):
                     c,t = self.get_correct_ngrams(pred, targ, i+1, max_n=self.vocab_sz)
                     if c == 0:
                         smooth_mteval *= 2
-                        c = 1 / smooth_mteval    # exp smoothing, method 3 from http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
+                        c = 1 / smooth_mteval    # exp smoothing, method 3 from https://aclanthology.org/W14-3346/
                     self.corrects[i] += c
                     self.counts[i]   += t
 

diff --git a/fastai/tabular/core.py b/fastai/tabular/core.py
@@ -230,7 +230,7 @@ def name(self): return f"{super().name} -- {getattr(self,'__stored_args__',{})}"
 
 # %% ../../nbs/40_tabular.core.ipynb 58
 def _apply_cats (voc, add, c):
-    if not isinstance(c, CategoricalDtype):
+    if not is_categorical_dtype(c):
         return pd.Categorical(c, categories=voc[c.name][add:]).codes+add
     return c.cat.codes+add #if is_categorical_dtype(c) else c.map(voc[c.name].o2i)
 def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))
@@ -346,7 +346,7 @@ def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
 # %% ../../nbs/40_tabular.core.ipynb 94
 @delegates()
 class TabDataLoader(TfmdDL):
-    "A transformed `DataLoader` for Tabular data"
+    "A transformed `DataLoader` for Tabular data"    
     def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
         if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset)
         super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

diff --git a/nbs/05_data.transforms.ipynb b/nbs/05_data.transforms.ipynb
@@ -563,8 +563,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "fnames = [path/'train/3/9932.png', path/'valid/7/7189.png',\n",
-    "          path/'valid/7/7320.png', path/'train/7/9833.png',\n",
+    "fnames = [path/'train/3/9932.png', path/'valid/7/7189.png', \n",
+    "          path/'valid/7/7320.png', path/'train/7/9833.png',  \n",
     "          path/'train/3/7666.png', path/'valid/3/925.png',\n",
     "          path/'train/7/724.png', path/'valid/3/93055.png']\n",
     "splitter = GrandparentSplitter()"
@@ -684,7 +684,7 @@
     "    def _inner(o):\n",
     "        assert isinstance(o, pd.DataFrame), \"ColSplitter only works when your items are a pandas DataFrame\"\n",
     "        c = o.iloc[:,col] if isinstance(col, int) else o[col]\n",
-    "        if on is None:      valid_idx = c.values.astype('bool')\n",
+    "        if on is None:      valid_idx = c.values.astype('bool') \n",
     "        elif is_listy(on):  valid_idx = c.isin(on)\n",
     "        else:               valid_idx = c == on\n",
     "        return IndexSplitter(mask2idxs(valid_idx))(o)\n",
@@ -981,7 +981,7 @@
     "class CategoryMap(CollBase):\n",
     "    \"Collection of categories with the reverse mapping in `o2i`\"\n",
     "    def __init__(self, col, sort=True, add_na=False, strict=False):\n",
-    "        if isinstance(col, CategoricalDtype):\n",
+    "        if is_categorical_dtype(col):\n",
     "            items = L(col.cat.categories, use_list=True)\n",
     "            #Remove non-used categories while keeping order\n",
     "            if strict: items = L(o for o in items if o in col.unique())\n",
@@ -1082,7 +1082,7 @@
     "        if self.vocab is None and dsets is not None: self.vocab = CategoryMap(dsets, sort=self.sort, add_na=self.add_na)\n",
     "        self.c = len(self.vocab)\n",
     "\n",
-    "    def encodes(self, o):\n",
+    "    def encodes(self, o): \n",
     "        try:\n",
     "            return TensorCategory(self.vocab.o2i[o])\n",
     "        except KeyError as e:\n",
@@ -1169,7 +1169,7 @@
     "            for b in dsets: vals = vals.union(set(b))\n",
     "            self.vocab = CategoryMap(list(vals), add_na=self.add_na)\n",
     "\n",
-    "    def encodes(self, o):\n",
+    "    def encodes(self, o): \n",
     "        if not all(elem in self.vocab.o2i.keys() for elem in o):\n",
     "            diff = [elem for elem in o if elem not in self.vocab.o2i.keys()]\n",
     "            diff_str = \"', '\".join(diff)\n",

diff --git a/nbs/40_tabular.core.ipynb b/nbs/40_tabular.core.ipynb
@@ -274,7 +274,7 @@
    "outputs": [],
    "source": [
     "#|hide\n",
-    "test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',\n",
+    "test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', \n",
     "            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])\n",
     "test_eq(df[df.Elapsed.isna()].shape,(1, 13))\n",
     "\n",
@@ -385,8 +385,8 @@
    "source": [
     "#|hide\n",
     "# Test Order of columns when date isn't in first position\n",
-    "test_eq(df.columns, ['f1', 'f2', 'f3', 'f4', 'Year', 'Month', 'Week', 'Day',\n",
-    "            'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',\n",
+    "test_eq(df.columns, ['f1', 'f2', 'f3', 'f4', 'Year', 'Month', 'Week', 'Day', \n",
+    "            'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', \n",
     "            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])\n",
     "\n",
     "# Test that week dtype is consistent with other datepart fields\n",
@@ -582,9 +582,9 @@
    "outputs": [],
    "source": [
     "# Example with simple numpy types\n",
-    "df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'],\n",
-    "                   'i8': pd.Series([1, 2, 3, 4], dtype='int8'),\n",
-    "                   'u8': pd.Series([1, 2, 3, 4], dtype='uint8'),\n",
+    "df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'], \n",
+    "                   'i8': pd.Series([1, 2, 3, 4], dtype='int8'), \n",
+    "                   'u8': pd.Series([1, 2, 3, 4], dtype='uint8'), \n",
     "                   'f16': pd.Series([1, 2, 3, 4], dtype='float16'),\n",
     "                   'y1': [1, 0, 1, 0], 'y2': [2, 1, 1, 0]})\n",
     "cont_names, cat_names = cont_cat_split(df)"
@@ -692,7 +692,7 @@
     "#|hide\n",
     "cont, cat = cont_cat_split(df, max_card=0)\n",
     "test_eq((cont, cat), (\n",
-    "    ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed'],\n",
+    "    ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed'], \n",
     "    ['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']\n",
     "    ))"
    ]
@@ -1287,7 +1287,7 @@
    "source": [
     "#|export\n",
     "def _apply_cats (voc, add, c):\n",
-    "    if not isinstance(c, CategoricalDtype):\n",
+    "    if not is_categorical_dtype(c):\n",
     "        return pd.Categorical(c, categories=voc[c.name][add:]).codes+add\n",
     "    return c.cat.codes+add #if is_categorical_dtype(c) else c.map(voc[c.name].o2i)\n",
     "def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))"
@@ -1743,7 +1743,7 @@
    "outputs": [],
    "source": [
     "#|hide\n",
-    "fill1,fill2,fill3 = (FillMissing(fill_strategy=s)\n",
+    "fill1,fill2,fill3 = (FillMissing(fill_strategy=s) \n",
     "                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])\n",
     "df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})\n",
     "df1 = df.copy(); df2 = df.copy()\n",
@@ -1768,7 +1768,7 @@
    "outputs": [],
    "source": [
     "#|hide\n",
-    "fill = FillMissing()\n",
+    "fill = FillMissing() \n",
     "df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})\n",
     "to = TabularPandas(df, fill, cont_names=['a', 'b'])\n",
     "test_eq(fill.na_dict, {'a': 1.5})\n",
@@ -1785,7 +1785,7 @@
    "outputs": [],
    "source": [
     "#|hide\n",
-    "fill = FillMissing()\n",
+    "fill = FillMissing() \n",
     "df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})\n",
     "to = TabularPandas(df, fill, cont_names=['a', 'b'])\n",
     "test_eq(hasattr(to.procs.fill_missing, 'to'), False)"
@@ -1934,7 +1934,7 @@
     "#|export\n",
     "@delegates()\n",
     "class TabDataLoader(TfmdDL):\n",
-    "    \"A transformed `DataLoader` for Tabular data\"\n",
+    "    \"A transformed `DataLoader` for Tabular data\"    \n",
     "    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):\n",
     "        if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset)\n",
     "        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)\n",
@@ -3642,12 +3642,12 @@
    "outputs": [],
    "source": [
     "@MultiCategorize\n",
-    "def encodes(self, to:Tabular):\n",
+    "def encodes(self, to:Tabular): \n",
     "    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))\n",
     "    return to\n",
-    "\n",
+    "  \n",
     "@MultiCategorize\n",
-    "def decodes(self, to:Tabular):\n",
+    "def decodes(self, to:Tabular): \n",
     "    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))\n",
     "    return to"
    ]