Skip to content

Commit

Permalink
Se agrega un parámetro opcional (lower_words) al método `nombre_pro…
Browse files Browse the repository at this point in the history
…pio`, para especificar palabras que no se capitalizan.
  • Loading branch information
abenassi committed Apr 30, 2016
1 parent 277a269 commit 968999f
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 9 deletions.
6 changes: 6 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
History
=======

0.1.18 (2016-4-30)
------------------

* Se agrega un parámetro opcional (`lower_words`) al método `nombre_propio`, para especificar palabras que no se capitalizan.


0.1.16 (2016-4-16)
------------------

Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,20 +229,23 @@ Argumentos opcionales:

* **keep_original**: True para conservar la columna original / False para removerla (Default: False)
* **sufix**: Sufijo para agregar a la nueva columna limpia (Default: "clean")
* **lower_words**: Lista de palabras que deben mantenerse en minúsculas, sin aplicar capitalización (Default: ["el", "los", "la", "las", "de", "del", "en", "y"]). Si se omite o se pasa una lista vacía, se usa la lista default.

**Especificación:**

```python
{"nombre_propio": [
{"field": "columna_1"},
{"field": "columna_2"},
{"field": "columna_2", "lower_words": ["lower_word1", "lower_word2"]}
]}
```

**Ejemplo:**

```python
{"nombre_propio": [
{"field": "dependencia", "lower_words": ["en", "la"]},
{"field": "dependencia", "lower_words": []},
{"field": "dependencia"}
]}
```
Expand Down
12 changes: 8 additions & 4 deletions data_cleaner/capitalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import string
import pandas as pd
from functools import partial


LOWER_WORDS = [
Expand All @@ -17,7 +18,7 @@
]


def normalize_word(word):
def normalize_word(word, lower_words=None):
"""Normaliza una palabra, capitalizándola cuando corresponde.
Si contiene signos de puntuación se capitaliza dentro de esas strings.
Expand All @@ -28,17 +29,19 @@ def normalize_word(word):
Returns:
str: Palabra normalizada
"""
lower_words = lower_words or LOWER_WORDS

for character in string.punctuation:
if character in word:
return capitalize(word, sep=character)
if word.lower() in IGNORE_WORDS:
return word
if word.lower() in LOWER_WORDS:
if word.lower() in lower_words:
return word.lower()
return word.title()


def capitalize(string, sep=None, encoding="utf-8"):
def capitalize(string, sep=None, encoding="utf-8", lower_words=None):
"""Capitaliza una string que puede estar compuesta por varias palabras
Args:
Expand All @@ -58,6 +61,7 @@ def capitalize(string, sep=None, encoding="utf-8"):
if len(words) == 0:
return ""
first_word = words[0].title()
normalized_words = [first_word] + map(normalize_word, words[1:])
partial_normalize_word = partial(normalize_word, lower_words=lower_words)
normalized_words = [first_word] + map(partial_normalize_word, words[1:])

return (sep if sep else " ").join(normalized_words)
6 changes: 3 additions & 3 deletions data_cleaner/data_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,8 @@ def renombrar_columnas(self, field, new_field, inplace=False):

return renamed_df

def nombre_propio(self, field, sufix=None, keep_original=False,
inplace=False):
def nombre_propio(self, field, sufix=None, lower_words=None,
keep_original=False, inplace=False):
"""Regla para todos los nombres propios.
Capitaliza los nombres de países, ciudades, personas, instituciones y
Expand All @@ -264,7 +264,7 @@ def nombre_propio(self, field, sufix=None, keep_original=False,
sufix = sufix or self.DEFAULT_SUFIX
field = self._normalize_field(field)
series = self.df[field]
capitalized = series.apply(capitalize)
capitalized = series.apply(capitalize, lower_words=lower_words)

if inplace:
self._update_series(field=field, sufix=sufix,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

setup(
name='data-cleaner',
version='0.1.17',
version='0.1.18',
description="Paquete para limpieza de datos, según estándares de la SSIPyGA - Gobierno Abierto Argentina",
long_description=readme + '\n\n' + history,
author="Gobierno Abierto Argentina",
Expand Down
6 changes: 6 additions & 0 deletions tests/output/clean_nombre_propio_lower_words.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
dependencia
"Presidencia de la nación"
"Presidencia de la nación"
"Presidencia de la nación"
"Presidencia de la nación"
"Presidencia de la nación"
18 changes: 18 additions & 0 deletions tests/test_data_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,24 @@ def test_nombre_propio_keep_original(self):

self.assertIn("dependencia_normalizado", dc.df.columns)

def test_nombre_propio_lower_words(self):
    """Check that `nombre_propio` accepts a caller-supplied `lower_words`.

    Cleans the "dependencia" column of the `nombre_propio` input fixture
    with a custom list of words to keep lowercase and compares the result
    with the `nombre_propio_lower_words` expected-output fixture.
    """
    source_csv = get_input("nombre_propio")
    expected_csv = get_output("nombre_propio_lower_words")
    column = "dependencia"

    # Clean the input CSV, passing the words that must stay lowercase.
    cleaner = DataCleaner(source_csv)
    cleaned = cleaner.nombre_propio(
        column, lower_words=["nación", "de", "la"],
        keep_original=True, inplace=True)
    actual_values = list(cleaned)

    # Load the already-clean CSV that holds the expected values.
    expected_df = pd.read_csv(expected_csv, encoding="utf-8")
    expected_values = list(expected_df[column])

    self.assertEqual(actual_values, expected_values)

# @unittest.skip("skip")
def test_string_normal(self):
input_path = get_input("string_normal")
Expand Down

0 comments on commit 968999f

Please sign in to comment.