Skip to content

Commit

Permalink
[stackexchange] Make author anonymization optional for stackexchange
Browse files Browse the repository at this point in the history
This code makes it optional to pseudo-anonymize the owners of
questions and answers. It creates a hash of the name and user_id and
removes author_link and profile_image.

To enable the anonymization, include the following parameter in
the stackexchange section:
```
[stackexchange]
...
anonymize = true
```

Signed-off-by: JJMerchante <jj.merchante@gmail.com>
  • Loading branch information
jjmerchante authored and sduenas committed Mar 9, 2021
1 parent 4e78f92 commit c4811ae
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 4 deletions.
2 changes: 1 addition & 1 deletion grimoire_elk/identities/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Jose Javier Merchante Picazo <jjmerchante@gcauldron.io>
# Jose Javier Merchante Picazo <jjmerchante@cauldron.io>
#

from grimoire_elk.identities.identities import Identities
Expand Down
2 changes: 1 addition & 1 deletion grimoire_elk/identities/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Jose Javier Merchante Picazo <jjmerchante@gcauldron.io>
# Jose Javier Merchante Picazo <jjmerchante@cauldron.io>
#

from grimoire_elk.identities.identities import Identities
Expand Down
2 changes: 1 addition & 1 deletion grimoire_elk/identities/identities.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Jose Javier Merchante Picazo <jjmerchante@gcauldron.io>
# Jose Javier Merchante Picazo <jjmerchante@cauldron.io>
#

import hashlib
Expand Down
2 changes: 1 addition & 1 deletion grimoire_elk/identities/meetup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Jose Javier Merchante Picazo <jjmerchante@gcauldron.io>
# Jose Javier Merchante Picazo <jjmerchante@cauldron.io>
#

from grimoire_elk.identities.identities import Identities
Expand Down
54 changes: 54 additions & 0 deletions grimoire_elk/identities/stackexchange.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Jose Javier Merchante Picazo <jjmerchante@cauldron.io>
#

from grimoire_elk.identities.identities import Identities


class StackExchangeIdentities(Identities):
    """Pseudo-anonymization of StackExchange raw items."""

    @classmethod
    def anonymize_item(cls, item):
        """Hash or blank the personal data of a raw item, in place.

        The question stored under ``item['data']`` and each of its
        answers get their owner sanitized. Comments are dropped on
        both levels: there may be many of them, handling them adds
        complexity, and the enrichment process does not use them.

        :param item: raw item; its ``'data'`` entry is modified in place
        """
        question = item['data']
        question['comments'] = []

        asker = question.get('owner')
        if asker:
            cls._sanitize_owner(asker)

        # A missing or empty 'answers' entry means there is nothing to walk
        for reply in question.get('answers') or []:
            replier = reply.get('owner')
            if replier:
                cls._sanitize_owner(replier)
            reply['comments'] = []

    @classmethod
    def _sanitize_owner(cls, owner):
        """Hash the identifying fields of `owner` and blank its links.

        `display_name` and `user_id` are hashed only when present;
        `profile_image` and `link` are always set to an empty string.
        The hashing is delegated to `cls._hash`, which is not defined
        in this class (presumably inherited from `Identities`).

        :param owner: owner dict, modified in place
        """
        if 'display_name' in owner:
            hashed_name = cls._hash(owner['display_name'])
            owner['display_name'] = hashed_name

        if 'user_id' in owner:
            # The id is converted to a string before being hashed
            hashed_id = cls._hash(str(owner['user_id']))
            owner['user_id'] = hashed_id

        owner.update(profile_image='', link='')
2 changes: 2 additions & 0 deletions grimoire_elk/raw/stackexchange.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from .elastic import ElasticOcean
from ..elastic_mapping import Mapping as BaseMapping
from ..identities.stackexchange import StackExchangeIdentities


class Mapping(BaseMapping):
Expand Down Expand Up @@ -64,6 +65,7 @@ class StackExchangeOcean(ElasticOcean):
"""StackExchange Ocean feeder"""

mapping = Mapping
identities = StackExchangeIdentities

@classmethod
def get_perceval_params_from_url(cls, url):
Expand Down
37 changes: 37 additions & 0 deletions tests/test_stackexchange.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ class TestStackexchange(TestBaseBackend):

connector = "stackexchange"
ocean_index = "test_" + connector
ocean_index_anonymized = "test_" + connector + "_anonymized"
enrich_index = "test_" + connector + "_enrich"
enrich_index_anonymized = "test_" + connector + "_enrich_anonymized"

def test_has_identites(self):
"""Test value of has_identities method"""
Expand Down Expand Up @@ -129,6 +131,41 @@ def test_copy_raw_fields(self):
else:
self.assertIsNone(eitem[attribute])

def test_items_to_raw_anonymized(self):
    """Check that items reach the raw index with owners anonymized"""

    outcome = self._test_items_to_raw_anonymized()

    self.assertEqual(outcome['items'], 3)
    self.assertEqual(outcome['raw'], 3)

    # Question level: hashed identity, blanked links, no comments
    question = self.items[0]['data']
    asker = question['owner']
    self.assertEqual(asker['display_name'], '80490d00f668dde48d4e0ce62142c8a2ac9a1465')
    self.assertEqual(asker['user_id'], '182b39d390fc9fde7594184cbe6e6f8653cfd5b2')
    self.assertEqual(asker['link'], '')
    self.assertEqual(asker['profile_image'], '')
    self.assertEqual(len(question['comments']), 0)

    # Same expectations for the first answer of that question
    first_answer = question['answers'][0]
    replier = first_answer['owner']
    self.assertEqual(replier['display_name'], '0d2244465bfc8b636bf1fbe74912cc2c748b42e4')
    self.assertEqual(replier['user_id'], 'c7b7c5dea6f6a1a4531bf491b207d123ca41da4c')
    self.assertEqual(replier['link'], '')
    self.assertEqual(replier['profile_image'], '')
    self.assertEqual(len(first_answer['comments']), 0)

def test_raw_to_enrich_anonymized(self):
    """Check that the anonymized raw index is properly enriched"""

    outcome = self._test_raw_to_enrich_anonymized()

    self.assertEqual(outcome['raw'], 3)
    self.assertEqual(outcome['enrich'], 6)

    # The third element of the connector entry is instantiated and
    # used as the enrich backend
    enricher_class = self.connectors[self.connector][2]
    enricher = enricher_class()

    rich_item = enricher.get_rich_item(self.items[0])
    self.assertEqual(rich_item['author'], '80490d00f668dde48d4e0ce62142c8a2ac9a1465')
    self.assertEqual(rich_item['author_link'], '')


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
Expand Down

0 comments on commit c4811ae

Please sign in to comment.