# RE19-classification: enrichment of a new dataset

This notebook takes as input a previously enriched dataset and a new dataset, and enriches the new dataset according to the columns in the enriched dataset.

## 0. Set up (optional)

Run the following three install functions if running Jupyter on a cloud environment like Colaboratory, which does not allow you to install the libraries permanently on your local machine

In [1]:
!pip install cython numpy
!pip install benepar[cpu]

Collecting benepar[cpu]
[?25l  Downloading https://files.pythonhosted.org/packages/a0/7b/6cd9c60e1613a5ad388b4f883fa2aeaddcd8a7ad0a8d5ed87e0d23f159d8/benepar-0.1.2.tar.gz (72kB)
[K    100% |████████████████████████████████| 81kB 3.2MB/s 
Building wheels for collected packages: benepar
  Building wheel for benepar (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/c6/f5/06/d88543b19a9b326007d7538298a139e994b1d2eecb003bf5af
Successfully built benepar
Installing collected packages: benepar
Successfully installed benepar-0.1.2


## 1. Import libraries

In [2]:
# Basic numpy, sklearn, pandas libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from IPython.display import display

# Basic NLTK tooling
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

# The benepar parser -- this is supposed to be a better parser than Stanford's parser used in the RE'17 paper
import benepar
benepar.download('benepar_en2')

# Tqdm, for progress bars -- useful to show that the parsing is working
from tqdm import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package benepar_en2 to /root/nltk_data...


## [Functions] dataset enrichment

Additional features are added, if also present in the original dataset.

In [0]:
def enrich_ling(data):
  # Text length
  if 'Length' in data.columns:
    data['Length'] = 0
    idx = 0
    for x in data['RequirementText']:
        data.at[idx, 'Length'] = len(x)
        idx = idx + 1  

  # POS tags and tree information
  parser = benepar.Parser("benepar_en2")
  if 'Modal' in data.columns:
    data['Modal'] = 0.0
  if 'Adjective' in data.columns:
    data['Adjective'] = 0.0
  if 'Noun' in data.columns:
    data['Noun'] = 0.0
  if 'Adverb' in data.columns:
    data['Adverb'] = 0.0
  if 'Verb' in data.columns:
    data['Verb'] = 0.0
  if 'TreeHeight' in data.columns:
    data['TreeHeight'] = 0
  if 'SubTrees' in data.columns:
    data['SubTrees'] = 0

  idx = 0
  for req in tqdm(data['RequirementText'], desc='Parse trees', position=0):
      tokens = tokenizer.tokenize(req)
      if 'Words' in data.columns:
        data.at[idx, 'Words'] = len(tokens)
      tags = nltk.pos_tag(tokens)
      fd = nltk.FreqDist(tag for (word, tag) in tags)
      for key, value in fd.items():
          if key=="MD" and 'Modal' in data.columns:
              data.at[idx, 'Modal'] = value
          if key.startswith("JJ") and 'Adjective' in data.columns:
              data.at[idx, 'Adjective'] = value
          if key.startswith("VB") and 'Verb' in data.columns:
              data.at[idx, 'Verb'] = value
          if key.startswith("NN") and 'Noun' in data.columns:
              data.at[idx, 'Noun'] = value
          if key=="RB" and 'Adverb' in data.columns:
              data.at[idx, 'Adverb'] = value
      if 'Modal' in data.columns:
        data.at[idx, 'Modal'] = data.at[idx, 'Modal'] / len(tokens)
      if 'Adjective' in data.columns:
        data.at[idx, 'Adjective'] = data.at[idx, 'Adjective'] / len(tokens)
      if 'Noun' in data.columns:
        data.at[idx, 'Noun'] = data.at[idx, 'Noun'] / len(tokens)
      if 'Adverb' in data.columns:
        data.at[idx, 'Adverb'] = data.at[idx, 'Adverb'] / len(tokens)
      if 'Verb' in data.columns:
        data.at[idx, 'Verb'] = data.at[idx, 'Verb'] / len(tokens)       
      tree = parser.parse(req)
      if 'TreeHeight' in data.columns:
        data.at[idx, 'TreeHeight'] = tree.height()
      if 'SubTrees' in data.columns:
        data.at[idx, 'SubTrees'] = len(tree)
      idx = idx + 1    

  print(data.head())

In [0]:
from nltk.util import ngrams
from collections import Counter
from sklearn.feature_extraction import stop_words
from nltk.stem import WordNetLemmatizer

def enrich_ngram(data, file, target, nrfeat):
  bigrams = []
  trigrams = []
  frequencies = Counter([])
  frequencies2 = Counter([])
  frequencies3 = Counter([])
  pfrequencies = Counter([])
  pfrequencies2 = Counter([])
  pfrequencies3 = Counter([])

  wn_lemmatizer = WordNetLemmatizer()

  # Generation of [1, 2, 3] textgrams, [1, 2, 3] POSgrams
  for req in tqdm(data['RequirementText'], desc='n-grams generation', position=0):
      token = tokenizer.tokenize(req)
      token = [word.lower() for word in token]
      tags = nltk.pos_tag(token)
      token = [w for w in token if not w in stop_words.ENGLISH_STOP_WORDS]
      token = [wn_lemmatizer.lemmatize(w) for w in token]
      frequencies += Counter(token)
      bigrams = ngrams(token,2)
      trigrams = ngrams(token,3)
      frequencies2 += Counter(bigrams)
      frequencies3 += Counter(trigrams)
      punigrams = [tag for (word, tag) in tags]
      pfrequencies += Counter(punigrams)
      pbigrams = ngrams([tag for (word, tag) in tags], 2)
      pfrequencies2 += Counter(pbigrams)
      ptrigrams = ngrams([tag for (word, tag) in tags], 3)
      pfrequencies3 += Counter(ptrigrams)

  #print (len(frequencies), len(frequencies2), len(frequencies3), len(pfrequencies), len(pfrequencies2), len(pfrequencies3))

  # Populating the n-grams
  idx = 0
  for req in tqdm(data['RequirementText'], desc='n-grams population', position=0):
      token = tokenizer.tokenize(req)

      for t in token:
        exists = [col for col in data.columns if col == str('_' + t + '_')]
        if exists != []:
          data.at[idx, exists] = 1

      bigrams = ngrams(token,2)
      for bg in bigrams:
        exists = [col for col in data.columns if col == str('_' + bg[0] + '_' + bg[1] + '_')]
        if exists != []:
          data.at[idx, exists] = 1

      trigrams = ngrams(token,3)
      for tg in trigrams:
        exists = [col for col in data.columns if col == str('_' + tg[0] + '_' + tg[1] + '_' + tg[2] + '_')]
        if exists != []:
          data.at[idx, exists] = 1

      tags = nltk.pos_tag(token)

      for t in tags:
        exists = [col for col in data.columns if col == str(t)]
        if exists != []:
          data.at[idx, exists] = 1

      pbigrams = ngrams([tag for (word, tag) in tags], 2)
      for bg in pbigrams:
        exists = [col for col in data.columns if col == str(bg[0] + '_' + bg[1])]
        if exists != []:
          data.at[idx, exists] = 1

      ptrigrams = ngrams([tag for (word, tag) in tags], 3)
      for tg in ptrigrams:
        exists = [col for col in data.columns if col == str(tg[0] + '_' + tg[1] + '_' + tg[2])]
        if exists != []:
          data.at[idx, exists] = 1

      idx = idx + 1

  data = data.fillna(0.0)

  # for column in data.columns:
  #   if data[column].isnull().sum()==len(data):
  #     data[column] = 0.0

  data.columns = data.columns.map(str)

  print (data.head())

  # The big dataset is now saved
  data.to_csv(file + '-tagged-' + str(nrfeat) + '-' + target + '.csv', encoding='utf-8')

In [0]:
def enrich_dataset(file, target, nrfeat):
  # Loading the enriched data set
  data = pd.read_csv('https://raw.githubusercontent.com/explainable-re/re19-data/master/promise-km-' + str(nrfeat) + '-' + target + '.csv', engine='python')
  data = data[0:0]

  # Take the new dataset

  data2 = pd.read_csv(file + '.csv', engine='python')
  data['RequirementText'] = data2['RequirementText']
  data['ProjectID'] = 1
  data['Class'] = 'F'
  if target == 'q' or target == 'f':
    data['IsFunctional'] = data2['IsFunctional']
    data['IsQuality'] = data2['IsQuality']

  if target == 'oq':
    data['OnlyQuality'] = ~data2['IsFunctional'] & data2['IsQuality']

  if target == 'of':
    data['OnlyFunctional'] = data2['IsFunctional'] & ~data2['IsQuality']


  data = data.drop(data.columns[0], axis=1)

  print (data.head())

  enrich_ling(data)
  enrich_ngram(data, file, target, nrfeat)

## 3. Main file

Imports the enriched data set and the new dataset, then invokes the other functions

In [6]:
# Define the files that you want to process here.  
# filename = ['dronology', 'ds2', 'ds3', 'reqview', 'wasp', 'leeds', 'esa-eucl-est']
filename = ['combined']
target_type = ['f', 'q', 'of', 'oq']
#target_type = ['oq']

for f in filename:
  for t in target_type:
      enrich_dataset(file=f, target=t, nrfeat=100)

   CD_IN  VB_DT  _allow_  Adverb  MD_VB_NNS  NN_IN_CD  Adjective  MD_VB_DT  \
0    NaN    NaN      NaN     NaN        NaN       NaN        NaN       NaN   
1    NaN    NaN      NaN     NaN        NaN       NaN        NaN       NaN   
2    NaN    NaN      NaN     NaN        NaN       NaN        NaN       NaN   
3    NaN    NaN      NaN     NaN        NaN       NaN        NaN       NaN   
4    NaN    NaN      NaN     NaN        NaN       NaN        NaN       NaN   

   _use_  _interface_      ...       _hit_  NNS_CD_IN  VB_JJ_NN  TO_NN  \
0    NaN          NaN      ...         NaN        NaN       NaN    NaN   
1    NaN          NaN      ...         NaN        NaN       NaN    NaN   
2    NaN          NaN      ...         NaN        NaN       NaN    NaN   
3    NaN          NaN      ...         NaN        NaN       NaN    NaN   
4    NaN          NaN      ...         NaN        NaN       NaN    NaN   

   TO_VB_NNS                                    RequirementText  ProjectID  \
0       

Parse trees: 100%|██████████| 1502/1502 [07:32<00:00,  3.67it/s]
n-grams generation:   0%|          | 0/1502 [00:00<?, ?it/s]

   CD_IN  VB_DT  _allow_    Adverb  MD_VB_NNS  NN_IN_CD  Adjective  MD_VB_DT  \
0    NaN    NaN      NaN  0.000000        NaN       NaN   0.000000       NaN   
1    NaN    NaN      NaN  0.000000        NaN       NaN   0.000000       NaN   
2    NaN    NaN      NaN  0.000000        NaN       NaN   0.068966       NaN   
3    NaN    NaN      NaN  0.055556        NaN       NaN   0.111111       NaN   
4    NaN    NaN      NaN  0.000000        NaN       NaN   0.085714       NaN   

   _use_  _interface_      ...       _hit_  NNS_CD_IN  VB_JJ_NN  TO_NN  \
0    NaN          NaN      ...         NaN        NaN       NaN    NaN   
1    NaN          NaN      ...         NaN        NaN       NaN    NaN   
2    NaN          NaN      ...         NaN        NaN       NaN    NaN   
3    NaN          NaN      ...         NaN        NaN       NaN    NaN   
4    NaN          NaN      ...         NaN        NaN       NaN    NaN   

   TO_VB_NNS                                    RequirementText  ProjectID

n-grams generation: 100%|██████████| 1502/1502 [00:05<00:00, 262.48it/s]
n-grams population: 100%|██████████| 1502/1502 [00:18<00:00, 80.50it/s]


   CD_IN  VB_DT  _allow_    Adverb  MD_VB_NNS  NN_IN_CD  Adjective  MD_VB_DT  \
0    0.0    1.0      0.0  0.000000        0.0       0.0   0.000000       1.0   
1    0.0    1.0      0.0  0.000000        0.0       0.0   0.000000       1.0   
2    1.0    0.0      0.0  0.000000        0.0       1.0   0.068966       0.0   
3    1.0    0.0      0.0  0.055556        0.0       0.0   0.111111       0.0   
4    1.0    0.0      0.0  0.000000        0.0       1.0   0.085714       0.0   

   _use_  _interface_      ...       _hit_  NNS_CD_IN  VB_JJ_NN  TO_NN  \
0    0.0          0.0      ...         0.0        0.0       0.0    0.0   
1    0.0          0.0      ...         0.0        0.0       0.0    0.0   
2    0.0          0.0      ...         0.0        0.0       0.0    0.0   
3    0.0          0.0      ...         0.0        0.0       0.0    0.0   
4    0.0          0.0      ...         0.0        0.0       0.0    0.0   

   TO_VB_NNS                                    RequirementText  ProjectID

Parse trees: 100%|██████████| 1502/1502 [07:06<00:00,  3.91it/s]
n-grams generation:   7%|▋         | 99/1502 [00:00<00:01, 988.74it/s]

   _allow_  MD_VB_DT  CD_IN  SubTrees    Adverb  CD_NNS  _meeting_     Modal  \
0      NaN       NaN    NaN         4  0.000000     NaN        NaN  0.111111   
1      NaN       NaN    NaN         3  0.000000     NaN        NaN  0.062500   
2      NaN       NaN    NaN         6  0.000000     NaN        NaN  0.068966   
3      NaN       NaN    NaN         6  0.055556     NaN        NaN  0.055556   
4      NaN       NaN    NaN         4  0.000000     NaN        NaN  0.057143   

   VB_IN  _preferred_      ...       VBG_VBG  VB_IN_DT  PRP_CC_PRP  \
0    NaN          NaN      ...           NaN       NaN         NaN   
1    NaN          NaN      ...           NaN       NaN         NaN   
2    NaN          NaN      ...           NaN       NaN         NaN   
3    NaN          NaN      ...           NaN       NaN         NaN   
4    NaN          NaN      ...           NaN       NaN         NaN   

   DT_NNS_VBP  WDT_VBP_VBN                                    RequirementText  \
0         NaN    

n-grams generation: 100%|██████████| 1502/1502 [00:03<00:00, 408.80it/s]
n-grams population: 100%|██████████| 1502/1502 [00:18<00:00, 79.80it/s]


   _allow_  MD_VB_DT  CD_IN  SubTrees    Adverb  CD_NNS  _meeting_     Modal  \
0      0.0       1.0    0.0         4  0.000000     1.0        0.0  0.111111   
1      0.0       1.0    0.0         3  0.000000     0.0        0.0  0.062500   
2      0.0       0.0    1.0         6  0.000000     0.0        0.0  0.068966   
3      0.0       0.0    1.0         6  0.055556     1.0        0.0  0.055556   
4      0.0       0.0    1.0         4  0.000000     0.0        0.0  0.057143   

   VB_IN  _preferred_      ...       VBG_VBG  VB_IN_DT  PRP_CC_PRP  \
0    0.0          0.0      ...           0.0       0.0         0.0   
1    0.0          0.0      ...           0.0       0.0         0.0   
2    0.0          0.0      ...           0.0       0.0         0.0   
3    0.0          0.0      ...           0.0       0.0         0.0   
4    1.0          0.0      ...           0.0       0.0         0.0   

   DT_NNS_VBP  WDT_VBP_VBN                                    RequirementText  \
0         0.0    

Parse trees: 100%|██████████| 1502/1502 [07:08<00:00,  4.03it/s]
n-grams generation:   7%|▋         | 98/1502 [00:00<00:01, 973.20it/s]

   _allow_  MD_VB_DT  CD_IN  SubTrees  IN_CD  _meeting_  CD_NNS  \
0      NaN       NaN    NaN         4    NaN        NaN     NaN   
1      NaN       NaN    NaN         3    NaN        NaN     NaN   
2      NaN       NaN    NaN         6    NaN        NaN     NaN   
3      NaN       NaN    NaN         6    NaN        NaN     NaN   
4      NaN       NaN    NaN         4    NaN        NaN     NaN   

   _shall_allow_     Modal  _record_       ...        _lab_section_  \
0            NaN  0.111111       NaN       ...                  NaN   
1            NaN  0.062500       NaN       ...                  NaN   
2            NaN  0.068966       NaN       ...                  NaN   
3            NaN  0.055556       NaN       ...                  NaN   
4            NaN  0.057143       NaN       ...                  NaN   

   TreeHeight  PRP_CC_PRP  _e_  NNS_RB_IN  \
0           6         NaN  NaN        NaN   
1          13         NaN  NaN        NaN   
2          14         NaN  NaN     

n-grams generation: 100%|██████████| 1502/1502 [00:03<00:00, 422.08it/s]
n-grams population: 100%|██████████| 1502/1502 [00:16<00:00, 90.89it/s] 


   _allow_  MD_VB_DT  CD_IN  SubTrees  IN_CD  _meeting_  CD_NNS  \
0      0.0       1.0    0.0         4    0.0        0.0     1.0   
1      0.0       1.0    0.0         3    0.0        0.0     0.0   
2      0.0       0.0    1.0         6    1.0        0.0     0.0   
3      0.0       0.0    1.0         6    0.0        0.0     1.0   
4      0.0       0.0    1.0         4    1.0        0.0     0.0   

   _shall_allow_     Modal  _record_       ...        _lab_section_  \
0            0.0  0.111111       0.0       ...                  0.0   
1            0.0  0.062500       0.0       ...                  0.0   
2            0.0  0.068966       0.0       ...                  0.0   
3            0.0  0.055556       0.0       ...                  0.0   
4            0.0  0.057143       0.0       ...                  0.0   

   TreeHeight  PRP_CC_PRP  _e_  NNS_RB_IN  \
0           6         0.0  0.0        0.0   
1          13         0.0  0.0        0.0   
2          14         0.0  0.0     

Parse trees: 100%|██████████| 1502/1502 [07:05<00:00,  3.95it/s]
n-grams generation:   7%|▋         | 98/1502 [00:00<00:01, 972.93it/s]

   CD_IN  MD_VB_DT  NN_IN_CD  _allow_  SubTrees      Noun  _interface_  \
0    NaN       NaN       NaN      NaN         4  0.111111          NaN   
1    NaN       NaN       NaN      NaN         3  0.187500          NaN   
2    NaN       NaN       NaN      NaN         6  0.068966          NaN   
3    NaN       NaN       NaN      NaN         6  0.055556          NaN   
4    NaN       NaN       NaN      NaN         4  0.085714          NaN   

   VB_JJ_IN    Adverb  VB_DT      ...       _cohort_  CD_CD  IN_NNS  WP_MD_VB  \
0       NaN  0.000000    NaN      ...            NaN    NaN     NaN       NaN   
1       NaN  0.000000    NaN      ...            NaN    NaN     NaN       NaN   
2       NaN  0.000000    NaN      ...            NaN    NaN     NaN       NaN   
3       NaN  0.055556    NaN      ...            NaN    NaN     NaN       NaN   
4       NaN  0.000000    NaN      ...            NaN    NaN     NaN       NaN   

   DT_NN_TO                                    RequirementText  Proj

n-grams generation: 100%|██████████| 1502/1502 [00:03<00:00, 403.55it/s]
n-grams population: 100%|██████████| 1502/1502 [00:20<00:00, 74.93it/s]


   CD_IN  MD_VB_DT  NN_IN_CD  _allow_  SubTrees      Noun  _interface_  \
0    0.0       1.0       0.0      0.0         4  0.111111          0.0   
1    0.0       1.0       0.0      0.0         3  0.187500          0.0   
2    1.0       0.0       1.0      0.0         6  0.068966          0.0   
3    1.0       0.0       0.0      0.0         6  0.055556          0.0   
4    1.0       0.0       1.0      0.0         4  0.085714          0.0   

   VB_JJ_IN    Adverb  VB_DT      ...       _cohort_  CD_CD  IN_NNS  WP_MD_VB  \
0       0.0  0.000000    1.0      ...            0.0    0.0     0.0       0.0   
1       0.0  0.000000    1.0      ...            0.0    0.0     0.0       0.0   
2       1.0  0.000000    0.0      ...            0.0    0.0     1.0       0.0   
3       1.0  0.055556    0.0      ...            0.0    0.0     0.0       0.0   
4       1.0  0.000000    0.0      ...            0.0    0.0     1.0       0.0   

   DT_NN_TO                                    RequirementText  Proj