In [2]:
import csv
from pathlib import Path

import nltk

# nltk.download('book')
# nltk.download('brown')

from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

CROWD_RE_REQUIREMENTS = Path('..', 'crowdre_cleaned-csv', 'requirements.csv')

In [3]:
class Requirement(object):
    def __init__(self, text):
        self.text = text
    
    def tokenize(self):
        self.tokens = nltk.word_tokenize(self.text)
    
    def redundant_start(self):
        self.starts_with_smart_home = self.text.lower().startswith("my smart home to")
        if self.starts_with_smart_home:
            self.cleaned_text = self.text[16:]
        else:
            self.cleaned_text = self.text
    
    def remove_stopwords(self):
        self.lexical_words = [word for word in self.tokens if word not in stopwords.words('english')]
        
    def complete_analysis(self):
        self.tokenize()
        self.redundant_start()
        self.remove_stopwords()
    
    def __str__(self):
        return self.text

In [4]:
class RequirementsList(object):
    def __init__(self, path):
        if not path.exists() and path.is_file():
            raise("The given path does not exist or is not a file.")
        self._build_requirements_list(path)
    
    def __iter__(self):
        return self.requirements.__iter__() if hasattr(self, 'requirements') else []

    def _build_requirements_list(self, path):
        self.requirements = []
        with open(path, newline='') as requirements_csv:
            re_reader = csv.DictReader(requirements_csv, delimiter=',')
            for row in re_reader:
                requirementText = "As a " + row['role'] + " I want " + row['feature'] + " so that " + row['benefit']
                requirement = Requirement(requirementText)
                self.requirements.append(requirement)

    def count(self):
        return len(self.requirements) if hasattr(self, 'requirements') else 0

re_list = RequirementsList(CROWD_RE_REQUIREMENTS)

In [5]:
# NLP analysis
joined_text = " ".join(map(lambda re: re.text, re_list.requirements))
joined_text_tokenized = nltk.word_tokenize(joined_text)

no_of_requirements = re_list.count()
tokens = []
lexical_words = []
re_starting_with_smart_home = 0

lexical_words = []

for requirement in re_list:
    requirement.complete_analysis()
    tokens += requirement.tokens    
    lexical_words += requirement.lexical_words    
    if requirement.starts_with_smart_home:
        re_starting_with_smart_home += 1 

In [6]:
lancester = LancasterStemmer()
porter = PorterStemmer()
stems_lancester = []
stems_porter = []
for token in tokens:
    stems_lancester.append(lancester.stem(token))
    stems_porter.append(porter.stem(token))

In [21]:
print("Number of Tokens (unique): \t\t{} ({})".format(len(tokens), len(set(tokens))))
print("Number of Lexical Words: \t\t{}".format(len(lexical_words)))

print("\nVocabulary Size (Lexical Words): \t{}".format(len(set(lexical_words))))
print("Vocabulary Size (Stems): \t\t{}".format(len(set(stems_porter))))

print("\nAverage Sentence Length (Tokens): \t{}".format(round(len(tokens) / no_of_requirements)))
print("Average Sentence Length (Lexical Words): {}".format(round(len(lexical_words) / no_of_requirements)))

print("\nLexical Diversity: \t\t\t{}".format(round(len(set(lexical_words)) / len(joined_text),3)))
print("Requirements starting with\n\t'I want my smart home to...': \t{}/{} ({}%)".format(re_starting_with_smart_home, no_of_requirements, round(re_starting_with_smart_home / no_of_requirements * 100, 2)))


Number of Tokens (unique): 		35747 (3519)
Number of Lexical Words: 		20178

Vocabulary Size (Lexical Words): 	3411
Vocabulary Size (Stems): 		2461

Average Sentence Length (Tokens): 	12
Average Sentence Length (Lexical Words): 7

Lexical Diversity: 			0.018
Requirements starting with
	'I want my smart home to...': 	410/2966 (13.82%)
